diff --git a/sys/net/ifdi_if.m b/sys/net/ifdi_if.m
index 077b19dd7481..3a17f24fdf91 100644
--- a/sys/net/ifdi_if.m
+++ b/sys/net/ifdi_if.m
@@ -1,473 +1,474 @@
 #
 # Copyright (c) 2014-2018, Matthew Macy (mmacy@mattmacy.io)
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
 #
 #  1. Redistributions of source code must retain the above copyright notice,
 #     this list of conditions and the following disclaimer.
 #
 #  2. Neither the name of Matthew Macy nor the names of its
 #     contributors may be used to endorse or promote products derived from
 #     this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 #
 # $FreeBSD$
 #
 
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 
 #include <machine/bus.h>
 #include <sys/bus.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/iflib.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 
 INTERFACE ifdi;
 
 CODE {
 
 	/*
 	 * Default method implementations.  Each function below is named as
 	 * the DEFAULT of one or more METHOD declarations later in this file.
 	 * Every signature here must match the METHOD it backs, because the
 	 * function is invoked through that METHOD's function pointer type.
 	 */
 
 	/* No-op default for void methods that take only the context. */
 	static void
 	null_void_op(if_ctx_t _ctx __unused)
 	{
 	}
 
 	/* Default knlist_add: does nothing and reports success (0). */
 	static int
 	null_knlist_add(if_ctx_t _ctx __unused, struct knote *_kn __unused)
 	{
 		return (0);
 	}
 
 	/* Default knote_event: does nothing and returns 0. */
 	static int
 	null_knote_event(if_ctx_t _ctx __unused, struct knote *_kn __unused,
 	    int _hint __unused)
 	{
 		return (0);
 	}
 
 	/* No-op default for void methods taking a 16-bit queue index. */
 	static void
 	null_timer_op(if_ctx_t _ctx __unused, uint16_t _qsidx __unused)
 	{
 	}
 
 	/* Default for int methods that take only the context: returns 0. */
 	static int
 	null_int_op(if_ctx_t _ctx __unused)
 	{
 		return (0);
 	}
 
 	/* Default for int methods taking one int argument: ENOTSUP. */
 	static int
 	null_int_int_op(if_ctx_t _ctx __unused, int arg0 __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/* Default for the per-queue interrupt enable methods: ENOTSUP. */
 	static int
 	null_queue_intr_enable(if_ctx_t _ctx __unused, uint16_t _qid __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/* No-op default for led_func. */
 	static void
 	null_led_func(if_ctx_t _ctx __unused, int _onoff __unused)
 	{
 	}
 
 	/* No-op default shared by vlan_register and vlan_unregister. */
 	static void
 	null_vlan_register_op(if_ctx_t _ctx __unused, uint16_t vtag __unused)
 	{
 	}
 
 	/* Default shared by txq_setup and rxq_setup: returns 0. */
 	static int
 	null_q_setup(if_ctx_t _ctx __unused, uint32_t _qid __unused)
 	{
 		return (0);
 	}
 
 	/* Default i2c_req: ENOTSUP. */
 	static int
 	null_i2c_req(if_ctx_t _sctx __unused, struct ifi2creq *_i2c __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/* Default sysctl_int_delay: does nothing and returns 0. */
 	static int
 	null_sysctl_int_delay(if_ctx_t _sctx __unused,
 	    if_int_delay_info_t _iidi __unused)
 	{
 		return (0);
 	}
 
 	/* Default iov_init: ENOTSUP. */
 	static int
 	null_iov_init(if_ctx_t _ctx __unused, uint16_t num_vfs __unused,
 	    const nvlist_t *params __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/* Default iov_vf_add: ENOTSUP. */
 	static int
 	null_vf_add(if_ctx_t _ctx __unused, uint16_t num_vfs __unused,
 	    const nvlist_t *params __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/*
 	 * Default priv_ioctl: ENOTSUP.  The data argument is a caddr_t (not
 	 * a pointer to one) so the signature agrees with the priv_ioctl
 	 * METHOD declaration.
 	 */
 	static int
 	null_priv_ioctl(if_ctx_t _ctx __unused, u_long command __unused,
 	    caddr_t data __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/*
 	 * Default media_status: reports the status as valid and active, and
 	 * the active media as full-duplex Ethernet with the IFM_25G_ACC
 	 * subtype.
 	 */
 	static void
 	null_media_status(if_ctx_t ctx __unused, struct ifmediareq *ifmr)
 	{
 		ifmr->ifm_status = IFM_AVALID | IFM_ACTIVE;
 		ifmr->ifm_active = IFM_ETHER | IFM_25G_ACC | IFM_FDX;
 	}
 
 	/* Default cloneattach: does nothing and returns 0. */
 	static int
 	null_cloneattach(if_ctx_t ctx __unused, struct if_clone *ifc __unused,
 	    const char *name __unused, caddr_t params __unused)
 	{
 		return (0);
 	}
 
 	/* No-op default for rx_clset. */
 	static void
 	null_rx_clset(if_ctx_t _ctx __unused, uint16_t _flid __unused,
 	    uint16_t _qid __unused, caddr_t *_sdcl __unused)
 	{
 	}
 
 	/*
 	 * Default object_info_get: ENOTSUP.  The function returns int
 	 * because the object_info_get METHOD is declared to return int; a
 	 * void default would leave the caller with an indeterminate value.
 	 */
 	static int
 	null_object_info_get(if_ctx_t ctx __unused, void *data __unused,
 	    int size __unused)
 	{
 		return (ENOTSUP);
 	}
 
 	/*
 	 * Default mac_set: when the context has an ifnet with a link-level
 	 * address, copy ETHER_ADDR_LEN bytes from mac into that address.
 	 * Always returns 0.
 	 */
 	static int
 	default_mac_set(if_ctx_t ctx, const uint8_t *mac)
 	{
 		struct ifnet *ifp = iflib_get_ifp(ctx);
 		struct sockaddr_dl *sdl;
 
 		if (ifp && ifp->if_addr) {
 			sdl = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			MPASS(sdl->sdl_type == IFT_ETHER);
 			memcpy(LLADDR(sdl), mac, ETHER_ADDR_LEN);
 		}
 		return (0);
 	}
 
 	/* Default needs_restart: always answers true. */
 	static bool
 	null_needs_restart(if_ctx_t _ctx __unused,
 	    enum iflib_restart_event _event __unused)
 	{
 		return (true);
 	}
 };
 
 #
 # kevent interfaces
 #
 
 # knlist_add: the default (null_knlist_add) does nothing and returns 0.
 METHOD int knlist_add {
 	if_ctx_t _ctx;
 	struct knote *_kn;
 } DEFAULT null_knlist_add;
 
 # knote_event: the default (null_knote_event) does nothing and returns 0.
 METHOD int knote_event {
 	if_ctx_t _ctx;
 	struct knote *_kn;
 	int hint;
 } DEFAULT null_knote_event;
 
 
 #
 # query
 #
 
 # object_info_get: takes an opaque buffer pointer and a size; the meaning
 # of the buffer contents is not visible in this file.  Declared to return
 # int, so any implementation (including the DEFAULT) must return int.
 METHOD int object_info_get {
 	if_ctx_t _ctx;
 	void *data;
 	int size;
 } DEFAULT null_object_info_get;
 
 #
 # bus interfaces
 #
 
 # attach_pre: defaults to null_int_op, which returns 0.
 METHOD int attach_pre {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # attach_post: defaults to null_int_op, which returns 0.
 METHOD int attach_post {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # reinit_pre: defaults to null_int_op, which returns 0.
 METHOD int reinit_pre {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # reinit_post: defaults to null_int_op, which returns 0.
 METHOD int reinit_post {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # cloneattach: defaults to null_cloneattach, which returns 0.
 METHOD int cloneattach {
 	if_ctx_t _ctx;
 	struct if_clone *_ifc;
 	const char *_name;
 	caddr_t params;
 } DEFAULT null_cloneattach;
 
 # detach: no DEFAULT is specified here.
 METHOD int detach {
 	if_ctx_t _ctx;
 };
 
 # suspend: defaults to null_int_op, which returns 0.
 METHOD int suspend {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # shutdown: defaults to null_int_op, which returns 0.
 METHOD int shutdown {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # resume: defaults to null_int_op, which returns 0.
 METHOD int resume {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 #
 # downcall to driver to allocate its
 # own queue state and tie it to the parent
 #
 
 # tx_queues_alloc: no DEFAULT is specified here.  Receives arrays of
 # virtual and physical addresses plus two counts (ntxqs, ntxqsets);
 # presumably these describe the rings allocated by the caller -- confirm
 # against iflib.
 METHOD int tx_queues_alloc {
 	if_ctx_t _ctx;
 	caddr_t *_vaddrs;
 	uint64_t *_paddrs;
 	int ntxqs;
 	int ntxqsets;
 };
 
 # rx_queues_alloc: receive-side counterpart with the same argument
 # layout; no DEFAULT is specified here.
 METHOD int rx_queues_alloc {
 	if_ctx_t _ctx;
 	caddr_t *_vaddrs;
 	uint64_t *_paddrs;
 	int nrxqs;
 	int nrxqsets;
 };
 
 # queues_free: defaults to null_void_op (no-op).
 METHOD void queues_free {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # rx_clset: defaults to null_rx_clset (no-op).
 METHOD void rx_clset {
 	if_ctx_t _ctx;
 	uint16_t _fl;
 	uint16_t _qsetid;
 	caddr_t *_sdcl;
 } DEFAULT null_rx_clset;
 
 #
 # interface reset / stop
 #
 
 # init: no DEFAULT is specified here.
 METHOD void init {
 	if_ctx_t _ctx;
 };
 
 # stop: no DEFAULT is specified here.
 METHOD void stop {
 	if_ctx_t _ctx;
 };
 
 #
 # interrupt setup and manipulation
 #
 
 # msix_intr_assign: defaults to null_int_int_op, which returns ENOTSUP.
 METHOD int msix_intr_assign {
 	if_ctx_t _sctx;
 	int msix;
 } DEFAULT null_int_int_op;
 
 # intr_enable: no DEFAULT is specified here.
 METHOD void intr_enable {
 	if_ctx_t _ctx;
 };
 
 # intr_disable: no DEFAULT is specified here.
 METHOD void intr_disable {
 	if_ctx_t _ctx;
 };
 
 # rx_queue_intr_enable: defaults to null_queue_intr_enable (ENOTSUP).
 METHOD int rx_queue_intr_enable {
 	if_ctx_t _ctx;
 	uint16_t _qid;
 } DEFAULT null_queue_intr_enable;
 
 # tx_queue_intr_enable: defaults to null_queue_intr_enable (ENOTSUP).
 METHOD int tx_queue_intr_enable {
 	if_ctx_t _ctx;
 	uint16_t _qid;
 } DEFAULT null_queue_intr_enable;
 
 # link_intr_enable: defaults to null_void_op (no-op).
 METHOD void link_intr_enable {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # admin_completion_handle: defaults to null_void_op (no-op).
 METHOD void admin_completion_handle {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 #
 # interface configuration
 #
 
 # multi_set: no DEFAULT is specified here.
 METHOD void multi_set {
 	if_ctx_t _ctx;
 };
 
 # mtu_set: no DEFAULT is specified here.
 METHOD int mtu_set {
 	if_ctx_t _ctx;
 	uint32_t _mtu;
 };
 # mac_set: defaults to default_mac_set, which copies the given address
 # into the ifnet's link-level address when one exists and returns 0.
 METHOD int mac_set {
 	if_ctx_t _ctx;
 	const uint8_t *_mac;
 } DEFAULT default_mac_set;
 
 # media_set: defaults to null_void_op (no-op).
 METHOD void media_set{
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # promisc_set: no DEFAULT is specified here.
 METHOD int promisc_set {
 	if_ctx_t _ctx;
 	int _flags;
 };
 
 # crcstrip_set: no DEFAULT is specified here.
 METHOD void crcstrip_set {
 	if_ctx_t _ctx;
 	int _onoff;
 	int _strip;
 };
 
 #
 # IOV handling
 #
 
 # vflr_handle: defaults to null_void_op (no-op).
 METHOD void vflr_handle {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # iov_init: defaults to null_iov_init, which returns ENOTSUP.
 METHOD int iov_init {
 	if_ctx_t _ctx;
 	uint16_t num_vfs;
 	const nvlist_t * params;
 } DEFAULT null_iov_init;
 
 # iov_uninit: defaults to null_void_op (no-op).
 METHOD void iov_uninit {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # iov_vf_add: defaults to null_vf_add, which returns ENOTSUP.
 METHOD int iov_vf_add {
 	if_ctx_t _ctx;
 	uint16_t num_vfs;
 	const nvlist_t * params;
 } DEFAULT null_vf_add;
 
 
 #
 # Device status
 #
 
 # update_admin_status: no DEFAULT is specified here.
 METHOD void update_admin_status {
 	if_ctx_t _ctx;
 };
 
 # media_status: defaults to null_media_status, which fills in a fixed
 # "valid and active" status in the supplied ifmediareq.
 METHOD void media_status {
 	if_ctx_t _ctx;
 	struct ifmediareq *_ifm;
 } DEFAULT null_media_status;
 
 # media_change: defaults to null_int_op, which returns 0.
 METHOD int media_change {
 	if_ctx_t _ctx;
 } DEFAULT null_int_op;
 
 # get_counter: no DEFAULT is specified here.
 METHOD uint64_t get_counter {
 	if_ctx_t _ctx;
 	ift_counter cnt;
 };
 
 # priv_ioctl: defaults to null_priv_ioctl, which returns ENOTSUP.
 METHOD int priv_ioctl {
 	if_ctx_t _ctx;
 	u_long   _cmd;
 	caddr_t _data;
 } DEFAULT null_priv_ioctl;
 
 #
 # optional methods
 #
 
 # i2c_req: defaults to null_i2c_req, which returns ENOTSUP.
 METHOD int i2c_req {
 	if_ctx_t _ctx;
 	struct ifi2creq *_req;
 } DEFAULT null_i2c_req;
 
 # txq_setup: defaults to null_q_setup, which returns 0.
 METHOD int txq_setup {
 	if_ctx_t _ctx;
 	uint32_t _txqid;
 } DEFAULT null_q_setup;
 
 METHOD int rxq_setup {
 	if_ctx_t _ctx;
 	uint32_t _txqid;
 } DEFAULT null_q_setup;
 
 # timer: defaults to null_timer_op (no-op).
 METHOD void timer {
 	if_ctx_t _ctx;
 	uint16_t _txqid;
 } DEFAULT null_timer_op;
 
 # watchdog_reset: defaults to null_void_op (no-op).
 METHOD void watchdog_reset {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # watchdog_reset_queue: defaults to null_timer_op (no-op).
 METHOD void watchdog_reset_queue {
 	if_ctx_t _ctx;
 	uint16_t _q;
 } DEFAULT null_timer_op;
 
 # led_func: defaults to null_led_func (no-op).
 METHOD void led_func {
 	if_ctx_t _ctx;
 	int _onoff;
 } DEFAULT null_led_func;
 
 # vlan_register: defaults to null_vlan_register_op (no-op).
 METHOD void vlan_register {
 	if_ctx_t _ctx;
 	uint16_t _vtag;
 } DEFAULT null_vlan_register_op;
 
 # vlan_unregister: defaults to null_vlan_register_op (no-op).
 METHOD void vlan_unregister {
 	if_ctx_t _ctx;
 	uint16_t _vtag;
 } DEFAULT null_vlan_register_op;
 
 # sysctl_int_delay: defaults to null_sysctl_int_delay, which returns 0.
 METHOD int sysctl_int_delay {
 	if_ctx_t _sctx;
 	if_int_delay_info_t _iidi;
 } DEFAULT null_sysctl_int_delay;
 
 # debug: defaults to null_void_op (no-op).
 METHOD void debug {
 	if_ctx_t _ctx;
 } DEFAULT null_void_op;
 
 # needs_restart: defaults to null_needs_restart, which always returns true.
 METHOD bool needs_restart {
 	if_ctx_t _ctx;
 	enum iflib_restart_event _event;
 } DEFAULT null_needs_restart;
diff --git a/sys/net/ifq.c b/sys/net/ifq.c
index 0e3159a75d48..c09d962e206a 100644
--- a/sys/net/ifq.c
+++ b/sys/net/ifq.c
@@ -1,176 +1,177 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)if.h	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/param.h>
 #include <sys/socket.h>
 
 #ifndef ALTQ
 #define	ALTQ	/* Needed for ifq.h prototypes only. */
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/ifq.h>
 
 /*
  * Queue an mbuf for transmission.  With ALTQ enabled on the interface
  * the mbuf goes to the interface send queue; otherwise it goes to the
  * supplied buf_ring.  Returns 0 on success or the error reported by the
  * underlying enqueue operation.
  */
 int
 drbr_enqueue(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m)
 {
 	int rc = 0;
 
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		/* buf_ring path: the mbuf is freed here on failure. */
 		rc = buf_ring_enqueue(br, m);
 		if (rc != 0)
 			m_freem(m);
 		return (rc);
 	}
 
 	/* ALTQ path: account a queue drop when the enqueue fails. */
 	IFQ_ENQUEUE(&ifp->if_snd, m, rc);
 	if (rc != 0)
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 	return (rc);
 }
 
 /*
  * Return an mbuf obtained from drbr_peek() to the head of the queue.
  */
 void
 drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *m_new)
 {
 	if (ifp == NULL || !ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		/* Swap the entry at the top of the ring for this one. */
 		buf_ring_putback_sc(br, m_new);
 		return;
 	}
 
 	/*
 	 * In the ALTQ case drbr_peek() really dequeued the mbuf, so it
 	 * has to be prepended to the send queue again.
 	 */
 	IFQ_DRV_PREPEND(&ifp->if_snd, m_new);
 }
 
 /*
  * Obtain the mbuf at the head of the queue without committing to
  * consuming it from the buf_ring.
  */
 struct mbuf *
 drbr_peek(struct ifnet *ifp, struct buf_ring *br)
 {
 	struct mbuf *head;
 
 	if (ifp == NULL || !ALTQ_IS_ENABLED(&ifp->if_snd))
 		return ((struct mbuf *)buf_ring_peek_clear_sc(br));
 
 	/*
 	 * ALTQ: actually dequeue.  drbr_advance() is a no-op for ALTQ
 	 * and drbr_putback() prepends the mbuf again when required.
 	 */
 	IFQ_DEQUEUE(&ifp->if_snd, head);
 	return (head);
 }
 
 /*
  * Discard everything queued: purge the interface send queue when ALTQ
  * is enabled, then free every mbuf remaining in the buf_ring.
  */
 void
 drbr_flush(struct ifnet *ifp, struct buf_ring *br)
 {
 	struct mbuf *pkt;
 
 	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		IFQ_PURGE(&ifp->if_snd);
 	}
 
 	for (;;) {
 		pkt = (struct mbuf *)buf_ring_dequeue_sc(br);
 		if (pkt == NULL)
 			break;
 		m_freem(pkt);
 	}
 }
 
 /*
  * Remove and return the next mbuf, taken from the interface send queue
  * when ALTQ is enabled and from the buf_ring otherwise.
  */
 struct mbuf *
 drbr_dequeue(struct ifnet *ifp, struct buf_ring *br)
 {
 	struct mbuf *pkt;
 
 	if (ifp == NULL || !ALTQ_IS_ENABLED(&ifp->if_snd))
 		return ((struct mbuf *)buf_ring_dequeue_sc(br));
 
 	IFQ_DEQUEUE(&ifp->if_snd, pkt);
 	return (pkt);
 }
 
 /*
  * Finish consuming the entry most recently obtained via drbr_peek().
  * With ALTQ enabled there is nothing to do because drbr_peek() already
  * dequeued the mbuf; otherwise the buf_ring is advanced.
  */
 void
 drbr_advance(struct ifnet *ifp, struct buf_ring *br)
 {
 	/* Nothing to do here since peek dequeues in altq case */
 	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd))
 		return;
 	/* A void function must not return an expression (ISO C). */
 	buf_ring_advance_sc(br);
 }
 
 /*
  * Dequeue the head mbuf only if the caller's predicate accepts it.
  * func(m, arg) returning 0 leaves the mbuf queued and makes this
  * function return NULL.
  */
 struct mbuf *
 drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
     int (*func) (struct mbuf *, void *), void *arg)
 {
 	struct mbuf *head;
 
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		head = (struct mbuf *)buf_ring_peek(br);
 		if (head != NULL && func(head, arg) != 0)
 			return ((struct mbuf *)buf_ring_dequeue_sc(br));
 		return (NULL);
 	}
 
 	/* ALTQ: poll and dequeue under a single hold of the queue lock. */
 	IFQ_LOCK(&ifp->if_snd);
 	IFQ_POLL_NOLOCK(&ifp->if_snd, head);
 	if (head == NULL || func(head, arg) != 0) {
 		IFQ_DEQUEUE_NOLOCK(&ifp->if_snd, head);
 	} else {
 		/* The predicate rejected the head; leave it queued. */
 		head = NULL;
 	}
 	IFQ_UNLOCK(&ifp->if_snd);
 	return (head);
 }
 
 /*
  * Report whether the active queue (the interface send queue when ALTQ
  * is enabled, the buf_ring otherwise) is empty.
  */
 int
 drbr_empty(struct ifnet *ifp, struct buf_ring *br)
 {
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (buf_ring_empty(br));
 	return (IFQ_IS_EMPTY(&ifp->if_snd));
 }
 
 /*
  * Returns 1 when ALTQ is enabled; otherwise returns non-zero exactly
  * when the buf_ring already holds entries.
  */
 int
 drbr_needs_enqueue(struct ifnet *ifp, struct buf_ring *br)
 {
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (!buf_ring_empty(br));
 	return (1);
 }
 
 /*
  * Number of entries currently queued: the send queue length when ALTQ
  * is enabled, the buf_ring count otherwise.
  */
 int
 drbr_inuse(struct ifnet *ifp, struct buf_ring *br)
 {
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (buf_ring_count(br));
 	return (ifp->if_snd.ifq_len);
 }
 
diff --git a/sys/net80211/ieee80211.c b/sys/net80211/ieee80211.c
index 1034088c6fb6..fbf3e1ca01c8 100644
--- a/sys/net80211/ieee80211.c
+++ b/sys/net80211/ieee80211.c
@@ -1,2650 +1,2651 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Atsushi Onoe
  * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 generic handler
  */
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sbuf.h>
 
 #include <machine/stdarg.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/ethernet.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_regdomain.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #include <net80211/ieee80211_ratectl.h>
 #include <net80211/ieee80211_vht.h>
 
 #include <net/bpf.h>
 
 /* Short text name for each PHY mode, indexed by IEEE80211_MODE_* value. */
 const char *ieee80211_phymode_name[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	  = "auto",
 	[IEEE80211_MODE_11A]	  = "11a",
 	[IEEE80211_MODE_11B]	  = "11b",
 	[IEEE80211_MODE_11G]	  = "11g",
 	[IEEE80211_MODE_FH]	  = "FH",
 	[IEEE80211_MODE_TURBO_A]  = "turboA",
 	[IEEE80211_MODE_TURBO_G]  = "turboG",
 	[IEEE80211_MODE_STURBO_A] = "sturboA",
 	[IEEE80211_MODE_HALF]	  = "half",
 	[IEEE80211_MODE_QUARTER]  = "quarter",
 	[IEEE80211_MODE_11NA]	  = "11na",
 	[IEEE80211_MODE_11NG]	  = "11ng",
 	[IEEE80211_MODE_VHT_2GHZ]	  = "11acg",
 	[IEEE80211_MODE_VHT_5GHZ]	  = "11ac",
 };
 /*
  * Map each ieee80211_opmode value to the corresponding IEEE80211_C_*
  * capability bit.  The mesh entry is only present when the kernel is
  * built with IEEE80211_SUPPORT_MESH.
  */
 const int ieee80211_opcap[IEEE80211_OPMODE_MAX] = {
 	[IEEE80211_M_IBSS]	= IEEE80211_C_IBSS,
 	[IEEE80211_M_WDS]	= IEEE80211_C_WDS,
 	[IEEE80211_M_STA]	= IEEE80211_C_STA,
 	[IEEE80211_M_AHDEMO]	= IEEE80211_C_AHDEMO,
 	[IEEE80211_M_HOSTAP]	= IEEE80211_C_HOSTAP,
 	[IEEE80211_M_MONITOR]	= IEEE80211_C_MONITOR,
 #ifdef IEEE80211_SUPPORT_MESH
 	[IEEE80211_M_MBSS]	= IEEE80211_C_MBSS,
 #endif
 };
 
 /* The all-ones (ff:ff:ff:ff:ff:ff) 802.11 address. */
 const uint8_t ieee80211broadcastaddr[IEEE80211_ADDR_LEN] =
 	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 /* Forward declarations for file-local helpers. */
 static	void ieee80211_syncflag_locked(struct ieee80211com *ic, int flag);
 static	void ieee80211_syncflag_ht_locked(struct ieee80211com *ic, int flag);
 static	void ieee80211_syncflag_ext_locked(struct ieee80211com *ic, int flag);
 static	void ieee80211_syncflag_vht_locked(struct ieee80211com *ic, int flag);
 static	int ieee80211_media_setup(struct ieee80211com *ic,
 		struct ifmedia *media, int caps, int addsta,
 		ifm_change_cb_t media_change, ifm_stat_cb_t media_stat);
 static	int media_status(enum ieee80211_opmode,
 		const struct ieee80211_channel *);
 static uint64_t ieee80211_get_counter(struct ifnet *, ift_counter);
 
 /* malloc(9) type declared for vap state. */
 MALLOC_DEFINE(M_80211_VAP, "80211vap", "802.11 vap state");
 
 /*
  * Default supported rates for 802.11 operation (in IEEE .5Mb units).
  *
  * B(r) tags rate r by OR-ing in IEEE80211_RATE_BASIC.  In each
  * initializer the leading number equals the count of rates listed
  * after it.  ieee80211_chan_init() installs these for any mode whose
  * rate set the driver left empty.
  */
 #define	B(r)	((r) | IEEE80211_RATE_BASIC)
 static const struct ieee80211_rateset ieee80211_rateset_11a =
 	{ 8, { B(12), 18, B(24), 36, B(48), 72, 96, 108 } };
 static const struct ieee80211_rateset ieee80211_rateset_half =
 	{ 8, { B(6), 9, B(12), 18, B(24), 36, 48, 54 } };
 static const struct ieee80211_rateset ieee80211_rateset_quarter =
 	{ 8, { B(3), 4, B(6), 9, B(12), 18, 24, 27 } };
 static const struct ieee80211_rateset ieee80211_rateset_11b =
 	{ 4, { B(2), B(4), B(11), B(22) } };
 /* NB: OFDM rates are handled specially based on mode */
 static const struct ieee80211_rateset ieee80211_rateset_11g =
 	{ 12, { B(2), B(4), B(11), B(22), 12, 18, 24, 36, 48, 72, 96, 108 } };
 #undef B
 
 static int set_vht_extchan(struct ieee80211_channel *c);
 
 /*
  * Fill in 802.11 available channel set, mark
  * all available channels as active, and pick
  * a default channel if not already specified.
  */
 void
 ieee80211_chan_init(struct ieee80211com *ic)
 {
 #define	DEFAULTRATES(m, def) do { \
 	if (ic->ic_sup_rates[m].rs_nrates == 0) \
 		ic->ic_sup_rates[m] = def; \
 } while (0)
 	struct ieee80211_channel *c;
 	int i;
 
 	KASSERT(0 < ic->ic_nchans && ic->ic_nchans <= IEEE80211_CHAN_MAX,
 		("invalid number of channels specified: %u", ic->ic_nchans));
 	memset(ic->ic_chan_avail, 0, sizeof(ic->ic_chan_avail));
 	memset(ic->ic_modecaps, 0, sizeof(ic->ic_modecaps));
 	setbit(ic->ic_modecaps, IEEE80211_MODE_AUTO);
 	for (i = 0; i < ic->ic_nchans; i++) {
 		c = &ic->ic_channels[i];
 		KASSERT(c->ic_flags != 0, ("channel with no flags"));
 		/*
 		 * Help drivers that work only with frequencies by filling
 		 * in IEEE channel #'s if not already calculated.  Note this
 		 * mimics similar work done in ieee80211_setregdomain when
 		 * changing regulatory state.
 		 */
 		if (c->ic_ieee == 0)
 			c->ic_ieee = ieee80211_mhz2ieee(c->ic_freq,c->ic_flags);
 
 		/*
 		 * Setup the HT40/VHT40 upper/lower bits.
 		 * The VHT80/... math is done elsewhere.
 		 */
 		if (IEEE80211_IS_CHAN_HT40(c) && c->ic_extieee == 0)
 			c->ic_extieee = ieee80211_mhz2ieee(c->ic_freq +
 			    (IEEE80211_IS_CHAN_HT40U(c) ? 20 : -20),
 			    c->ic_flags);
 
 		/* Update VHT math */
 		/*
 		 * XXX VHT again, note that this assumes VHT80/... channels
 		 * are legit already.
 		 */
 		set_vht_extchan(c);
 
 		/* default max tx power to max regulatory */
 		if (c->ic_maxpower == 0)
 			c->ic_maxpower = 2*c->ic_maxregpower;
 		setbit(ic->ic_chan_avail, c->ic_ieee);
 		/*
 		 * Identify mode capabilities.
 		 */
 		if (IEEE80211_IS_CHAN_A(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_11A);
 		if (IEEE80211_IS_CHAN_B(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_11B);
 		if (IEEE80211_IS_CHAN_ANYG(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_11G);
 		if (IEEE80211_IS_CHAN_FHSS(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_FH);
 		if (IEEE80211_IS_CHAN_108A(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_TURBO_A);
 		if (IEEE80211_IS_CHAN_108G(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_TURBO_G);
 		if (IEEE80211_IS_CHAN_ST(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_STURBO_A);
 		if (IEEE80211_IS_CHAN_HALF(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_HALF);
 		if (IEEE80211_IS_CHAN_QUARTER(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_QUARTER);
 		if (IEEE80211_IS_CHAN_HTA(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_11NA);
 		if (IEEE80211_IS_CHAN_HTG(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_11NG);
 		if (IEEE80211_IS_CHAN_VHTA(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_VHT_5GHZ);
 		if (IEEE80211_IS_CHAN_VHTG(c))
 			setbit(ic->ic_modecaps, IEEE80211_MODE_VHT_2GHZ);
 	}
 	/* initialize candidate channels to all available */
 	memcpy(ic->ic_chan_active, ic->ic_chan_avail,
 		sizeof(ic->ic_chan_avail));
 
 	/* sort channel table to allow lookup optimizations */
 	ieee80211_sort_channels(ic->ic_channels, ic->ic_nchans);
 
 	/* invalidate any previous state */
 	ic->ic_bsschan = IEEE80211_CHAN_ANYC;
 	ic->ic_prevchan = NULL;
 	ic->ic_csa_newchan = NULL;
 	/* arbitrarily pick the first channel */
 	ic->ic_curchan = &ic->ic_channels[0];
 	ic->ic_rt = ieee80211_get_ratetable(ic->ic_curchan);
 
 	/* fillin well-known rate sets if driver has not specified */
 	DEFAULTRATES(IEEE80211_MODE_11B,	 ieee80211_rateset_11b);
 	DEFAULTRATES(IEEE80211_MODE_11G,	 ieee80211_rateset_11g);
 	DEFAULTRATES(IEEE80211_MODE_11A,	 ieee80211_rateset_11a);
 	DEFAULTRATES(IEEE80211_MODE_TURBO_A,	 ieee80211_rateset_11a);
 	DEFAULTRATES(IEEE80211_MODE_TURBO_G,	 ieee80211_rateset_11g);
 	DEFAULTRATES(IEEE80211_MODE_STURBO_A,	 ieee80211_rateset_11a);
 	DEFAULTRATES(IEEE80211_MODE_HALF,	 ieee80211_rateset_half);
 	DEFAULTRATES(IEEE80211_MODE_QUARTER,	 ieee80211_rateset_quarter);
 	DEFAULTRATES(IEEE80211_MODE_11NA,	 ieee80211_rateset_11a);
 	DEFAULTRATES(IEEE80211_MODE_11NG,	 ieee80211_rateset_11g);
 	DEFAULTRATES(IEEE80211_MODE_VHT_2GHZ,	 ieee80211_rateset_11g);
 	DEFAULTRATES(IEEE80211_MODE_VHT_5GHZ,	 ieee80211_rateset_11a);
 
 	/*
 	 * Setup required information to fill the mcsset field, if driver did
 	 * not. Assume a 2T2R setup for historic reasons.
 	 */
 	if (ic->ic_rxstream == 0)
 		ic->ic_rxstream = 2;
 	if (ic->ic_txstream == 0)
 		ic->ic_txstream = 2;
 
 	ieee80211_init_suphtrates(ic);
 
 	/*
 	 * Set auto mode to reset active channel state and any desired channel.
 	 */
 	(void) ieee80211_setmode(ic, IEEE80211_MODE_AUTO);
 #undef DEFAULTRATES
 }
 
 /*
  * Placeholder ic_update_mcast callback installed by ieee80211_ifattach();
  * it only prints a message saying a real callback is needed.
  */
 static void
 null_update_mcast(struct ieee80211com *ic)
 {
 	(void)ic_printf(ic, "need multicast update callback\n");
 }
 
 /*
  * Placeholder ic_update_promisc callback installed by
  * ieee80211_ifattach(); it only prints a message saying a real callback
  * is needed.
  */
 static void
 null_update_promisc(struct ieee80211com *ic)
 {
 	(void)ic_printf(ic, "need promiscuous mode update callback\n");
 }
 
 /*
  * Placeholder ic_update_chw callback installed by ieee80211_ifattach();
  * it only prints a message, tagged with this function's name, saying a
  * real callback is needed.
  */
 static void
 null_update_chw(struct ieee80211com *ic)
 {
 	(void)ic_printf(ic, "%s: need callback\n", __func__);
 }
 
 /*
  * printf-style output prefixed with "<ic_name>: ".  Returns the sum of
  * the values returned by printf() for the prefix and vprintf() for the
  * formatted message.
  */
 int
 ic_printf(struct ieee80211com *ic, const char * fmt, ...)
 {
 	va_list args;
 	int nprinted;
 
 	va_start(args, fmt);
 	nprinted = printf("%s: ", ic->ic_name);
 	nprinted += vprintf(fmt, args);
 	va_end(args);
 
 	return (nprinted);
 }
 
 /*
  * List of attached ieee80211com instances, linked through ic_next.
  * ic_list_mtx is held around every insertion, removal and traversal of
  * the list in this file.
  */
 static LIST_HEAD(, ieee80211com) ic_head = LIST_HEAD_INITIALIZER(ic_head);
 static struct mtx ic_list_mtx;
 MTX_SYSINIT(ic_list, &ic_list_mtx, "ieee80211com list", MTX_DEF);
 
 /*
  * Sysctl handler that reports the names of all ieee80211com instances
  * on ic_head as a single space-separated string.
  *
  * Returns 0 on success, or the error from wiring the user buffer or
  * from finishing the sbuf.
  */
 static int
 sysctl_ieee80211coms(SYSCTL_HANDLER_ARGS)
 {
 	struct ieee80211com *ic;
 	struct sbuf sb;
 	const char *sp;		/* separator; only ever points at literals */
 	int error;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 	sbuf_new_for_sysctl(&sb, NULL, 8, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	/* No separator before the first name, a space before the rest. */
 	sp = "";
 	mtx_lock(&ic_list_mtx);
 	LIST_FOREACH(ic, &ic_head, ic_next) {
 		sbuf_printf(&sb, "%s%s", sp, ic->ic_name);
 		sp = " ";
 	}
 	mtx_unlock(&ic_list_mtx);
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error);
 }
 
 /* Read-only string node "devices" under _net_wlan, served by the handler above. */
 SYSCTL_PROC(_net_wlan, OID_AUTO, devices,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_ieee80211coms, "A", "names of available 802.11 devices");
 
 /*
  * Attach/setup the common net80211 state.  Called by the driver at
  * attach time, prior to creating any vap's.
  *
  * Initializes the com locks and vap list, creates the per-com taskqueue
  * and error counters, sets up the channel table, installs placeholder
  * update callbacks, attaches each net80211 subsystem, and finally adds
  * the com to the global list.
  */
 void
 ieee80211_ifattach(struct ieee80211com *ic)
 {
 
 	IEEE80211_LOCK_INIT(ic, ic->ic_name);
 	IEEE80211_TX_LOCK_INIT(ic, ic->ic_name);
 	TAILQ_INIT(&ic->ic_vaps);
 
 	/*
 	 * Create a taskqueue for all state changes.
 	 * NB: ieee80211_ifdetach() treats ic_tq == NULL as "never attached".
 	 */
 	ic->ic_tq = taskqueue_create("ic_taskq",
 	    IEEE80211_M_WAITOK | IEEE80211_M_ZERO,
 	    taskqueue_thread_enqueue, &ic->ic_tq);
 	taskqueue_start_threads(&ic->ic_tq, 1, PI_NET, "%s net80211 taskq",
 	    ic->ic_name);
 	/* Device-level error counters; see ieee80211_get_counter(). */
 	ic->ic_ierrors = counter_u64_alloc(IEEE80211_M_WAITOK);
 	ic->ic_oerrors = counter_u64_alloc(IEEE80211_M_WAITOK);
 	/*
 	 * Fill in 802.11 available channel set, mark all
 	 * available channels as active, and pick a default
 	 * channel if not already specified.
 	 */
 	ieee80211_chan_init(ic);
 
 	/* Placeholder callbacks that only print a "need callback" message. */
 	ic->ic_update_mcast = null_update_mcast;
 	ic->ic_update_promisc = null_update_promisc;
 	ic->ic_update_chw = null_update_chw;
 
 	ic->ic_hash_key = arc4random();
 	ic->ic_bintval = IEEE80211_BINTVAL_DEFAULT;
 	ic->ic_lintval = ic->ic_bintval;
 	ic->ic_txpowlimit = IEEE80211_TXPOWER_MAX;
 
 	/* Attach the individual net80211 subsystems. */
 	ieee80211_crypto_attach(ic);
 	ieee80211_node_attach(ic);
 	ieee80211_power_attach(ic);
 	ieee80211_proto_attach(ic);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	ieee80211_superg_attach(ic);
 #endif
 	ieee80211_ht_attach(ic);
 	ieee80211_vht_attach(ic);
 	ieee80211_scan_attach(ic);
 	ieee80211_regdomain_attach(ic);
 	ieee80211_dfs_attach(ic);
 
 	ieee80211_sysctl_attach(ic);
 
 	/* Add the com to the global list last, after setup above is done. */
 	mtx_lock(&ic_list_mtx);
 	LIST_INSERT_HEAD(&ic_head, ic, ic_next);
 	mtx_unlock(&ic_list_mtx);
 }
 
 /*
  * Detach net80211 state on device detach.  Tear down
  * all vap's and reclaim all common state prior to the
  * device state going away.  Note we may call back into
  * driver; it must be prepared for this.
  *
  * Returns immediately, without touching anything, when ic_tq is NULL.
  */
 void
 ieee80211_ifdetach(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	/*
 	 * We use this as an indicator that ifattach never had a chance to be
 	 * called, e.g. early driver attach failed and ifdetach was called
 	 * during subsequent detach.  Never fear, for we have nothing to do
 	 * here.
 	 */
 	if (ic->ic_tq == NULL)
 		return;
 
 	/* Take the com off the global list before tearing anything down. */
 	mtx_lock(&ic_list_mtx);
 	LIST_REMOVE(ic, ic_next);
 	mtx_unlock(&ic_list_mtx);
 
 	/* Wait for the restart task on taskqueue_thread to finish. */
 	taskqueue_drain(taskqueue_thread, &ic->ic_restart_task);
 
 	/*
 	 * The VAP is responsible for setting and clearing
 	 * the VIMAGE context.
 	 */
 	while ((vap = TAILQ_FIRST(&ic->ic_vaps)) != NULL) {
 		ieee80211_com_vdetach(vap);
 		ieee80211_vap_destroy(vap);
 	}
 	ieee80211_waitfor_parent(ic);
 
 	/* Detach the subsystems that ieee80211_ifattach() attached. */
 	ieee80211_sysctl_detach(ic);
 	ieee80211_dfs_detach(ic);
 	ieee80211_regdomain_detach(ic);
 	ieee80211_scan_detach(ic);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	ieee80211_superg_detach(ic);
 #endif
 	ieee80211_vht_detach(ic);
 	ieee80211_ht_detach(ic);
 	/* NB: must be called before ieee80211_node_detach */
 	ieee80211_proto_detach(ic);
 	ieee80211_crypto_detach(ic);
 	ieee80211_power_detach(ic);
 	ieee80211_node_detach(ic);
 
 	counter_u64_free(ic->ic_ierrors);
 	counter_u64_free(ic->ic_oerrors);
 
 	/* Release the taskqueue and the locks created at attach time. */
 	taskqueue_free(ic->ic_tq);
 	IEEE80211_TX_LOCK_DESTROY(ic);
 	IEEE80211_LOCK_DESTROY(ic);
 }
 
 struct ieee80211com *
 ieee80211_find_com(const char *name)
 {
 	struct ieee80211com *ic;
 
 	mtx_lock(&ic_list_mtx);
 	LIST_FOREACH(ic, &ic_head, ic_next)
 		if (strcmp(ic->ic_name, name) == 0)
 			break;
 	mtx_unlock(&ic_list_mtx);
 
 	return (ic);
 }
 
 /*
  * Invoke f(arg, ic) for every attached ieee80211com.  The callback runs
  * with the list mutex held.
  */
 void
 ieee80211_iterate_coms(ieee80211_com_iter_func *f, void *arg)
 {
 	struct ieee80211com *cur;
 
 	mtx_lock(&ic_list_mtx);
 	LIST_FOREACH(cur, &ic_head, ic_next) {
 		f(arg, cur);
 	}
 	mtx_unlock(&ic_list_mtx);
 }
 
 /*
  * Default reset method for the ioctl support.  It is invoked after a
  * state change in the 802.11 layer that should be pushed to the
  * hardware without re-initializing the 802.11 state machine (e.g.
  * without rescanning for an ap).  This default unconditionally returns
  * ENETRESET, which should make the driver re-initialize the device;
  * drivers may override it with something more optimized.
  */
 static int
 default_reset(struct ieee80211vap *vap, u_long cmd)
 {
 
 	return (ENETRESET);
 }
 
 /*
  * Default handler for a change of the VAP default TX key index: just
  * record the new index in the vap.
  *
  * Drivers supporting TX offload together with hardware encryption
  * offload may need to learn about key index changes separately from
  * the key update itself.
  */
 static void
 default_update_deftxkey(struct ieee80211vap *vap, ieee80211_keyix kid)
 {
 	/*
 	 * XXX assert validity
 	 * XXX assert we're in a key update block
 	 */
 	vap->iv_def_txkey = kid;
 }
 
 /*
  * if_get_counter method for vap interfaces.  Starts from the default
  * per-ifnet counter value and, for the input and output error
  * counters, adds the errors accumulated on the underlying device.
  */
 static uint64_t
 ieee80211_get_counter(struct ifnet *ifp, ift_counter cnt)
 {
 	struct ieee80211vap *vap = ifp->if_softc;
 	struct ieee80211com *ic = vap->iv_ic;
 	uint64_t total;
 
 	total = if_get_counter_default(ifp, cnt);
 	if (cnt == IFCOUNTER_OERRORS)
 		total += counter_u64_fetch(ic->ic_oerrors);
 	else if (cnt == IFCOUNTER_IERRORS)
 		total += counter_u64_fetch(ic->ic_ierrors);
 
 	return (total);
 }
 
 /*
  * Prepare a vap for use.  Drivers use this call to
  * setup net80211 state in new vap's prior attaching
  * them with ieee80211_vap_attach (below).
  *
  * An ifnet is allocated for the vap and wired to the net80211
  * transmit/qflush/ioctl/init/counter methods; flags and capabilities
  * are inherited from the parent ic; per-opmode and clone-flag
  * adjustments are applied; and finally the per-subsystem vattach
  * hooks are run.
  *
  * ic      parent device state
  * vap     vap to initialize
  * name    interface name handed to if_initname()
  * unit    interface unit handed to if_initname()
  * opmode  operating mode of the new vap
  * flags   IEEE80211_CLONE_* flags
  * bssid   far-end bssid; only read when opmode is IEEE80211_M_WDS
  *
  * Returns 0 on success or ENOMEM if the ifnet cannot be allocated.
  */
 int
 ieee80211_vap_setup(struct ieee80211com *ic, struct ieee80211vap *vap,
     const char name[IFNAMSIZ], int unit, enum ieee80211_opmode opmode,
     int flags, const uint8_t bssid[IEEE80211_ADDR_LEN])
 {
 	struct ifnet *ifp;
 
 	ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		ic_printf(ic, "%s: unable to allocate ifnet\n", __func__);
 		return ENOMEM;
 	}
 	if_initname(ifp, name, unit);
 	ifp->if_softc = vap;			/* back pointer */
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
 	ifp->if_transmit = ieee80211_vap_transmit;
 	ifp->if_qflush = ieee80211_vap_qflush;
 	ifp->if_ioctl = ieee80211_ioctl;
 	ifp->if_init = ieee80211_init;
 	ifp->if_get_counter = ieee80211_get_counter;
 
 	vap->iv_ifp = ifp;
 	vap->iv_ic = ic;
 	vap->iv_flags = ic->ic_flags;		/* propagate common flags */
 	vap->iv_flags_ext = ic->ic_flags_ext;
 	vap->iv_flags_ven = ic->ic_flags_ven;
 	/* NB: opmode capability bits are re-added per opmode below */
 	vap->iv_caps = ic->ic_caps &~ IEEE80211_C_OPMODE;
 
 	/* 11n capabilities - XXX methodize */
 	vap->iv_htcaps = ic->ic_htcaps;
 	vap->iv_htextcaps = ic->ic_htextcaps;
 
 	/* 11ac capabilities - XXX methodize */
 	vap->iv_vhtcaps = ic->ic_vhtcaps;
 	vap->iv_vhtextcaps = ic->ic_vhtextcaps;
 
 	vap->iv_opmode = opmode;
 	vap->iv_caps |= ieee80211_opcap[opmode];
 	/* start with the device address; ieee80211_vap_attach rewrites it */
 	IEEE80211_ADDR_COPY(vap->iv_myaddr, ic->ic_macaddr);
 	switch (opmode) {
 	case IEEE80211_M_WDS:
 		/*
 		 * WDS links must specify the bssid of the far end.
 		 * For legacy operation this is a static relationship.
 		 * For non-legacy operation the station must associate
 		 * and be authorized to pass traffic.  Plumbing the
 		 * vap to the proper node happens when the vap
 		 * transitions to RUN state.
 		 */
 		IEEE80211_ADDR_COPY(vap->iv_des_bssid, bssid);
 		vap->iv_flags |= IEEE80211_F_DESBSSID;
 		if (flags & IEEE80211_CLONE_WDSLEGACY)
 			vap->iv_flags_ext |= IEEE80211_FEXT_WDSLEGACY;
 		break;
 #ifdef IEEE80211_SUPPORT_TDMA
 	case IEEE80211_M_AHDEMO:
 		if (flags & IEEE80211_CLONE_TDMA) {
 			/* NB: checked before clone operation allowed */
 			KASSERT(ic->ic_caps & IEEE80211_C_TDMA,
 			    ("not TDMA capable, ic_caps 0x%x", ic->ic_caps));
 			/*
 			 * Propagate TDMA capability to mark vap; this
 			 * cannot be removed and is used to distinguish
 			 * regular ahdemo operation from ahdemo+tdma.
 			 */
 			vap->iv_caps |= IEEE80211_C_TDMA;
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 	/* auto-enable s/w beacon miss support */
 	if (flags & IEEE80211_CLONE_NOBEACONS)
 		vap->iv_flags_ext |= IEEE80211_FEXT_SWBMISS;
 	/* auto-generated or user supplied MAC address */
 	if (flags & (IEEE80211_CLONE_BSSID|IEEE80211_CLONE_MACADDR))
 		vap->iv_flags_ext |= IEEE80211_FEXT_UNIQMAC;
 	/*
 	 * Enable various functionality by default if we're
 	 * capable; the driver can override us if it knows better.
 	 */
 	if (vap->iv_caps & IEEE80211_C_WME)
 		vap->iv_flags |= IEEE80211_F_WME;
 	if (vap->iv_caps & IEEE80211_C_BURST)
 		vap->iv_flags |= IEEE80211_F_BURST;
 	/* NB: bg scanning only makes sense for station mode right now */
 	if (vap->iv_opmode == IEEE80211_M_STA &&
 	    (vap->iv_caps & IEEE80211_C_BGSCAN))
 		vap->iv_flags |= IEEE80211_F_BGSCAN;
 	vap->iv_flags |= IEEE80211_F_DOTH;	/* XXX no cap, just ena */
 	/* NB: DFS support only makes sense for ap mode right now */
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP &&
 	    (vap->iv_caps & IEEE80211_C_DFS))
 		vap->iv_flags_ext |= IEEE80211_FEXT_DFS;
 	/* NB: only flip on U-APSD for hostap/sta for now */
 	if ((vap->iv_opmode == IEEE80211_M_STA)
 	    || (vap->iv_opmode == IEEE80211_M_HOSTAP)) {
 		if (vap->iv_caps & IEEE80211_C_UAPSD)
 			vap->iv_flags_ext |= IEEE80211_FEXT_UAPSD;
 	}
 
 	vap->iv_des_chan = IEEE80211_CHAN_ANYC;		/* any channel is ok */
 	vap->iv_bmissthreshold = IEEE80211_HWBMISS_DEFAULT;
 	vap->iv_dtim_period = IEEE80211_DTIM_DEFAULT;
 	/*
 	 * Install a default reset method for the ioctl support;
 	 * the driver can override this.
 	 */
 	vap->iv_reset = default_reset;
 
 	/*
 	 * Install a default crypto key update method, the driver
 	 * can override this.
 	 */
 	vap->iv_update_deftxkey = default_update_deftxkey;
 
 	/*
 	 * Per-subsystem attach hooks.  NB: ieee80211_vap_detach calls
 	 * the matching vdetach hooks.
 	 */
 	ieee80211_sysctl_vattach(vap);
 	ieee80211_crypto_vattach(vap);
 	ieee80211_node_vattach(vap);
 	ieee80211_power_vattach(vap);
 	ieee80211_proto_vattach(vap);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	ieee80211_superg_vattach(vap);
 #endif
 	ieee80211_ht_vattach(vap);
 	ieee80211_vht_vattach(vap);
 	ieee80211_scan_vattach(vap);
 	ieee80211_regdomain_vattach(vap);
 	ieee80211_radiotap_vattach(vap);
 	ieee80211_vap_reset_erp(vap);
 	ieee80211_ratectl_set(vap, IEEE80211_RATECTL_NONE);
 
 	return 0;
 }
 
 /*
  * Activate a vap.  State should have been prepared with a
  * call to ieee80211_vap_setup and by the driver.  On return
  * from this call the vap is ready for use.
  *
  * vap           vap to activate
  * media_change  ifmedia change callback handed to ieee80211_media_setup
  * media_stat    ifmedia status callback handed to ieee80211_media_setup
  * macaddr       link-level address handed to ether_ifattach
  *
  * NB: this function always returns 1.
  */
 int
 ieee80211_vap_attach(struct ieee80211vap *vap, ifm_change_cb_t media_change,
     ifm_stat_cb_t media_stat, const uint8_t macaddr[IEEE80211_ADDR_LEN])
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifmediareq imr;
 	int maxrate;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 	    "%s: %s parent %s flags 0x%x flags_ext 0x%x\n",
 	    __func__, ieee80211_opmode_name[vap->iv_opmode],
 	    ic->ic_name, vap->iv_flags, vap->iv_flags_ext);
 
 	/*
 	 * Do late attach work that cannot happen until after
 	 * the driver has had a chance to override defaults.
 	 */
 	ieee80211_node_latevattach(vap);
 	ieee80211_power_latevattach(vap);
 
 	maxrate = ieee80211_media_setup(ic, &vap->iv_media, vap->iv_caps,
 	    vap->iv_opmode == IEEE80211_M_STA, media_change, media_stat);
 	/* NB: imr.ifm_active is consumed by ifmedia_set just below */
 	ieee80211_media_status(ifp, &imr);
 	/* NB: strip explicit mode; we're actually in autoselect */
 	ifmedia_set(&vap->iv_media,
 	    imr.ifm_active &~ (IFM_MMASK | IFM_IEEE80211_TURBO));
 	/* NB: if_baudrate is left untouched when maxrate is 0 */
 	if (maxrate)
 		ifp->if_baudrate = IF_Mbps(maxrate);
 
 	ether_ifattach(ifp, macaddr);
 	/* record the address the ifnet ended up with */
 	IEEE80211_ADDR_COPY(vap->iv_myaddr, IF_LLADDR(ifp));
 	/* hook output method setup by ether_ifattach */
 	vap->iv_output = ifp->if_output;
 	ifp->if_output = ieee80211_output;
 	/* NB: if_mtu set by ether_ifattach to ETHERMTU */
 
 	/*
 	 * Add the vap to the parent's list and recompute the
 	 * aggregated ic flag bits now that it is a member.
 	 */
 	IEEE80211_LOCK(ic);
 	TAILQ_INSERT_TAIL(&ic->ic_vaps, vap, iv_next);
 	ieee80211_syncflag_locked(ic, IEEE80211_F_WME);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	ieee80211_syncflag_locked(ic, IEEE80211_F_TURBOP);
 #endif
 	ieee80211_syncflag_locked(ic, IEEE80211_F_PCF);
 	ieee80211_syncflag_locked(ic, IEEE80211_F_BURST);
 	ieee80211_syncflag_ht_locked(ic, IEEE80211_FHT_HT);
 	ieee80211_syncflag_ht_locked(ic, IEEE80211_FHT_USEHT40);
 
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_VHT);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT40);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT80);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT160);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT80P80);
 	IEEE80211_UNLOCK(ic);
 
 	return 1;
 }
 
 /*
  * Tear down vap state and reclaim the ifnet.
  * The driver is assumed to have prepared for
  * this; e.g. by turning off interrupts for the
  * underlying device.
  *
  * The sequence is: detach the ifnet from the ethernet layer, stop
  * the vap, drain deferred tasks, unlink the vap from the parent
  * (resyncing the aggregated ic flags), run the per-subsystem vdetach
  * hooks and finally free the ifnet.  The whole operation runs with
  * the ifnet's vnet set as the current vnet.
  */
 void
 ieee80211_vap_detach(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	CURVNET_SET(ifp->if_vnet);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s parent %s\n",
 	    __func__, ieee80211_opmode_name[vap->iv_opmode], ic->ic_name);
 
 	/* NB: bpfdetach is called by ether_ifdetach and claims all taps */
 	ether_ifdetach(ifp);
 
 	ieee80211_stop(vap);
 
 	/*
 	 * Flush any deferred vap tasks.
 	 */
 	ieee80211_draintask(ic, &vap->iv_nstate_task);
 	ieee80211_draintask(ic, &vap->iv_swbmiss_task);
 	ieee80211_draintask(ic, &vap->iv_wme_task);
 	ieee80211_draintask(ic, &ic->ic_parent_task);
 
 	/* XXX band-aid until ifnet handles this for us */
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 
 	/*
 	 * Unlink the vap first so the syncflag calls below only
 	 * consider the vaps that remain.
 	 */
 	IEEE80211_LOCK(ic);
 	KASSERT(vap->iv_state == IEEE80211_S_INIT , ("vap still running"));
 	TAILQ_REMOVE(&ic->ic_vaps, vap, iv_next);
 	ieee80211_syncflag_locked(ic, IEEE80211_F_WME);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	ieee80211_syncflag_locked(ic, IEEE80211_F_TURBOP);
 #endif
 	ieee80211_syncflag_locked(ic, IEEE80211_F_PCF);
 	ieee80211_syncflag_locked(ic, IEEE80211_F_BURST);
 	ieee80211_syncflag_ht_locked(ic, IEEE80211_FHT_HT);
 	ieee80211_syncflag_ht_locked(ic, IEEE80211_FHT_USEHT40);
 
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_VHT);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT40);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT80);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT160);
 	ieee80211_syncflag_vht_locked(ic, IEEE80211_FVHT_USEVHT80P80);
 
 	/* NB: this handles the bpfdetach done by ether_ifdetach above */
 	ieee80211_syncflag_ext_locked(ic, IEEE80211_FEXT_BPF);
 	/* drop this vap's contribution to the parent's promisc/allmulti counts */
 	if (vap->iv_ifflags & IFF_PROMISC)
 		ieee80211_promisc(vap, false);
 	if (vap->iv_ifflags & IFF_ALLMULTI)
 		ieee80211_allmulti(vap, false);
 	IEEE80211_UNLOCK(ic);
 
 	ifmedia_removeall(&vap->iv_media);
 
 	/* per-subsystem detach hooks */
 	ieee80211_radiotap_vdetach(vap);
 	ieee80211_regdomain_vdetach(vap);
 	ieee80211_scan_vdetach(vap);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	ieee80211_superg_vdetach(vap);
 #endif
 	ieee80211_vht_vdetach(vap);
 	ieee80211_ht_vdetach(vap);
 	/* NB: must be before ieee80211_node_vdetach */
 	ieee80211_proto_vdetach(vap);
 	ieee80211_crypto_vdetach(vap);
 	ieee80211_power_vdetach(vap);
 	ieee80211_node_vdetach(vap);
 	ieee80211_sysctl_vdetach(vap);
 
 	if_free(ifp);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Maintain the parent's count of vaps in promiscuous mode.  The
  * parent's promisc task is scheduled only when the count moves
  * from 0 to 1 or from 1 to 0.  Caller must hold the com lock.
  */
 void
 ieee80211_promisc(struct ieee80211vap *vap, bool on)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	if (!on) {
 		KASSERT(ic->ic_promisc > 0, ("%s: ic %p not promisc",
 		    __func__, ic));
 		ic->ic_promisc--;
 		/* other vaps still want promisc; nothing to change */
 		if (ic->ic_promisc != 0)
 			return;
 	} else {
 		ic->ic_promisc++;
 		/* parent is already promisc on behalf of another vap */
 		if (ic->ic_promisc != 1)
 			return;
 	}
 	ieee80211_runtask(ic, &ic->ic_promisc_task);
 }
 
 /*
  * Maintain the parent's count of vaps in allmulti mode.  The
  * parent's mcast task is scheduled only when the count moves
  * from 0 to 1 or from 1 to 0.  Caller must hold the com lock.
  */
 void
 ieee80211_allmulti(struct ieee80211vap *vap, bool on)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	if (!on) {
 		KASSERT(ic->ic_allmulti > 0, ("%s: ic %p not allmulti",
 		    __func__, ic));
 		ic->ic_allmulti--;
 		/* other vaps still want allmulti; nothing to change */
 		if (ic->ic_allmulti != 0)
 			return;
 	} else {
 		ic->ic_allmulti++;
 		/* parent is already allmulti on behalf of another vap */
 		if (ic->ic_allmulti != 1)
 			return;
 	}
 	ieee80211_runtask(ic, &ic->ic_mcast_task);
 }
 
 /*
  * Synchronize flag bit state in the com structure
  * according to the state of all vap's.  This is used,
  * for example, to handle state changes via ioctls.
  */
 static void
 ieee80211_syncflag_locked(struct ieee80211com *ic, int flag)
 {
 	struct ieee80211vap *vap;
 	int bit;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	bit = 0;
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_flags & flag) {
 			bit = 1;
 			break;
 		}
 	if (bit)
 		ic->ic_flags |= flag;
 	else
 		ic->ic_flags &= ~flag;
 }
 
 /*
  * Set (flag >= 0) or clear (flag < 0, bit given as -flag) a bit in
  * the vap's iv_flags, then resynchronize the same bit in ic_flags.
  */
 void
 ieee80211_syncflag(struct ieee80211vap *vap, int flag)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int bits;
 
 	IEEE80211_LOCK(ic);
 	if (flag >= 0) {
 		bits = flag;
 		vap->iv_flags |= bits;
 	} else {
 		bits = -flag;
 		vap->iv_flags &= ~bits;
 	}
 	ieee80211_syncflag_locked(ic, bits);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Synchronize flags_ht bit state in the com structure
  * according to the state of all vap's.  This is used,
  * for example, to handle state changes via ioctls.
  */
 static void
 ieee80211_syncflag_ht_locked(struct ieee80211com *ic, int flag)
 {
 	struct ieee80211vap *vap;
 	int bit;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	bit = 0;
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_flags_ht & flag) {
 			bit = 1;
 			break;
 		}
 	if (bit)
 		ic->ic_flags_ht |= flag;
 	else
 		ic->ic_flags_ht &= ~flag;
 }
 
 /*
  * Set (flag >= 0) or clear (flag < 0, bit given as -flag) a bit in
  * the vap's iv_flags_ht, then resynchronize the same bit in
  * ic_flags_ht.
  */
 void
 ieee80211_syncflag_ht(struct ieee80211vap *vap, int flag)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int bits;
 
 	IEEE80211_LOCK(ic);
 	if (flag >= 0) {
 		bits = flag;
 		vap->iv_flags_ht |= bits;
 	} else {
 		bits = -flag;
 		vap->iv_flags_ht &= ~bits;
 	}
 	ieee80211_syncflag_ht_locked(ic, bits);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Synchronize flags_vht bit state in the com structure
  * according to the state of all vap's.  This is used,
  * for example, to handle state changes via ioctls.
  */
 static void
 ieee80211_syncflag_vht_locked(struct ieee80211com *ic, int flag)
 {
 	struct ieee80211vap *vap;
 	int bit;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	bit = 0;
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_flags_vht & flag) {
 			bit = 1;
 			break;
 		}
 	if (bit)
 		ic->ic_flags_vht |= flag;
 	else
 		ic->ic_flags_vht &= ~flag;
 }
 
 /*
  * Set (flag >= 0) or clear (flag < 0, bit given as -flag) a bit in
  * the vap's iv_flags_vht, then resynchronize the same bit in
  * ic_flags_vht.
  */
 void
 ieee80211_syncflag_vht(struct ieee80211vap *vap, int flag)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int bits;
 
 	IEEE80211_LOCK(ic);
 	if (flag >= 0) {
 		bits = flag;
 		vap->iv_flags_vht |= bits;
 	} else {
 		bits = -flag;
 		vap->iv_flags_vht &= ~bits;
 	}
 	ieee80211_syncflag_vht_locked(ic, bits);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Synchronize flags_ext bit state in the com structure
  * according to the state of all vap's.  This is used,
  * for example, to handle state changes via ioctls.
  */
 static void
 ieee80211_syncflag_ext_locked(struct ieee80211com *ic, int flag)
 {
 	struct ieee80211vap *vap;
 	int bit;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	bit = 0;
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_flags_ext & flag) {
 			bit = 1;
 			break;
 		}
 	if (bit)
 		ic->ic_flags_ext |= flag;
 	else
 		ic->ic_flags_ext &= ~flag;
 }
 
 /*
  * Set (flag >= 0) or clear (flag < 0, bit given as -flag) a bit in
  * the vap's iv_flags_ext, then resynchronize the same bit in
  * ic_flags_ext.
  */
 void
 ieee80211_syncflag_ext(struct ieee80211vap *vap, int flag)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int bits;
 
 	IEEE80211_LOCK(ic);
 	if (flag >= 0) {
 		bits = flag;
 		vap->iv_flags_ext |= bits;
 	} else {
 		bits = -flag;
 		vap->iv_flags_ext &= ~bits;
 	}
 	ieee80211_syncflag_ext_locked(ic, bits);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Channel number mapping used by ieee80211_mhz2ieee for GSM-flagged
  * frequencies.  The frequency is scaled by 10 and offset by 5, 10 or
  * 20 depending on the quarter/half/full width flag before being
  * reduced to a channel number.
  */
 static __inline int
 mapgsm(u_int freq, u_int flags)
 {
 	u_int width;
 
 	if (flags & IEEE80211_CHAN_QUARTER)
 		width = 5;
 	else if (flags & IEEE80211_CHAN_HALF)
 		width = 10;
 	else
 		width = 20;
 	/* NB: there is no 907/20 wide but leave room */
 	return (freq * 10 + width - 906*10) / 5;
 }
 
 /*
  * Channel number mapping used by ieee80211_mhz2ieee for frequencies
  * strictly between 4940 and 4990 MHz.  Frequencies with a remainder
  * of 2 modulo 5 get an extra half step.  NB: flags is unused.
  */
 static __inline int
 mappsb(u_int freq, u_int flags)
 {
 	u_int adj = ((freq % 5) == 2) ? 5 : 0;
 
 	return 37 + (freq * 10 + adj - 49400) / 5;
 }
 
 /*
  * Convert a frequency in MHz to an IEEE channel number.  The band
  * bits in flags (GSM, 2GHz, 5GHz) select the mapping; with none of
  * them set the band is guessed from the frequency itself.
  */
 int
 ieee80211_mhz2ieee(u_int freq, u_int flags)
 {
 #define	IS_FREQ_IN_PSB(_freq) ((_freq) > 4940 && (_freq) < 4990)
 	if (flags & IEEE80211_CHAN_GSM)
 		return mapgsm(freq, flags);
 
 	if (flags & IEEE80211_CHAN_2GHZ) {	/* 2GHz band */
 		if (freq == 2484)
 			return 14;
 		if (freq < 2484)
 			return ((int) freq - 2407) / 5;
 		return 15 + ((freq - 2512) / 20);
 	}
 
 	if (flags & IEEE80211_CHAN_5GHZ) {	/* 5Ghz band */
 		if (freq > 5000)
 			return (freq - 5000) / 5;
 		/* XXX check regdomain? */
 		if (IS_FREQ_IN_PSB(freq))
 			return mappsb(freq, flags);
 		return (freq - 4000) / 5;
 	}
 
 	/* no band hint: either, guess */
 	if (freq == 2484)
 		return 14;
 	if (freq < 2484) {
 		if (907 <= freq && freq <= 922)
 			return mapgsm(freq, flags);
 		return ((int) freq - 2407) / 5;
 	}
 	if (freq >= 5000)
 		return (freq - 5000) / 5;
 	if (IS_FREQ_IN_PSB(freq))
 		return mappsb(freq, flags);
 	if (freq > 4900)
 		return (freq - 4000) / 5;
 	return 15 + ((freq - 2512) / 20);
 #undef IS_FREQ_IN_PSB
 }
 
 /*
  * Return the IEEE channel number of a channel.  A NULL channel is
  * reported via ic_printf and yields 0; the IEEE80211_CHAN_ANYC
  * wildcard yields IEEE80211_CHAN_ANY.
  */
 int
 ieee80211_chan2ieee(struct ieee80211com *ic, const struct ieee80211_channel *c)
 {
 	if (c == NULL) {
 		ic_printf(ic, "invalid channel (NULL)\n");
 		return 0;		/* XXX */
 	}
 	if (c == IEEE80211_CHAN_ANYC)
 		return IEEE80211_CHAN_ANY;
 	return c->ic_ieee;
 }
 
 /*
  * Convert an IEEE channel number to a frequency in MHz.  The band
  * bits in flags (GSM, 2GHz, 5GHz) select the mapping; with none of
  * them set the band is guessed from the channel number.
  */
 u_int
 ieee80211_ieee2mhz(u_int chan, u_int flags)
 {
 	if (flags & IEEE80211_CHAN_GSM)
 		return 907 + 5 * (chan / 10);
 
 	if (flags & IEEE80211_CHAN_2GHZ) {	/* 2GHz band */
 		if (chan == 14)
 			return 2484;
 		if (chan < 14)
 			return 2407 + chan*5;
 		return 2512 + ((chan-15)*20);
 	}
 
 	if (flags & IEEE80211_CHAN_5GHZ) {	/* 5Ghz band */
 		if ((flags & (IEEE80211_CHAN_HALF|IEEE80211_CHAN_QUARTER)) == 0)
 			return 5000 + (chan*5);
 		/* half/quarter rate channels are numbered from 37 */
 		chan -= 37;
 		return 4940 + chan*5 + (chan % 5 ? 2 : 0);
 	}
 
 	/* no band hint: either, guess */
 	/* XXX can't distinguish PSB+GSM channels */
 	if (chan == 14)
 		return 2484;
 	if (chan < 14)				/* 0-13 */
 		return 2407 + chan*5;
 	if (chan < 27)				/* 15-26 */
 		return 2512 + ((chan-15)*20);
 	return 5000 + (chan*5);
 }
 
 /*
  * Fill in ic_extieee, the HT40 extension channel number: four
  * channels above the primary for HT40U, four below for HT40D and
  * 0 when the channel is neither.
  */
 static __inline void
 set_extchan(struct ieee80211_channel *c)
 {
 
 	/*
 	 * IEEE Std 802.11-2012, page 1738, subclause 20.3.15.4:
 	 * "the secondary channel number shall be 'N + [1,-1] * 4'
 	 */
 	if (c->ic_flags & IEEE80211_CHAN_HT40U) {
 		c->ic_extieee = c->ic_ieee + 4;
 		return;
 	}
 	if (c->ic_flags & IEEE80211_CHAN_HT40D) {
 		c->ic_extieee = c->ic_ieee - 4;
 		return;
 	}
 	c->ic_extieee = 0;
 }
 
 /*
  * Populate the freq1/freq2 fields as appropriate for VHT channels.
  *
  * This for now uses a hard-coded list of 80MHz wide channels.
  *
  * For HT20/HT40, freq1 just is the centre frequency of the 40MHz
  * wide channel we've already decided upon.
  *
  * For VHT80 and VHT160, there are only a small number of fixed
  * 80/160MHz wide channels, so we just use those.
  *
  * This is all likely very very wrong - both the regulatory code
  * and this code needs to ensure that all four channels are
  * available and valid before the VHT80 (and eight for VHT160) channel
  * is created.
  */
 
 /*
  * One wide-channel frequency range in MHz.  Users in this file treat
  * the range as half-open (freq_start <= freq < freq_end), and tables
  * of ranges are terminated by an entry whose freq_start is 0.
  */
 struct vht_chan_range {
 	uint16_t freq_start;	/* first frequency in the range (inclusive) */
 	uint16_t freq_end;	/* end of the range (exclusive) */
 };
 
 /* Hard-coded 80MHz wide ranges; the all-zero entry ends the table. */
 struct vht_chan_range vht80_chan_ranges[] = {
 	{ .freq_start = 5170, .freq_end = 5250 },
 	{ .freq_start = 5250, .freq_end = 5330 },
 	{ .freq_start = 5490, .freq_end = 5570 },
 	{ .freq_start = 5570, .freq_end = 5650 },
 	{ .freq_start = 5650, .freq_end = 5730 },
 	{ .freq_start = 5735, .freq_end = 5815 },
 	{ .freq_start = 0, .freq_end = 0 }
 };
 
 /* Hard-coded 160MHz wide ranges; the all-zero entry ends the table. */
 struct vht_chan_range vht160_chan_ranges[] = {
 	{ .freq_start = 5170, .freq_end = 5330 },
 	{ .freq_start = 5490, .freq_end = 5650 },
 	{ .freq_start = 0, .freq_end = 0 }
 };
 
 static int
 set_vht_extchan(struct ieee80211_channel *c)
 {
 	int i;
 
 	if (! IEEE80211_IS_CHAN_VHT(c))
 		return (0);
 
 	if (IEEE80211_IS_CHAN_VHT80P80(c)) {
 		printf("%s: TODO VHT80+80 channel (ieee=%d, flags=0x%08x)\n",
 		    __func__, c->ic_ieee, c->ic_flags);
 	}
 
 	if (IEEE80211_IS_CHAN_VHT160(c)) {
 		for (i = 0; vht160_chan_ranges[i].freq_start != 0; i++) {
 			if (c->ic_freq >= vht160_chan_ranges[i].freq_start &&
 			    c->ic_freq < vht160_chan_ranges[i].freq_end) {
 				int midpoint;
 
 				midpoint = vht160_chan_ranges[i].freq_start + 80;
 				c->ic_vht_ch_freq1 =
 				    ieee80211_mhz2ieee(midpoint, c->ic_flags);
 				c->ic_vht_ch_freq2 = 0;
 #if 0
 				printf("%s: %d, freq=%d, midpoint=%d, freq1=%d, freq2=%d\n",
 				    __func__, c->ic_ieee, c->ic_freq, midpoint,
 				    c->ic_vht_ch_freq1, c->ic_vht_ch_freq2);
 #endif
 				return (1);
 			}
 		}
 		return (0);
 	}
 
 	if (IEEE80211_IS_CHAN_VHT80(c)) {
 		for (i = 0; vht80_chan_ranges[i].freq_start != 0; i++) {
 			if (c->ic_freq >= vht80_chan_ranges[i].freq_start &&
 			    c->ic_freq < vht80_chan_ranges[i].freq_end) {
 				int midpoint;
 
 				midpoint = vht80_chan_ranges[i].freq_start + 40;
 				c->ic_vht_ch_freq1 =
 				    ieee80211_mhz2ieee(midpoint, c->ic_flags);
 				c->ic_vht_ch_freq2 = 0;
 #if 0
 				printf("%s: %d, freq=%d, midpoint=%d, freq1=%d, freq2=%d\n",
 				    __func__, c->ic_ieee, c->ic_freq, midpoint,
 				    c->ic_vht_ch_freq1, c->ic_vht_ch_freq2);
 #endif
 				return (1);
 			}
 		}
 		return (0);
 	}
 
 	if (IEEE80211_IS_CHAN_VHT40(c)) {
 		if (IEEE80211_IS_CHAN_HT40U(c))
 			c->ic_vht_ch_freq1 = c->ic_ieee + 2;
 		else if (IEEE80211_IS_CHAN_HT40D(c))
 			c->ic_vht_ch_freq1 = c->ic_ieee - 2;
 		else
 			return (0);
 		return (1);
 	}
 
 	if (IEEE80211_IS_CHAN_VHT20(c)) {
 		c->ic_vht_ch_freq1 = c->ic_ieee;
 		return (1);
 	}
 
 	printf("%s: unknown VHT channel type (ieee=%d, flags=0x%08x)\n",
 	    __func__, c->ic_ieee, c->ic_flags);
 
 	return (0);
 }
 
 /*
  * Return whether the current channel could possibly be a part of
  * a VHT80/VHT160 channel.
  *
  * This doesn't check that the whole range is in the allowed list
  * according to regulatory.
  */
 static bool
 is_vht160_valid_freq(uint16_t freq)
 {
 	int i;
 
 	for (i = 0; vht160_chan_ranges[i].freq_start != 0; i++) {
 		if (freq >= vht160_chan_ranges[i].freq_start &&
 		    freq < vht160_chan_ranges[i].freq_end)
 			return (true);
 	}
 	return (false);
 }
 
 static int
 is_vht80_valid_freq(uint16_t freq)
 {
 	int i;
 	for (i = 0; vht80_chan_ranges[i].freq_start != 0; i++) {
 		if (freq >= vht80_chan_ranges[i].freq_start &&
 		    freq < vht80_chan_ranges[i].freq_end)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Append one channel to chans[] and bump *nchans.  A freq of 0 means
  * "derive the frequency from the channel number and flags".  The
  * extension/VHT centre fields are computed from the new entry.
  * Returns 0, or ENOBUFS when the list is already full.
  */
 static int
 addchan(struct ieee80211_channel chans[], int maxchans, int *nchans,
     uint8_t ieee, uint16_t freq, int8_t maxregpower, uint32_t flags)
 {
 	struct ieee80211_channel *c;
 
 	if (*nchans >= maxchans)
 		return (ENOBUFS);
 
 	c = &chans[*nchans];
 	(*nchans)++;
 
 	c->ic_ieee = ieee;
 	if (freq != 0)
 		c->ic_freq = freq;
 	else
 		c->ic_freq = ieee80211_ieee2mhz(ieee, flags);
 	c->ic_maxregpower = maxregpower;
 	c->ic_maxpower = 2 * maxregpower;
 	c->ic_flags = flags;
 	c->ic_vht_ch_freq1 = 0;
 	c->ic_vht_ch_freq2 = 0;
 	/* NB: both helpers read the fields assigned above */
 	set_extchan(c);
 	set_vht_extchan(c);
 
 	return (0);
 }
 
 /*
  * Append a copy of the last channel in chans[] with its flags
  * replaced by the given ones, then recompute the extension/VHT
  * centre fields.  Returns 0, or ENOBUFS when the list is full.
  */
 static int
 copychan_prev(struct ieee80211_channel chans[], int maxchans, int *nchans,
     uint32_t flags)
 {
 	struct ieee80211_channel *dst;
 
 	KASSERT(*nchans > 0, ("channel list is empty\n"));
 
 	if (*nchans >= maxchans)
 		return (ENOBUFS);
 
 	dst = &chans[*nchans];
 	*dst = chans[*nchans - 1];
 	(*nchans)++;
 
 	dst->ic_flags = flags;
 	dst->ic_vht_ch_freq1 = 0;
 	dst->ic_vht_ch_freq2 = 0;
 	set_extchan(dst);
 	set_vht_extchan(dst);
 
 	return (0);
 }
 
 /*
  * Build the zero-terminated list of channel flag combinations for
  * the 2GHz modes set in bands[], plus the HT40U/HT40D pair when
  * cbw_flags asks for HT40.
  *
  * XXX VHT-2GHz
  */
 static void
 getflags_2ghz(const uint8_t bands[], uint32_t flags[], int cbw_flags)
 {
 	uint32_t *out = flags;
 
 	if (isset(bands, IEEE80211_MODE_11B))
 		*out++ = IEEE80211_CHAN_B;
 	if (isset(bands, IEEE80211_MODE_11G))
 		*out++ = IEEE80211_CHAN_G;
 	if (isset(bands, IEEE80211_MODE_11NG))
 		*out++ = IEEE80211_CHAN_G | IEEE80211_CHAN_HT20;
 	if (cbw_flags & NET80211_CBW_FLAG_HT40) {
 		*out++ = IEEE80211_CHAN_G | IEEE80211_CHAN_HT40U;
 		*out++ = IEEE80211_CHAN_G | IEEE80211_CHAN_HT40D;
 	}
 	/* terminator */
 	*out = 0;
 }
 
 /*
  * Build the zero-terminated list of channel flag combinations for
  * the 5GHz modes set in bands[] and the widths requested in
  * cbw_flags.
  */
 static void
 getflags_5ghz(const uint8_t bands[], uint32_t flags[], int cbw_flags)
 {
 	const bool vht = isset(bands, IEEE80211_MODE_VHT_5GHZ) != 0;
 	const bool ht40 = (cbw_flags & NET80211_CBW_FLAG_HT40) != 0;
 	uint32_t *out = flags;
 
 	/*
 	 * The addchan_list() function seems to expect the flags array to
 	 * be in channel width order, so the VHT bits are interspersed
 	 * as appropriate to maintain said order.
 	 *
 	 * It also assumes HT40U is before HT40D.
 	 */
 
 	/* 20MHz */
 	if (isset(bands, IEEE80211_MODE_11A))
 		*out++ = IEEE80211_CHAN_A;
 	if (isset(bands, IEEE80211_MODE_11NA))
 		*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT20;
 	if (vht) {
 		*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT20 |
 		    IEEE80211_CHAN_VHT20;
 	}
 
 	/* 40MHz: HT40U, VHT40U, HT40D, VHT40D in that order */
 	if (ht40) {
 		*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40U;
 		if (vht)
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40U |
 			    IEEE80211_CHAN_VHT40U;
 		*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40D;
 		if (vht)
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40D |
 			    IEEE80211_CHAN_VHT40D;
 	}
 
 	/* The wider widths all require the VHT 5GHz mode. */
 	if (vht) {
 		/* 80MHz */
 		if (cbw_flags & NET80211_CBW_FLAG_VHT80) {
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40U |
 			    IEEE80211_CHAN_VHT80;
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40D |
 			    IEEE80211_CHAN_VHT80;
 		}
 
 		/* VHT160 */
 		if (cbw_flags & NET80211_CBW_FLAG_VHT160) {
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40U |
 			    IEEE80211_CHAN_VHT160;
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40D |
 			    IEEE80211_CHAN_VHT160;
 		}
 
 		/* VHT80+80 */
 		if (cbw_flags & NET80211_CBW_FLAG_VHT80P80) {
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40U |
 			    IEEE80211_CHAN_VHT80P80;
 			*out++ = IEEE80211_CHAN_A | IEEE80211_CHAN_HT40D |
 			    IEEE80211_CHAN_VHT80P80;
 		}
 	}
 
 	/* terminator */
 	*out = 0;
 }
 
 /*
  * Build the zero-terminated flag list for bands[].  Any 5GHz mode
  * selects the 5GHz list, otherwise the 2GHz list is built.  When
  * 5GHz and 2GHz modes are mixed the list is left empty
  * (flags[0] == 0).
  */
 static void
 getflags(const uint8_t bands[], uint32_t flags[], int cbw_flags)
 {
 	const bool want_5ghz =
 	    isset(bands, IEEE80211_MODE_11A) ||
 	    isset(bands, IEEE80211_MODE_11NA) ||
 	    isset(bands, IEEE80211_MODE_VHT_5GHZ);
 	const bool want_2ghz =
 	    isset(bands, IEEE80211_MODE_11B) ||
 	    isset(bands, IEEE80211_MODE_11G) ||
 	    isset(bands, IEEE80211_MODE_11NG) ||
 	    isset(bands, IEEE80211_MODE_VHT_2GHZ);
 
 	flags[0] = 0;
 	if (!want_5ghz) {
 		getflags_2ghz(bands, flags, cbw_flags);
 		return;
 	}
 	/* mixed bands are not supported; leave the list empty */
 	if (want_2ghz)
 		return;
 
 	getflags_5ghz(bands, flags, cbw_flags);
 }
 
 /*
  * Add one 20 MHz channel into specified channel list.
  * You MUST NOT mix bands when calling this.  It will not add 5ghz
  * channels if you have any B/G/N band bit set.
  * The _cbw() variant does also support HT40/VHT80/160/80+80.
  *
  * The first flag combination produced for bands[]/cbw_flags is added
  * with addchan(); every further combination is added as a copy of
  * the previous entry with different flags.  chan_flags is OR'ed into
  * each entry.
  *
  * Returns 0 on success or the error from addchan()/copychan_prev()
  * (ENOBUFS when chans[] is full).
  */
 int
 ieee80211_add_channel_cbw(struct ieee80211_channel chans[], int maxchans,
     int *nchans, uint8_t ieee, uint16_t freq, int8_t maxregpower,
     uint32_t chan_flags, const uint8_t bands[], int cbw_flags)
 {
 	/*
 	 * getflags() may hand off to getflags_5ghz(), which emits one
 	 * entry per mode/width combination rather than one per phy mode,
 	 * so size the buffer the same way
 	 * ieee80211_add_channel_list_5ghz() does.
 	 */
 	uint32_t flags[2 * IEEE80211_MODE_MAX];
 	int i, error;
 
 	getflags(bands, flags, cbw_flags);
 	KASSERT(flags[0] != 0, ("%s: no correct mode provided\n", __func__));
 
 	error = addchan(chans, maxchans, nchans, ieee, freq, maxregpower,
 	    flags[0] | chan_flags);
 	/* stop at the list terminator or on the first failure */
 	for (i = 1; flags[i] != 0 && error == 0; i++) {
 		error = copychan_prev(chans, maxchans, nchans,
 		    flags[i] | chan_flags);
 	}
 
 	return (error);
 }
 
 /*
  * Add one 20 MHz channel into the specified channel list.  This is
  * ieee80211_add_channel_cbw() with cbw_flags of 0, i.e. without any
  * of the wider channel widths requested.
  */
 int
 ieee80211_add_channel(struct ieee80211_channel chans[], int maxchans,
     int *nchans, uint8_t ieee, uint16_t freq, int8_t maxregpower,
     uint32_t chan_flags, const uint8_t bands[])
 {
 
 	return (ieee80211_add_channel_cbw(chans, maxchans, nchans, ieee, freq,
 	    maxregpower, chan_flags, bands, 0));
 }
 
 /*
  * Linear search of chans[0..nchans) for an entry with the given
  * frequency whose flags, restricted to IEEE80211_CHAN_ALLTURBO,
  * equal the requested ones.  Returns the entry or NULL.
  */
 static struct ieee80211_channel *
 findchannel(struct ieee80211_channel chans[], int nchans, uint16_t freq,
     uint32_t flags)
 {
 	const uint32_t want = flags & IEEE80211_CHAN_ALLTURBO;
 	int i;
 
 	/* brute force search */
 	for (i = 0; i < nchans; i++) {
 		if (chans[i].ic_freq != freq)
 			continue;
 		if ((chans[i].ic_flags & IEEE80211_CHAN_ALLTURBO) != want)
 			continue;
 		return &chans[i];
 	}
 	return NULL;
 }
 
 /*
  * Add 40 MHz channel pair into specified channel list.
  */
 /* XXX VHT */
 int
 ieee80211_add_channel_ht40(struct ieee80211_channel chans[], int maxchans,
     int *nchans, uint8_t ieee, int8_t maxregpower, uint32_t flags)
 {
 	struct ieee80211_channel *cent, *extc;
 	uint16_t freq;
 	int error;
 
 	freq = ieee80211_ieee2mhz(ieee, flags);
 
 	/*
 	 * Each entry defines an HT40 channel pair; find the
 	 * center channel, then the extension channel above.
 	 */
 	flags |= IEEE80211_CHAN_HT20;
 	cent = findchannel(chans, *nchans, freq, flags);
 	if (cent == NULL)
 		return (EINVAL);
 
 	extc = findchannel(chans, *nchans, freq + 20, flags);
 	if (extc == NULL)
 		return (ENOENT);
 
 	flags &= ~IEEE80211_CHAN_HT;
 	error = addchan(chans, maxchans, nchans, cent->ic_ieee, cent->ic_freq,
 	    maxregpower, flags | IEEE80211_CHAN_HT40U);
 	if (error != 0)
 		return (error);
 
 	error = addchan(chans, maxchans, nchans, extc->ic_ieee, extc->ic_freq,
 	    maxregpower, flags | IEEE80211_CHAN_HT40D);
 
 	return (error);
 }
 
 /*
  * Fetch the center frequency for the primary channel.
  *
  * This is simply the channel's ic_freq, returned unchanged.
  */
 uint32_t
 ieee80211_get_channel_center_freq(const struct ieee80211_channel *c)
 {
 
 	return (c->ic_freq);
 }
 
 /*
  * Fetch the center frequency for the primary BAND channel.
  *
  * For 5, 10, 20MHz channels it'll be the normally configured channel
  * frequency.
  *
  * For 40MHz, 80MHz, 160MHz channels it will be the centre of the
  * wide channel, not the centre of the primary channel (that's ic_freq).
  *
  * For 80+80MHz channels this will be the centre of the primary
  * 80MHz channel; the secondary 80MHz channel will be center_freq2().
  */
 uint32_t
 ieee80211_get_channel_center_freq1(const struct ieee80211_channel *c)
 {
 	uint32_t center;
 
 	/*
 	 * VHT - use the pre-calculated centre frequency
 	 * of the given channel.
 	 */
 	if (IEEE80211_IS_CHAN_VHT(c))
 		return (ieee80211_ieee2mhz(c->ic_vht_ch_freq1, c->ic_flags));
 
 	/* HT40: the centre sits 10MHz towards the extension channel */
 	center = c->ic_freq;
 	if (IEEE80211_IS_CHAN_HT40U(c))
 		center += 10;
 	else if (IEEE80211_IS_CHAN_HT40D(c))
 		center -= 10;
 
 	return (center);
 }
 
 /*
  * Fetch the second centre frequency of a VHT channel, or 0 when the
  * channel is not VHT or has no ic_vht_ch_freq2 set.
  *
  * For now, no 80+80 support; it will likely always return 0.
  */
 uint32_t
 ieee80211_get_channel_center_freq2(const struct ieee80211_channel *c)
 {
 
 	if (!IEEE80211_IS_CHAN_VHT(c))
 		return (0);
 	if (c->ic_vht_ch_freq2 == 0)
 		return (0);
 
 	return (ieee80211_ieee2mhz(c->ic_vht_ch_freq2, c->ic_flags));
 }
 
 /*
  * Add the channels listed in ieee[] to the specified channel list,
  * once for every entry of the zero-terminated flags[] array that is
  * usable on that channel.  The ieee[] array must be sorted; channels
  * are appended in that order.
  *
  * For each channel the frequency is derived from flags[0].  The
  * first flags entry is added with addchan() (maxregpower 0); each
  * further entry is added as a copy of the previously appended
  * channel with different flags.
  *
  * Returns 0 on success or the first error from addchan() or
  * copychan_prev().
  *
  * NOTE(review): if the j == 0 entry is skipped by one of the
  * "continue" tests below, later entries for the same channel copy
  * whatever channel was appended last -- confirm callers always pass
  * a flags[0] that cannot be skipped.
  */
 static int
 add_chanlist(struct ieee80211_channel chans[], int maxchans, int *nchans,
     const uint8_t ieee[], int nieee, uint32_t flags[])
 {
 	uint16_t freq;
 	int i, j, error;
 	int is_vht;
 
 	for (i = 0; i < nieee; i++) {
 		freq = ieee80211_ieee2mhz(ieee[i], flags[0]);
 		for (j = 0; flags[j] != 0; j++) {
 			/*
 			 * Notes:
 			 * + HT40 and VHT40 channels occur together, so
 			 *   we need to be careful that we actually allow that.
 			 * + VHT80, VHT160 will coexist with HT40/VHT40, so
 			 *   make sure it's not skipped because of the overlap
 			 *   check used for (V)HT40.
 			 */
 			is_vht = !! (flags[j] & IEEE80211_CHAN_VHT);
 
 			/* XXX TODO FIXME VHT80P80. */
 
 			/* Test for VHT160 analogue to the VHT80 below. */
 			if (is_vht && flags[j] & IEEE80211_CHAN_VHT160)
 				if (! is_vht160_valid_freq(freq))
 					continue;
 
 			/*
 			 * Test for VHT80.
 			 * XXX This is all very broken right now.
 			 * What we /should/ do is:
 			 *
 			 * + check that the frequency is in the list of
 			 *   allowed VHT80 ranges; and
 			 * + the other 3 channels in the list are actually
 			 *   also available.
 			 */
 			if (is_vht && flags[j] & IEEE80211_CHAN_VHT80)
 				if (! is_vht80_valid_freq(freq))
 					continue;
 
 			/*
 			 * Test for (V)HT40.
 			 *
 			 * This is also a fall through from VHT80; as we only
 			 * allow a VHT80 channel if the VHT40 combination is
 			 * also valid.  If the VHT40 form is not valid then
 			 * we certainly can't do VHT80..
 			 */
 			if (flags[j] & IEEE80211_CHAN_HT40D)
 				/*
 				 * Can't have a "lower" channel if we are the
 				 * first channel.
 				 *
 				 * Can't have a "lower" channel if it's below/
 				 * within 20MHz of the first channel.
 				 *
 				 * Can't have a "lower" channel if the channel
 				 * below it is not 20MHz away.
 				 */
 				if (i == 0 || ieee[i] < ieee[0] + 4 ||
 				    freq - 20 !=
 				    ieee80211_ieee2mhz(ieee[i] - 4, flags[j]))
 					continue;
 			if (flags[j] & IEEE80211_CHAN_HT40U)
 				/*
 				 * Can't have an "upper" channel if we are
 				 * the last channel.
 				 *
 				 * Can't have an "upper" channel be above the
 				 * last channel in the list.
 				 *
 				 * Can't have an "upper" channel if the next
 				 * channel according to the math isn't 20MHz
 				 * away.  (Likely for channel 13/14.)
 				 */
 				if (i == nieee - 1 ||
 				    ieee[i] + 4 > ieee[nieee - 1] ||
 				    freq + 20 !=
 				    ieee80211_ieee2mhz(ieee[i] + 4, flags[j]))
 					continue;
 
 			/*
 			 * The first flags entry creates the channel; the
 			 * rest clone the entry appended just before.
 			 */
 			if (j == 0) {
 				error = addchan(chans, maxchans, nchans,
 				    ieee[i], freq, 0, flags[j]);
 			} else {
 				error = copychan_prev(chans, maxchans, nchans,
 				    flags[j]);
 			}
 			if (error != 0)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 int
 ieee80211_add_channel_list_2ghz(struct ieee80211_channel chans[], int maxchans,
     int *nchans, const uint8_t ieee[], int nieee, const uint8_t bands[],
     int cbw_flags)
 {
 	uint32_t flags[IEEE80211_MODE_MAX];
 
 	/* XXX no VHT for now */
 	getflags_2ghz(bands, flags, cbw_flags);
 	KASSERT(flags[0] != 0, ("%s: no correct mode provided\n", __func__));
 
 	return (add_chanlist(chans, maxchans, nchans, ieee, nieee, flags));
 }
 
 /*
  * Add 2GHz channels 1 through 14 to chans[] via
  * ieee80211_add_channel_list_2ghz().
  */
 int
 ieee80211_add_channels_default_2ghz(struct ieee80211_channel chans[],
     int maxchans, int *nchans, const uint8_t bands[], int cbw_flags)
 {
 	const uint8_t default_chan_list[] = {
 		1, 2, 3, 4, 5, 6, 7,
 		8, 9, 10, 11, 12, 13, 14
 	};
 	const int nchan_default = nitems(default_chan_list);
 
 	return (ieee80211_add_channel_list_2ghz(chans, maxchans, nchans,
 	    default_chan_list, nchan_default, bands, cbw_flags));
 }
 
 int
 ieee80211_add_channel_list_5ghz(struct ieee80211_channel chans[], int maxchans,
     int *nchans, const uint8_t ieee[], int nieee, const uint8_t bands[],
     int cbw_flags)
 {
 	/*
 	 * XXX-BZ with HT and VHT there is no 1:1 mapping anymore.  Review all
 	 * uses of IEEE80211_MODE_MAX and add a new #define name for array size.
 	 */
 	uint32_t flags[2 * IEEE80211_MODE_MAX];
 
 	getflags_5ghz(bands, flags, cbw_flags);
 	KASSERT(flags[0] != 0, ("%s: no correct mode provided\n", __func__));
 
 	return (add_chanlist(chans, maxchans, nchans, ieee, nieee, flags));
 }
 
 /*
  * Find the channel with the given frequency and turbo flags.  Only the
  * IEEE80211_CHAN_ALLTURBO bits of flags take part in the comparison.
  * ic_prevchan is tried first so that flipping between two channels
  * (as happens with dynamic turbo) avoids the table search; on a miss
  * the lookup falls back to findchannel() over the full channel table.
  */
 struct ieee80211_channel *
 ieee80211_find_channel(struct ieee80211com *ic, int freq, int flags)
 {
 	struct ieee80211_channel *prev = ic->ic_prevchan;
 	const int want = flags & IEEE80211_CHAN_ALLTURBO;
 
 	if (prev == NULL || prev->ic_freq != freq ||
 	    (prev->ic_flags & IEEE80211_CHAN_ALLTURBO) != want) {
 		/* previous channel does not match: brute force search */
 		return (findchannel(ic->ic_channels, ic->ic_nchans,
 		    freq, want));
 	}
 	return (prev);
 }
 
 /*
  * Find the channel with the given IEEE channel number and turbo flags.
  * Only the IEEE80211_CHAN_ALLTURBO bits of flags take part in the
  * comparison.  ic_prevchan is tried first so that flipping between two
  * channels (as happens with dynamic turbo) avoids the table walk.
  * Returns NULL when no channel matches.
  */
 struct ieee80211_channel *
 ieee80211_find_channel_byieee(struct ieee80211com *ic, int ieee, int flags)
 {
 	struct ieee80211_channel *chan;
 	int idx;
 
 	flags &= IEEE80211_CHAN_ALLTURBO;
 
 	chan = ic->ic_prevchan;
 	if (chan != NULL && chan->ic_ieee == ieee &&
 	    (chan->ic_flags & IEEE80211_CHAN_ALLTURBO) == flags)
 		return (chan);
 
 	/* previous channel does not match: walk the whole table */
 	for (idx = 0; idx < ic->ic_nchans; idx++) {
 		chan = &ic->ic_channels[idx];
 		if (chan->ic_ieee != ieee)
 			continue;
 		if ((chan->ic_flags & IEEE80211_CHAN_ALLTURBO) != flags)
 			continue;
 		return (chan);
 	}
 	return (NULL);
 }
 
 /*
  * Lookup a channel suitable for the given rx status.
  *
  * This finds the channel for a received frame (eg beacon, probe
  * response) purely from the PHY information reported with it.
  *
  * The lookup currently works from R_FREQ / R_IEEE.  That is
  * sufficient for 11bg and 11a (and thus 11ng/11na), but not for
  * GSM, PSB channels and the like, nor for the legacy-turbog and
  * legacy-turbo modes which some offload NICs support in weird ways.
  *
  * Takes the vap and rxstatus; returns the channel, or NULL if rxs is
  * NULL, if any of the FREQ/IEEE/BAND validity flags is missing, or if
  * no channel matches.
  *
  * XXX TODO: Add support for the other channel types when the need arises.
  */
 struct ieee80211_channel *
 ieee80211_lookup_channel_rxstatus(struct ieee80211vap *vap,
     const struct ieee80211_rx_stats *rxs)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_channel *chan;
 	uint32_t flags;
 
 	/*
 	 * Strictly speaking only freq is used for the lookup right
 	 * now, but ieee and band are required as well; later on the
 	 * ieee may be stored for verification.
 	 */
 	if (rxs == NULL ||
 	    (rxs->r_flags & IEEE80211_R_FREQ) == 0 ||
 	    (rxs->r_flags & IEEE80211_R_IEEE) == 0 ||
 	    (rxs->r_flags & IEEE80211_R_BAND) == 0)
 		return (NULL);
 
 	/*
 	 * The rx status carries a valid ieee/freq, so resolve the
 	 * matching channel before the frame is passed up to the scan
 	 * infrastructure.  Offload NICs will pass up beacons from all
 	 * channels during background scans.
 	 */
 
 	/* Pick the band; fall back to the frequency if it is unknown. */
 	if (rxs->c_band == IEEE80211_CHAN_2GHZ)
 		flags = IEEE80211_CHAN_G;
 	else if (rxs->c_band == IEEE80211_CHAN_5GHZ)
 		flags = IEEE80211_CHAN_A;
 	else if (rxs->c_freq < 3000)
 		flags = IEEE80211_CHAN_G;
 	else
 		flags = IEEE80211_CHAN_A;
 
 	chan = ieee80211_find_channel(ic, rxs->c_freq, flags);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_INPUT,
 	    "%s: freq=%d, ieee=%d, flags=0x%08x; c=%p\n",
 	    __func__, (int) rxs->c_freq, (int) rxs->c_ieee, flags, chan);
 
 	return (chan);
 }
 
 static void
 addmedia(struct ifmedia *media, int caps, int addsta, int mode, int mword)
 {
 #define	ADD(_ic, _s, _o) \
 	ifmedia_add(media, \
 		IFM_MAKEWORD(IFM_IEEE80211, (_s), (_o), 0), 0, NULL)
 	static const u_int mopts[IEEE80211_MODE_MAX] = {
 	    [IEEE80211_MODE_AUTO]	= IFM_AUTO,
 	    [IEEE80211_MODE_11A]	= IFM_IEEE80211_11A,
 	    [IEEE80211_MODE_11B]	= IFM_IEEE80211_11B,
 	    [IEEE80211_MODE_11G]	= IFM_IEEE80211_11G,
 	    [IEEE80211_MODE_FH]		= IFM_IEEE80211_FH,
 	    [IEEE80211_MODE_TURBO_A]	= IFM_IEEE80211_11A|IFM_IEEE80211_TURBO,
 	    [IEEE80211_MODE_TURBO_G]	= IFM_IEEE80211_11G|IFM_IEEE80211_TURBO,
 	    [IEEE80211_MODE_STURBO_A]	= IFM_IEEE80211_11A|IFM_IEEE80211_TURBO,
 	    [IEEE80211_MODE_HALF]	= IFM_IEEE80211_11A,	/* XXX */
 	    [IEEE80211_MODE_QUARTER]	= IFM_IEEE80211_11A,	/* XXX */
 	    [IEEE80211_MODE_11NA]	= IFM_IEEE80211_11NA,
 	    [IEEE80211_MODE_11NG]	= IFM_IEEE80211_11NG,
 	    [IEEE80211_MODE_VHT_2GHZ]	= IFM_IEEE80211_VHT2G,
 	    [IEEE80211_MODE_VHT_5GHZ]	= IFM_IEEE80211_VHT5G,
 	};
 	u_int mopt;
 
 	mopt = mopts[mode];
 	if (addsta)
 		ADD(ic, mword, mopt);	/* STA mode has no cap */
 	if (caps & IEEE80211_C_IBSS)
 		ADD(media, mword, mopt | IFM_IEEE80211_ADHOC);
 	if (caps & IEEE80211_C_HOSTAP)
 		ADD(media, mword, mopt | IFM_IEEE80211_HOSTAP);
 	if (caps & IEEE80211_C_AHDEMO)
 		ADD(media, mword, mopt | IFM_IEEE80211_ADHOC | IFM_FLAG0);
 	if (caps & IEEE80211_C_MONITOR)
 		ADD(media, mword, mopt | IFM_IEEE80211_MONITOR);
 	if (caps & IEEE80211_C_WDS)
 		ADD(media, mword, mopt | IFM_IEEE80211_WDS);
 	if (caps & IEEE80211_C_MBSS)
 		ADD(media, mword, mopt | IFM_IEEE80211_MBSS);
 #undef ADD
 }
 
 /*
  * Setup the media data structures according to the channel and
  * rate tables.
  *
  * Initializes media with the given change/status callbacks, adds the
  * media words for every supported phy mode and rate through
  * addmedia(), and returns the largest rate seen while doing so.
  * Legacy rates are halved (rate value / 2) before being compared;
  * HT rates are taken from ieee80211_htrates[] as stored there.
  */
 static int
 ieee80211_media_setup(struct ieee80211com *ic,
 	struct ifmedia *media, int caps, int addsta,
 	ifm_change_cb_t media_change, ifm_stat_cb_t media_stat)
 {
 	int i, j, rate, maxrate, mword, r;
 	enum ieee80211_phymode mode;
 	const struct ieee80211_rateset *rs;
 	struct ieee80211_rateset allrates;
 
 	/*
 	 * Fill in media characteristics.
 	 */
 	ifmedia_init(media, 0, media_change, media_stat);
 	maxrate = 0;
 	/*
 	 * Add media for legacy operating modes.
 	 */
 	memset(&allrates, 0, sizeof(allrates));
 	for (mode = IEEE80211_MODE_AUTO; mode < IEEE80211_MODE_11NA; mode++) {
 		if (isclr(ic->ic_modecaps, mode))
 			continue;
 		addmedia(media, caps, addsta, mode, IFM_AUTO);
 		if (mode == IEEE80211_MODE_AUTO)
 			continue;
 		rs = &ic->ic_sup_rates[mode];
 		for (i = 0; i < rs->rs_nrates; i++) {
 			rate = rs->rs_rates[i];
 			mword = ieee80211_rate2media(ic, rate, mode);
 			if (mword == 0)
 				continue;
 			addmedia(media, caps, addsta, mode, mword);
 			/*
 			 * Add legacy rate to the collection of all rates.
 			 * The collection is one fixed-size rate set shared
 			 * by every legacy mode, so only append while there
 			 * is room; a rate that does not fit is left out of
 			 * the mode-independent (AUTO) media list instead of
 			 * being written past the end of rs_rates[].
 			 */
 			r = rate & IEEE80211_RATE_VAL;
 			for (j = 0; j < allrates.rs_nrates; j++)
 				if (allrates.rs_rates[j] == r)
 					break;
 			if (j == allrates.rs_nrates &&
 			    j < (int)nitems(allrates.rs_rates)) {
 				/* unique, add to the set */
 				allrates.rs_rates[j] = r;
 				allrates.rs_nrates++;
 			}
 			rate = (rate & IEEE80211_RATE_VAL) / 2;
 			if (rate > maxrate)
 				maxrate = rate;
 		}
 	}
 	for (i = 0; i < allrates.rs_nrates; i++) {
 		mword = ieee80211_rate2media(ic, allrates.rs_rates[i],
 				IEEE80211_MODE_AUTO);
 		if (mword == 0)
 			continue;
 		/* NB: remove media options from mword */
 		addmedia(media, caps, addsta,
 		    IEEE80211_MODE_AUTO, IFM_SUBTYPE(mword));
 	}
 	/*
 	 * Add HT/11n media.  Note that we do not have enough
 	 * bits in the media subtype to express the MCS so we
 	 * use a "placeholder" media subtype and any fixed MCS
 	 * must be specified with a different mechanism.
 	 *
 	 * NB: mode continues from where the legacy loop stopped,
 	 * i.e. at IEEE80211_MODE_11NA.
 	 */
 	for (; mode <= IEEE80211_MODE_11NG; mode++) {
 		if (isclr(ic->ic_modecaps, mode))
 			continue;
 		addmedia(media, caps, addsta, mode, IFM_AUTO);
 		addmedia(media, caps, addsta, mode, IFM_IEEE80211_MCS);
 	}
 	if (isset(ic->ic_modecaps, IEEE80211_MODE_11NA) ||
 	    isset(ic->ic_modecaps, IEEE80211_MODE_11NG)) {
 		addmedia(media, caps, addsta,
 		    IEEE80211_MODE_AUTO, IFM_IEEE80211_MCS);
 		/*
 		 * Index of the highest MCS for the number of tx streams.
 		 * NOTE(review): assumes ic_txstream >= 1 here; with 0 the
 		 * index would be -1 -- confirm drivers always set it.
 		 */
 		i = ic->ic_txstream * 8 - 1;
 		if ((ic->ic_htcaps & IEEE80211_HTCAP_CHWIDTH40) &&
 		    (ic->ic_htcaps & IEEE80211_HTCAP_SHORTGI40))
 			rate = ieee80211_htrates[i].ht40_rate_400ns;
 		else if ((ic->ic_htcaps & IEEE80211_HTCAP_CHWIDTH40))
 			rate = ieee80211_htrates[i].ht40_rate_800ns;
 		else if ((ic->ic_htcaps & IEEE80211_HTCAP_SHORTGI20))
 			rate = ieee80211_htrates[i].ht20_rate_400ns;
 		else
 			rate = ieee80211_htrates[i].ht20_rate_800ns;
 		if (rate > maxrate)
 			maxrate = rate;
 	}
 
 	/*
 	 * Add VHT media.
 	 * XXX-BZ skip "VHT_2GHZ" for now.
 	 */
 	for (mode = IEEE80211_MODE_VHT_5GHZ; mode <= IEEE80211_MODE_VHT_5GHZ;
 	    mode++) {
 		if (isclr(ic->ic_modecaps, mode))
 			continue;
 		addmedia(media, caps, addsta, mode, IFM_AUTO);
 		addmedia(media, caps, addsta, mode, IFM_IEEE80211_VHT);
 	}
 	if (isset(ic->ic_modecaps, IEEE80211_MODE_VHT_5GHZ)) {
 		addmedia(media, caps, addsta,
 		    IEEE80211_MODE_AUTO, IFM_IEEE80211_VHT);
 
 		/* XXX TODO: VHT maxrate */
 	}
 
 	return maxrate;
 }
 
 /* XXX inline or eliminate? */
 const struct ieee80211_rateset *
 ieee80211_get_suprates(struct ieee80211com *ic, const struct ieee80211_channel *c)
 {
 	/* XXX does this work for 11ng basic rates? */
 	return &ic->ic_sup_rates[ieee80211_chan2mode(c)];
 }
 
 /*
  * Return the supported HT rate set of ic.  The channel argument is
  * accepted but not consulted.
  * XXX inline or eliminate?
  */
 const struct ieee80211_htrateset *
 ieee80211_get_suphtrates(struct ieee80211com *ic,
     const struct ieee80211_channel *c)
 {
 	const struct ieee80211_htrateset *htrs = &ic->ic_sup_htrates;
 
 	return (htrs);
 }
 
 /*
  * Print the supported rates of every legacy phy mode present in
  * ic_modecaps, one line per mode, then let the HT and VHT code
  * announce their own capabilities.
  */
 void
 ieee80211_announce(struct ieee80211com *ic)
 {
 	const struct ieee80211_rateset *rs;
 	enum ieee80211_phymode mode;
 	int idx, mword, rate;
 
 	/* NB: skip AUTO since it has no rates */
 	for (mode = IEEE80211_MODE_AUTO+1; mode < IEEE80211_MODE_11NA; mode++) {
 		if (isset(ic->ic_modecaps, mode)) {
 			ic_printf(ic, "%s rates: ",
 			    ieee80211_phymode_name[mode]);
 			rs = &ic->ic_sup_rates[mode];
 			for (idx = 0; idx < rs->rs_nrates; idx++) {
 				mword = ieee80211_rate2media(ic,
 				    rs->rs_rates[idx], mode);
 				if (mword != 0) {
 					/* rate is in 0.5Mbps steps */
 					rate = ieee80211_media2rate(mword);
 					printf("%s%d%sMbps",
 					    (idx == 0 ? "" : " "),
 					    rate / 2,
 					    ((rate & 0x1) == 0 ? "" : ".5"));
 				}
 			}
 			printf("\n");
 		}
 	}
 	ieee80211_ht_announce(ic);
 	ieee80211_vht_announce(ic);
 }
 
 /*
  * Print a table of all channels known to ic, one line per channel:
  * IEEE number, frequency, type letter, channel width with the HT40
  * extension sign, regulatory power and min/max power.  The min/max
  * power values are printed halved with a ".5" for odd values.
  */
 void
 ieee80211_announce_channels(struct ieee80211com *ic)
 {
 	const struct ieee80211_channel *chan;
 	char type, ext;
 	int n, width;
 
 	printf("Chan  Freq  CW  RegPwr  MinPwr  MaxPwr\n");
 	for (n = 0; n < ic->ic_nchans; n++) {
 		chan = &ic->ic_channels[n];
 
 		/* type letter; the first matching test wins */
 		type = 'f';
 		if (IEEE80211_IS_CHAN_ST(chan))
 			type = 'S';
 		else if (IEEE80211_IS_CHAN_108A(chan))
 			type = 'T';
 		else if (IEEE80211_IS_CHAN_108G(chan))
 			type = 'G';
 		else if (IEEE80211_IS_CHAN_HT(chan))
 			type = 'n';
 		else if (IEEE80211_IS_CHAN_A(chan))
 			type = 'a';
 		else if (IEEE80211_IS_CHAN_ANYG(chan))
 			type = 'g';
 		else if (IEEE80211_IS_CHAN_B(chan))
 			type = 'b';
 
 		/* channel width, 20 unless flagged otherwise */
 		width = 20;
 		if (IEEE80211_IS_CHAN_HT40(chan) ||
 		    IEEE80211_IS_CHAN_TURBO(chan))
 			width = 40;
 		else if (IEEE80211_IS_CHAN_HALF(chan))
 			width = 10;
 		else if (IEEE80211_IS_CHAN_QUARTER(chan))
 			width = 5;
 
 		/* HT40 extension channel above (+) or below (-) */
 		ext = ' ';
 		if (IEEE80211_IS_CHAN_HT40U(chan))
 			ext = '+';
 		else if (IEEE80211_IS_CHAN_HT40D(chan))
 			ext = '-';
 
 		printf("%4d  %4d%c %2d%c %6d  %4d.%d  %4d.%d\n",
 		    chan->ic_ieee, chan->ic_freq, type,
 		    width, ext,
 		    chan->ic_maxregpower,
 		    chan->ic_minpower / 2, chan->ic_minpower & 1 ? 5 : 0,
 		    chan->ic_maxpower / 2, chan->ic_maxpower & 1 ? 5 : 0);
 	}
 }
 
 /*
  * Translate the mode bits of a media entry into a phy mode and store
  * it through mode.  Returns 1 on success and 0 if the media word
  * cannot be mapped.  When the turbo option is combined with a mode
  * other than 11a/11g the function returns 0, but *mode has already
  * been set to the base mode by then.
  */
 static int
 media2mode(const struct ifmedia_entry *ime, uint32_t flags, uint16_t *mode)
 {
 	uint16_t base;
 
 	switch (IFM_MODE(ime->ifm_media)) {
 	case IFM_AUTO:
 		base = IEEE80211_MODE_AUTO;
 		break;
 	case IFM_IEEE80211_11A:
 		base = IEEE80211_MODE_11A;
 		break;
 	case IFM_IEEE80211_11B:
 		base = IEEE80211_MODE_11B;
 		break;
 	case IFM_IEEE80211_11G:
 		base = IEEE80211_MODE_11G;
 		break;
 	case IFM_IEEE80211_FH:
 		base = IEEE80211_MODE_FH;
 		break;
 	case IFM_IEEE80211_11NA:
 		base = IEEE80211_MODE_11NA;
 		break;
 	case IFM_IEEE80211_11NG:
 		base = IEEE80211_MODE_11NG;
 		break;
 	case IFM_IEEE80211_VHT2G:
 		base = IEEE80211_MODE_VHT_2GHZ;
 		break;
 	case IFM_IEEE80211_VHT5G:
 		base = IEEE80211_MODE_VHT_5GHZ;
 		break;
 	default:
 		return (0);
 	}
 	/* NB: stored before the turbo check, which may still fail */
 	*mode = base;
 
 	/* XXX HT40 +/- */
 	if ((ime->ifm_media & IFM_IEEE80211_TURBO) == 0)
 		return (1);
 
 	/*
 	 * Turbo mode is an ``option''.
 	 * XXX does not apply to AUTO
 	 */
 	switch (base) {
 	case IEEE80211_MODE_11A:
 		*mode = (flags & IEEE80211_F_TURBOP) ?
 		    IEEE80211_MODE_TURBO_A : IEEE80211_MODE_STURBO_A;
 		return (1);
 	case IEEE80211_MODE_11G:
 		*mode = IEEE80211_MODE_TURBO_G;
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 /*
  * Media change handler for the vap interface: convert the currently
  * selected media entry into a phy mode and record it as the vap's
  * desired mode.  Returns EINVAL if the media word cannot be mapped,
  * 0 otherwise.
  */
 int
 ieee80211_media_change(struct ifnet *ifp)
 {
 	struct ieee80211vap *vap = ifp->if_softc;
 	uint16_t newmode;
 
 	if (media2mode(vap->iv_media.ifm_cur, vap->iv_flags, &newmode) == 0)
 		return (EINVAL);
 	if (vap->iv_des_mode == newmode)
 		return (0);
 
 	vap->iv_des_mode = newmode;
 	/* XXX kick state machine if up+running */
 	return (0);
 }
 
 /*
  * Build the media status word for an operating mode and channel:
  * IFM_IEEE80211 combined with the option bits of the operating mode,
  * the mode bits of the channel type (first matching test wins) and
  * the turbo option if the channel is a turbo channel.
  */
 static int
 media_status(enum ieee80211_opmode opmode, const struct ieee80211_channel *chan)
 {
 	int opbits = 0, phybits = 0, status;
 
 	switch (opmode) {
 	case IEEE80211_M_IBSS:
 		opbits = IFM_IEEE80211_ADHOC;
 		break;
 	case IEEE80211_M_AHDEMO:
 		opbits = IFM_IEEE80211_ADHOC | IFM_FLAG0;
 		break;
 	case IEEE80211_M_HOSTAP:
 		opbits = IFM_IEEE80211_HOSTAP;
 		break;
 	case IEEE80211_M_MONITOR:
 		opbits = IFM_IEEE80211_MONITOR;
 		break;
 	case IEEE80211_M_WDS:
 		opbits = IFM_IEEE80211_WDS;
 		break;
 	case IEEE80211_M_MBSS:
 		opbits = IFM_IEEE80211_MBSS;
 		break;
 	case IEEE80211_M_STA:
 		/* station mode has no option bits */
 		break;
 	}
 
 	if (IEEE80211_IS_CHAN_VHT_5GHZ(chan))
 		phybits = IFM_IEEE80211_VHT5G;
 	else if (IEEE80211_IS_CHAN_VHT_2GHZ(chan))
 		phybits = IFM_IEEE80211_VHT2G;
 	else if (IEEE80211_IS_CHAN_HTA(chan))
 		phybits = IFM_IEEE80211_11NA;
 	else if (IEEE80211_IS_CHAN_HTG(chan))
 		phybits = IFM_IEEE80211_11NG;
 	else if (IEEE80211_IS_CHAN_A(chan))
 		phybits = IFM_IEEE80211_11A;
 	else if (IEEE80211_IS_CHAN_B(chan))
 		phybits = IFM_IEEE80211_11B;
 	else if (IEEE80211_IS_CHAN_ANYG(chan))
 		phybits = IFM_IEEE80211_11G;
 	else if (IEEE80211_IS_CHAN_FHSS(chan))
 		phybits = IFM_IEEE80211_FH;
 	/* XXX else complain? */
 
 	status = IFM_IEEE80211 | opbits | phybits;
 	if (IEEE80211_IS_CHAN_TURBO(chan))
 		status |= IFM_IEEE80211_TURBO;
 #if 0
 	if (IEEE80211_IS_CHAN_HT20(chan))
 		status |= IFM_IEEE80211_HT20;
 	if (IEEE80211_IS_CHAN_HT40(chan))
 		status |= IFM_IEEE80211_HT40;
 #endif
 	return (status);
 }
 
 void
 ieee80211_media_status(struct ifnet *ifp, struct ifmediareq *imr)
 {
 	struct ieee80211vap *vap = ifp->if_softc;
 	struct ieee80211com *ic = vap->iv_ic;
 	enum ieee80211_phymode mode;
 
 	imr->ifm_status = IFM_AVALID;
 	/*
 	 * NB: use the current channel's mode to lock down a xmit
 	 * rate only when running; otherwise we may have a mismatch
 	 * in which case the rate will not be convertible.
 	 */
 	if (vap->iv_state == IEEE80211_S_RUN ||
 	    vap->iv_state == IEEE80211_S_SLEEP) {
 		imr->ifm_status |= IFM_ACTIVE;
 		mode = ieee80211_chan2mode(ic->ic_curchan);
 	} else
 		mode = IEEE80211_MODE_AUTO;
 	imr->ifm_active = media_status(vap->iv_opmode, ic->ic_curchan);
 	/*
 	 * Calculate a current rate if possible.
 	 */
 	if (vap->iv_txparms[mode].ucastrate != IEEE80211_FIXED_RATE_NONE) {
 		/*
 		 * A fixed rate is set, report that.
 		 */
 		imr->ifm_active |= ieee80211_rate2media(ic,
 			vap->iv_txparms[mode].ucastrate, mode);
 	} else if (vap->iv_opmode == IEEE80211_M_STA) {
 		/*
 		 * In station mode report the current transmit rate.
 		 */
 		imr->ifm_active |= ieee80211_rate2media(ic,
 			vap->iv_bss->ni_txrate, mode);
 	} else
 		imr->ifm_active |= IFM_AUTO;
 	if (imr->ifm_status & IFM_ACTIVE)
 		imr->ifm_current = imr->ifm_active;
 }
 
 /*
  * Set the current phy mode of ic.  For 11b and 11g the basic rates of
  * the mode's supported rate set are adjusted first; afterwards the
  * global ERP state is reset.  Always returns 0.
  */
 int
 ieee80211_setmode(struct ieee80211com *ic, enum ieee80211_phymode mode)
 {
 	switch (mode) {
 	case IEEE80211_MODE_11B:
 	case IEEE80211_MODE_11G:
 		/*
 		 * Adjust basic rates in 11b/11g supported rate set.
 		 * Note that if operating on a half/quarter rate channel
 		 * this is a noop as those rate sets are different
 		 * and used instead.
 		 */
 		ieee80211_setbasicrates(&ic->ic_sup_rates[mode], mode);
 		break;
 	default:
 		break;
 	}
 
 	ic->ic_curmode = mode;
 	/* reset global ERP state */
 	ieee80211_reset_erp(ic);
 
 	return (0);
 }
 
 /*
  * Return the phy mode for the specified channel.  The channel tests
  * are tried in order and the first one that matches decides the mode.
  * If none matches a diagnostic is printed and 11b is returned.
  */
 enum ieee80211_phymode
 ieee80211_chan2mode(const struct ieee80211_channel *chan)
 {
 
 	if (IEEE80211_IS_CHAN_VHT_2GHZ(chan))
 		return (IEEE80211_MODE_VHT_2GHZ);
 	if (IEEE80211_IS_CHAN_VHT_5GHZ(chan))
 		return (IEEE80211_MODE_VHT_5GHZ);
 	if (IEEE80211_IS_CHAN_HTA(chan))
 		return (IEEE80211_MODE_11NA);
 	if (IEEE80211_IS_CHAN_HTG(chan))
 		return (IEEE80211_MODE_11NG);
 	if (IEEE80211_IS_CHAN_108G(chan))
 		return (IEEE80211_MODE_TURBO_G);
 	if (IEEE80211_IS_CHAN_ST(chan))
 		return (IEEE80211_MODE_STURBO_A);
 	if (IEEE80211_IS_CHAN_TURBO(chan))
 		return (IEEE80211_MODE_TURBO_A);
 	if (IEEE80211_IS_CHAN_HALF(chan))
 		return (IEEE80211_MODE_HALF);
 	if (IEEE80211_IS_CHAN_QUARTER(chan))
 		return (IEEE80211_MODE_QUARTER);
 	if (IEEE80211_IS_CHAN_A(chan))
 		return (IEEE80211_MODE_11A);
 	if (IEEE80211_IS_CHAN_ANYG(chan))
 		return (IEEE80211_MODE_11G);
 	if (IEEE80211_IS_CHAN_B(chan))
 		return (IEEE80211_MODE_11B);
 	if (IEEE80211_IS_CHAN_FHSS(chan))
 		return (IEEE80211_MODE_FH);
 
 	/* NB: should not get here */
 	printf("%s: cannot map channel to mode; freq %u flags 0x%x\n",
 		__func__, chan->ic_freq, chan->ic_flags);
 	return (IEEE80211_MODE_11B);
 }
 
 /*
  * One entry of a rate lookup table as searched by findmedia():
  * "match" is the key that is compared, "media" the value returned
  * on a hit.
  */
 struct ratemedia {
 	u_int	match;	/* lookup key: rate, optionally or'ed with mode bits */
 	u_int	media;	/* if_media rate subtype returned for this key */
 };
 
 /*
  * Linear search of the first n entries of rates[] for an entry whose
  * match field equals match.  Returns that entry's media value, or
  * IFM_AUTO if there is none.
  */
 static int
 findmedia(const struct ratemedia rates[], int n, u_int match)
 {
 	const struct ratemedia *rm;
 
 	for (rm = rates; n > 0; rm++, n--) {
 		if (rm->match == match)
 			return (rm->media);
 	}
 	return (IFM_AUTO);
 }
 
 /*
  * Convert IEEE80211 rate value to ifmedia subtype.
  * Rate is either a legacy rate in units of 0.5Mbps
  * or an MCS index.
  *
  * For the VHT 5GHz and the two 11n modes the rate is first tried as
  * an MCS (when its MCS/VHT marker bit is set); a hit is returned
  * or'ed with the mode's media bits.  Otherwise the rate is masked
  * with IEEE80211_RATE_VAL and looked up as a legacy rate for the
  * given mode.  IFM_AUTO is returned when nothing matches.
  */
 int
 ieee80211_rate2media(struct ieee80211com *ic, int rate, enum ieee80211_phymode mode)
 {
 	/* Legacy rates; the key is the rate or'ed with the mode bits. */
 	static const struct ratemedia rates[] = {
 		{   2 | IFM_IEEE80211_FH, IFM_IEEE80211_FH1 },
 		{   4 | IFM_IEEE80211_FH, IFM_IEEE80211_FH2 },
 		{   2 | IFM_IEEE80211_11B, IFM_IEEE80211_DS1 },
 		{   4 | IFM_IEEE80211_11B, IFM_IEEE80211_DS2 },
 		{  11 | IFM_IEEE80211_11B, IFM_IEEE80211_DS5 },
 		{  22 | IFM_IEEE80211_11B, IFM_IEEE80211_DS11 },
 		{  44 | IFM_IEEE80211_11B, IFM_IEEE80211_DS22 },
 		{  12 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM6 },
 		{  18 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM9 },
 		{  24 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM12 },
 		{  36 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM18 },
 		{  48 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM24 },
 		{  72 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM36 },
 		{  96 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM48 },
 		{ 108 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM54 },
 		{   2 | IFM_IEEE80211_11G, IFM_IEEE80211_DS1 },
 		{   4 | IFM_IEEE80211_11G, IFM_IEEE80211_DS2 },
 		{  11 | IFM_IEEE80211_11G, IFM_IEEE80211_DS5 },
 		{  22 | IFM_IEEE80211_11G, IFM_IEEE80211_DS11 },
 		{  12 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM6 },
 		{  18 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM9 },
 		{  24 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM12 },
 		{  36 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM18 },
 		{  48 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM24 },
 		{  72 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM36 },
 		{  96 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM48 },
 		{ 108 | IFM_IEEE80211_11G, IFM_IEEE80211_OFDM54 },
 		{   6 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM3 },
 		{   9 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM4 },
 		{  54 | IFM_IEEE80211_11A, IFM_IEEE80211_OFDM27 },
 		/* NB: OFDM72 doesn't really exist so we don't handle it */
 	};
 	/* HT MCS indices 0-76; all map to the placeholder MCS subtype. */
 	static const struct ratemedia htrates[] = {
 		{   0, IFM_IEEE80211_MCS },
 		{   1, IFM_IEEE80211_MCS },
 		{   2, IFM_IEEE80211_MCS },
 		{   3, IFM_IEEE80211_MCS },
 		{   4, IFM_IEEE80211_MCS },
 		{   5, IFM_IEEE80211_MCS },
 		{   6, IFM_IEEE80211_MCS },
 		{   7, IFM_IEEE80211_MCS },
 		{   8, IFM_IEEE80211_MCS },
 		{   9, IFM_IEEE80211_MCS },
 		{  10, IFM_IEEE80211_MCS },
 		{  11, IFM_IEEE80211_MCS },
 		{  12, IFM_IEEE80211_MCS },
 		{  13, IFM_IEEE80211_MCS },
 		{  14, IFM_IEEE80211_MCS },
 		{  15, IFM_IEEE80211_MCS },
 		{  16, IFM_IEEE80211_MCS },
 		{  17, IFM_IEEE80211_MCS },
 		{  18, IFM_IEEE80211_MCS },
 		{  19, IFM_IEEE80211_MCS },
 		{  20, IFM_IEEE80211_MCS },
 		{  21, IFM_IEEE80211_MCS },
 		{  22, IFM_IEEE80211_MCS },
 		{  23, IFM_IEEE80211_MCS },
 		{  24, IFM_IEEE80211_MCS },
 		{  25, IFM_IEEE80211_MCS },
 		{  26, IFM_IEEE80211_MCS },
 		{  27, IFM_IEEE80211_MCS },
 		{  28, IFM_IEEE80211_MCS },
 		{  29, IFM_IEEE80211_MCS },
 		{  30, IFM_IEEE80211_MCS },
 		{  31, IFM_IEEE80211_MCS },
 		{  32, IFM_IEEE80211_MCS },
 		{  33, IFM_IEEE80211_MCS },
 		{  34, IFM_IEEE80211_MCS },
 		{  35, IFM_IEEE80211_MCS },
 		{  36, IFM_IEEE80211_MCS },
 		{  37, IFM_IEEE80211_MCS },
 		{  38, IFM_IEEE80211_MCS },
 		{  39, IFM_IEEE80211_MCS },
 		{  40, IFM_IEEE80211_MCS },
 		{  41, IFM_IEEE80211_MCS },
 		{  42, IFM_IEEE80211_MCS },
 		{  43, IFM_IEEE80211_MCS },
 		{  44, IFM_IEEE80211_MCS },
 		{  45, IFM_IEEE80211_MCS },
 		{  46, IFM_IEEE80211_MCS },
 		{  47, IFM_IEEE80211_MCS },
 		{  48, IFM_IEEE80211_MCS },
 		{  49, IFM_IEEE80211_MCS },
 		{  50, IFM_IEEE80211_MCS },
 		{  51, IFM_IEEE80211_MCS },
 		{  52, IFM_IEEE80211_MCS },
 		{  53, IFM_IEEE80211_MCS },
 		{  54, IFM_IEEE80211_MCS },
 		{  55, IFM_IEEE80211_MCS },
 		{  56, IFM_IEEE80211_MCS },
 		{  57, IFM_IEEE80211_MCS },
 		{  58, IFM_IEEE80211_MCS },
 		{  59, IFM_IEEE80211_MCS },
 		{  60, IFM_IEEE80211_MCS },
 		{  61, IFM_IEEE80211_MCS },
 		{  62, IFM_IEEE80211_MCS },
 		{  63, IFM_IEEE80211_MCS },
 		{  64, IFM_IEEE80211_MCS },
 		{  65, IFM_IEEE80211_MCS },
 		{  66, IFM_IEEE80211_MCS },
 		{  67, IFM_IEEE80211_MCS },
 		{  68, IFM_IEEE80211_MCS },
 		{  69, IFM_IEEE80211_MCS },
 		{  70, IFM_IEEE80211_MCS },
 		{  71, IFM_IEEE80211_MCS },
 		{  72, IFM_IEEE80211_MCS },
 		{  73, IFM_IEEE80211_MCS },
 		{  74, IFM_IEEE80211_MCS },
 		{  75, IFM_IEEE80211_MCS },
 		{  76, IFM_IEEE80211_MCS },
 	};
 	/* VHT MCS indices; all map to the placeholder VHT subtype. */
 	static const struct ratemedia vhtrates[] = {
 		{   0, IFM_IEEE80211_VHT },
 		{   1, IFM_IEEE80211_VHT },
 		{   2, IFM_IEEE80211_VHT },
 		{   3, IFM_IEEE80211_VHT },
 		{   4, IFM_IEEE80211_VHT },
 		{   5, IFM_IEEE80211_VHT },
 		{   6, IFM_IEEE80211_VHT },
 		{   7, IFM_IEEE80211_VHT },
 		{   8, IFM_IEEE80211_VHT },	/* Optional. */
 		{   9, IFM_IEEE80211_VHT },	/* Optional. */
 #if 0
 		/* Some QCA and BRCM seem to support this; offspec. */
 		{  10, IFM_IEEE80211_VHT },
 		{  11, IFM_IEEE80211_VHT },
 #endif
 	};
 	int m;
 
 	/*
 	 * Check 11ac/11n rates first for match as an MCS.
 	 * If the marker bit is not set, or the index is not in the
 	 * table, control falls through to the legacy lookup below.
 	 */
 	if (mode == IEEE80211_MODE_VHT_5GHZ) {
 		/*
 		 * NOTE(review): this tests the rate against the media
 		 * subtype constant IFM_IEEE80211_VHT rather than a
 		 * rate flag like IEEE80211_RATE_MCS -- confirm intended.
 		 */
 		if (rate & IFM_IEEE80211_VHT) {
 			rate &= ~IFM_IEEE80211_VHT;
 			m = findmedia(vhtrates, nitems(vhtrates), rate);
 			if (m != IFM_AUTO)
 				return (m | IFM_IEEE80211_VHT);
 		}
 	} else if (mode == IEEE80211_MODE_11NA) {
 		if (rate & IEEE80211_RATE_MCS) {
 			rate &= ~IEEE80211_RATE_MCS;
 			m = findmedia(htrates, nitems(htrates), rate);
 			if (m != IFM_AUTO)
 				return m | IFM_IEEE80211_11NA;
 		}
 	} else if (mode == IEEE80211_MODE_11NG) {
 		/* NB: 12 is ambiguous, it will be treated as an MCS */
 		if (rate & IEEE80211_RATE_MCS) {
 			rate &= ~IEEE80211_RATE_MCS;
 			m = findmedia(htrates, nitems(htrates), rate);
 			if (m != IFM_AUTO)
 				return m | IFM_IEEE80211_11NG;
 		}
 	}
 	/* Legacy lookup: keep only the rate value bits. */
 	rate &= IEEE80211_RATE_VAL;
 	switch (mode) {
 	case IEEE80211_MODE_11A:
 	case IEEE80211_MODE_HALF:		/* XXX good 'nuf */
 	case IEEE80211_MODE_QUARTER:
 	case IEEE80211_MODE_11NA:
 	case IEEE80211_MODE_TURBO_A:
 	case IEEE80211_MODE_STURBO_A:
 		return findmedia(rates, nitems(rates),
 		    rate | IFM_IEEE80211_11A);
 	case IEEE80211_MODE_11B:
 		return findmedia(rates, nitems(rates),
 		    rate | IFM_IEEE80211_11B);
 	case IEEE80211_MODE_FH:
 		return findmedia(rates, nitems(rates),
 		    rate | IFM_IEEE80211_FH);
 	case IEEE80211_MODE_AUTO:
 		/* NB: ic may be NULL for some drivers */
 		if (ic != NULL && ic->ic_phytype == IEEE80211_T_FH)
 			return findmedia(rates, nitems(rates),
 			    rate | IFM_IEEE80211_FH);
 		/* NB: hack, 11g matches both 11b+11a rates */
 		/* fall thru... */
 	case IEEE80211_MODE_11G:
 	case IEEE80211_MODE_11NG:
 	case IEEE80211_MODE_TURBO_G:
 		return findmedia(rates, nitems(rates), rate | IFM_IEEE80211_11G);
 	case IEEE80211_MODE_VHT_2GHZ:
 	case IEEE80211_MODE_VHT_5GHZ:
 		/* XXX TODO: need to figure out mapping for VHT rates */
 		return IFM_AUTO;
 	}
 	/* Reached only for a mode value not covered by the switch. */
 	return IFM_AUTO;
 }
 
 /*
  * Convert the subtype of an ifmedia word into an IEEE80211 rate.
  * The table is indexed by the media subtype; -1 is stored for the
  * AUTO, MCS and VHT subtypes and 0 for subtypes that have no rate.
  * Subtypes beyond the end of the table yield 0.
  */
 int
 ieee80211_media2rate(int mword)
 {
 	static const int ieeerates[] = {
 		-1,		/* IFM_AUTO */
 		0,		/* IFM_MANUAL */
 		0,		/* IFM_NONE */
 		2,		/* IFM_IEEE80211_FH1 */
 		4,		/* IFM_IEEE80211_FH2 */
 		2,		/* IFM_IEEE80211_DS1 */
 		4,		/* IFM_IEEE80211_DS2 */
 		11,		/* IFM_IEEE80211_DS5 */
 		22,		/* IFM_IEEE80211_DS11 */
 		44,		/* IFM_IEEE80211_DS22 */
 		12,		/* IFM_IEEE80211_OFDM6 */
 		18,		/* IFM_IEEE80211_OFDM9 */
 		24,		/* IFM_IEEE80211_OFDM12 */
 		36,		/* IFM_IEEE80211_OFDM18 */
 		48,		/* IFM_IEEE80211_OFDM24 */
 		72,		/* IFM_IEEE80211_OFDM36 */
 		96,		/* IFM_IEEE80211_OFDM48 */
 		108,		/* IFM_IEEE80211_OFDM54 */
 		144,		/* IFM_IEEE80211_OFDM72 */
 		0,		/* IFM_IEEE80211_DS354k */
 		0,		/* IFM_IEEE80211_DS512k */
 		6,		/* IFM_IEEE80211_OFDM3 */
 		9,		/* IFM_IEEE80211_OFDM4 */
 		54,		/* IFM_IEEE80211_OFDM27 */
 		-1,		/* IFM_IEEE80211_MCS */
 		-1,		/* IFM_IEEE80211_VHT */
 	};
 
 	if (IFM_SUBTYPE(mword) < nitems(ieeerates))
 		return (ieeerates[IFM_SUBTYPE(mword)]);
 	return (0);
 }
 
 /*
  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
  */
 #define	mix(a, b, c)							\
 do {									\
 	a -= b; a -= c; a ^= (c >> 13);					\
 	b -= c; b -= a; b ^= (a << 8);					\
 	c -= a; c -= b; c ^= (b >> 13);					\
 	a -= b; a -= c; a ^= (c >> 12);					\
 	b -= c; b -= a; b ^= (a << 16);					\
 	c -= a; c -= b; c ^= (b >> 5);					\
 	a -= b; a -= c; a ^= (c >> 3);					\
 	b -= c; b -= a; b ^= (a << 10);					\
 	c -= a; c -= b; c ^= (b >> 15);					\
 } while (/*CONSTCOND*/0)
 
 /*
  * Hash a MAC address into a 32-bit value, seeded with ic_hash_key.
  * The six address bytes are folded into two words which are then
  * mixed with the key; the mixed key word is the result.
  */
 uint32_t
 ieee80211_mac_hash(const struct ieee80211com *ic,
 	const uint8_t addr[IEEE80211_ADDR_LEN])
 {
 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = ic->ic_hash_key;
 
 	/*
 	 * NB: widen each byte to uint32_t before shifting.  A uint8_t
 	 * operand is promoted to (signed) int, and shifting a value
 	 * >= 0x80 left by 24 would overflow int, which is undefined.
 	 */
 	b += (uint32_t)addr[5] << 8;
 	b += addr[4];
 	a += (uint32_t)addr[3] << 24;
 	a += (uint32_t)addr[2] << 16;
 	a += (uint32_t)addr[1] << 8;
 	a += addr[0];
 
 	mix(a, b, c);
 
 	return c;
 }
 #undef mix
 
 /*
  * Return a one-letter code for the type of channel c.  The channel
  * tests are tried in order and the first match decides the letter;
  * 'f' is returned when none of them matches.
  */
 char
 ieee80211_channel_type_char(const struct ieee80211_channel *c)
 {
 	char type = 'f';
 
 	if (IEEE80211_IS_CHAN_ST(c))
 		type = 'S';
 	else if (IEEE80211_IS_CHAN_108A(c))
 		type = 'T';
 	else if (IEEE80211_IS_CHAN_108G(c))
 		type = 'G';
 	else if (IEEE80211_IS_CHAN_VHT(c))
 		type = 'v';
 	else if (IEEE80211_IS_CHAN_HT(c))
 		type = 'n';
 	else if (IEEE80211_IS_CHAN_A(c))
 		type = 'a';
 	else if (IEEE80211_IS_CHAN_ANYG(c))
 		type = 'g';
 	else if (IEEE80211_IS_CHAN_B(c))
 		type = 'b';
 
 	return (type);
 }
diff --git a/sys/net80211/ieee80211_adhoc.c b/sys/net80211/ieee80211_adhoc.c
index f591015ab3e0..4789bdd65aa1 100644
--- a/sys/net80211/ieee80211_adhoc.c
+++ b/sys/net80211/ieee80211_adhoc.c
@@ -1,1062 +1,1063 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11 IBSS mode support.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net/bpf.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_adhoc.h>
 #include <net80211/ieee80211_input.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 #include <net80211/ieee80211_tdma.h>
 #endif
 #include <net80211/ieee80211_sta.h>
 
 /*
  * Mask a rate code with IEEE80211_RATE_VAL and halve it (integer
  * division).  Used below to print a transmit rate as "%uMb".
  */
 #define	IEEE80211_RATE2MBS(r)	(((r) & IEEE80211_RATE_VAL) / 2)
 
 /* Forward declarations of the file-local handlers installed by adhoc_vattach. */
 static	void adhoc_vattach(struct ieee80211vap *);
 static	int adhoc_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static int adhoc_input(struct ieee80211_node *, struct mbuf *,
 	    const struct ieee80211_rx_stats *, int, int);
 static void adhoc_recv_mgmt(struct ieee80211_node *, struct mbuf *,
 	int subtype, const struct ieee80211_rx_stats *, int, int);
 static void ahdemo_recv_mgmt(struct ieee80211_node *, struct mbuf *,
 	    int subtype, const struct ieee80211_rx_stats *rxs, int, int);
 static void adhoc_recv_ctl(struct ieee80211_node *, struct mbuf *, int subtype);
 
 /*
  * Register adhoc_vattach as the per-vap attach hook for both operating
  * modes handled by this file (adhoc demo and IBSS).
  */
 void
 ieee80211_adhoc_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_AHDEMO] = adhoc_vattach;
 	ic->ic_vattach[IEEE80211_M_IBSS] = adhoc_vattach;
 }
 
 /*
  * Device-level detach hook for adhoc support.  Intentionally a no-op:
  * the body is empty and ic is not touched.
  */
 void
 ieee80211_adhoc_detach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Per-vap detach hook, installed as iv_opdetach by adhoc_vattach.
  * Intentionally a no-op: the body is empty and vap is not touched.
  */
 static void
 adhoc_vdetach(struct ieee80211vap *vap)
 {
 }
 
 /*
  * Per-vap attach hook: install the adhoc method table on vap.
  *
  * IBSS vaps get the full management-frame handler; any other opmode
  * routed here gets the adhoc-demo handler.
  */
 static void
 adhoc_vattach(struct ieee80211vap *vap)
 {
 	vap->iv_opdetach = adhoc_vdetach;
 	vap->iv_newstate = adhoc_newstate;
 	vap->iv_input = adhoc_input;
 	vap->iv_recv_ctl = adhoc_recv_ctl;
 	vap->iv_recv_mgmt = (vap->iv_opmode == IEEE80211_M_IBSS) ?
 	    adhoc_recv_mgmt : ahdemo_recv_mgmt;
 #ifdef IEEE80211_SUPPORT_TDMA
 	/*
 	 * Hand off to the tdma code last, once all of our own
 	 * callbacks are in place, so that it can layer on top of them.
 	 */
 	if (vap->iv_caps & IEEE80211_C_TDMA)
 		ieee80211_tdma_vattach(vap);
 #endif
 }
 
 /*
  * Node-table iterator callback: make ni leave, unless it is the vap's
  * own bss node.  The arg cookie is unused.
  */
 static void
 sta_leave(void *arg, struct ieee80211_node *ni)
 {
 	if (ni == ni->ni_vap->iv_bss)
 		return;
 	ieee80211_node_leave(ni);
 }
 
 /*
  * IEEE80211_M_IBSS+IEEE80211_M_AHDEMO vap state machine handler.
  *
  * Records nstate in vap->iv_state and then performs the work required
  * for the ostate -> nstate transition.  The com lock must be held
  * (asserted below).  arg is only used in the debug message.
  *
  * Always returns 0; a transition that is not handled is only reported
  * through IEEE80211_DPRINTF.
  */
 static int
 adhoc_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node *ni;
 	enum ieee80211_state ostate;
 
 	IEEE80211_LOCK_ASSERT(vap->iv_ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n",
 	    __func__, ieee80211_state_name[ostate],
 	    ieee80211_state_name[nstate], arg);
 	vap->iv_state = nstate;			/* state transition */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	ni = vap->iv_bss;			/* NB: no reference held */
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		/* Leaving SCAN: stop the scan that was in progress. */
 		switch (ostate) {
 		case IEEE80211_S_SCAN:
 			ieee80211_cancel_scan(vap);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 		}
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_RUN:		/* beacon miss */
 			/* purge station table; entries are stale */
 			ieee80211_iterate_nodes_vap(&ic->ic_sta, vap,
 			    sta_leave, NULL);
 			/* fall thru... */
 		case IEEE80211_S_INIT:
 			if (vap->iv_des_chan != IEEE80211_CHAN_ANYC &&
 			    !IEEE80211_IS_CHAN_RADAR(vap->iv_des_chan)) {
 				/*
 				 * Already have a channel; bypass the
 				 * scan and startup immediately.
 				 */
 				ieee80211_create_ibss(vap,
 				    ieee80211_ht_adjust_channel(ic,
 				    vap->iv_des_chan, vap->iv_flags_ht));
 				break;
 			}
 			/*
 			 * Initiate a scan.  We can come here as a result
 			 * of an IEEE80211_IOC_SCAN_REQ too in which case
 			 * the vap will be marked with IEEE80211_FEXT_SCANREQ
 			 * and the scan request parameters will be present
 			 * in iv_scanreq.  Otherwise we do the default.
 			 */
 			if (vap->iv_flags_ext & IEEE80211_FEXT_SCANREQ) {
 				ieee80211_check_scan(vap,
 				    vap->iv_scanreq_flags,
 				    vap->iv_scanreq_duration,
 				    vap->iv_scanreq_mindwell,
 				    vap->iv_scanreq_maxdwell,
 				    vap->iv_scanreq_nssid, vap->iv_scanreq_ssid);
 				/* The request has been consumed; clear the marker. */
 				vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ;
 			} else
 				ieee80211_check_scan_current(vap);
 			break;
 		case IEEE80211_S_SCAN:
 			/*
 			 * This can happen because of a change in state
 			 * that requires a reset.  Trigger a new scan
 			 * unless we're in manual roaming mode in which
 			 * case an application must issue an explicit request.
 			 */
 			if (vap->iv_roaming == IEEE80211_ROAMING_AUTO)
 				ieee80211_check_scan_current(vap);
 			break;
 		default:
 			goto invalid;
 		}
 		break;
 	case IEEE80211_S_RUN:
 		if (vap->iv_flags & IEEE80211_F_WPA) {
 			/* XXX validate prerequisites */
 		}
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			/*
 			 * Already have a channel; bypass the
 			 * scan and startup immediately.
 			 * Note that ieee80211_create_ibss will call
 			 * back to do a RUN->RUN state change.
 			 */
 			ieee80211_create_ibss(vap,
 			    ieee80211_ht_adjust_channel(ic,
 				ic->ic_curchan, vap->iv_flags_ht));
 			/* NB: iv_bss is changed on return */
 			ni = vap->iv_bss;
 			break;
 		case IEEE80211_S_SCAN:
 #ifdef IEEE80211_DEBUG
 			/* Debug only: report the bss we synchronized with. */
 			if (ieee80211_msg_debug(vap)) {
 				ieee80211_note(vap,
 				    "synchronized with %s ssid ",
 				    ether_sprintf(ni->ni_bssid));
 				ieee80211_print_essid(vap->iv_bss->ni_essid,
 				    ni->ni_esslen);
 				/* XXX MCS/HT */
 				printf(" channel %d start %uMb\n",
 				    ieee80211_chan2ieee(ic, ic->ic_curchan),
 				    IEEE80211_RATE2MBS(ni->ni_txrate));
 			}
 #endif
 			break;
 		case IEEE80211_S_RUN:	/* IBSS merge */
 			break;
 		default:
 			goto invalid;
 		}
 		/*
 		 * When 802.1x is not in use mark the port authorized
 		 * at this point so traffic can flow.
 		 */
 		if (ni->ni_authmode != IEEE80211_AUTH_8021X)
 			ieee80211_node_authorize(ni);
 		/*
 		 * Fake association when joining an existing bss.
 		 * The second argument is non-zero unless this is a
 		 * RUN -> RUN transition.
 		 */
 		if (!IEEE80211_ADDR_EQ(ni->ni_macaddr, vap->iv_myaddr) &&
 		    ic->ic_newassoc != NULL)
 			ic->ic_newassoc(ni, ostate != IEEE80211_S_RUN);
 		break;
 	case IEEE80211_S_SLEEP:
 		vap->iv_sta_ps(vap, 0);
 		break;
 	default:
 	/* Also the target of "goto invalid" from the nested switches above. */
 	invalid:
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 		    "%s: unexpected state transition %s -> %s\n", __func__,
 		    ieee80211_state_name[ostate], ieee80211_state_name[nstate]);
 		break;
 	}
 	return 0;
 }
 
 /*
  * Decide whether a received management frame is worth printing when
  * debugging is enabled.  Beacons arrive constantly, so they are only
  * reported while the device is scanning; every other subtype is
  * always reported.
  *
  * Returns non-zero when the frame should be printed.
  */
 static __inline int
 doprint(struct ieee80211vap *vap, int subtype)
 {
 	if (subtype == IEEE80211_FC0_SUBTYPE_BEACON)
 		return (vap->iv_ic->ic_flags & IEEE80211_F_SCAN);
 	return 1;
 }
 
 /*
  * Process a received frame.  The node associated with the sender
  * should be supplied.  If nothing was found in the node table then
  * the caller is assumed to supply a reference to iv_bss instead.
  * The RSSI and a timestamp are also supplied.  The RSSI data is used
  * during AP scanning to select a AP to associate with; it can have
  * any units so long as values have consistent units and higher values
  * mean ``better signal''.  The receive timestamp is currently not used
  * by the 802.11 layer.
  *
  * The mbuf is always consumed: it is either handed on (reorder queue,
  * delivery to the stack, A-MSDU/fast-frame decap) or freed at "out".
  * Frames that leave through "err" additionally bump the interface
  * input error counter.
  *
  * Returns the 802.11 frame type of the frame.  NB: type is a uint8_t,
  * so the "undefined" marker assigned for frames rejected before the
  * type is parsed is returned as 255 rather than -1.
  */
 static int
 adhoc_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct ether_header *eh;
 	int hdrspace, need_tap = 1;	/* mbuf need to be tapped. */
 	uint8_t dir, type, subtype, qos;
 	uint8_t *bssid;
 	int is_hw_decrypted = 0;
 	int has_decrypted = 0;
 
 	/*
 	 * Some devices do hardware decryption all the way through
 	 * to pretending the frame wasn't encrypted in the first place.
 	 * So, tag it appropriately so it isn't discarded inappropriately.
 	 */
 	if ((rxs != NULL) && (rxs->c_pktflags & IEEE80211_RX_F_DECRYPTED))
 		is_hw_decrypted = 1;
 
 	if (m->m_flags & M_AMPDU_MPDU) {
 		/*
 		 * Fastpath for A-MPDU reorder q resubmission.  Frames
 		 * w/ M_AMPDU_MPDU marked have already passed through
 		 * here but were received out of order and been held on
 		 * the reorder queue.  When resubmitted they are marked
 		 * with the M_AMPDU_MPDU flag and we can bypass most of
 		 * the normal processing.
 		 */
 		wh = mtod(m, struct ieee80211_frame *);
 		type = IEEE80211_FC0_TYPE_DATA;
 		dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 		subtype = IEEE80211_FC0_SUBTYPE_QOS_DATA;
 		hdrspace = ieee80211_hdrspace(ic, wh);	/* XXX optimize? */
 		goto resubmit_ampdu;
 	}
 
 	KASSERT(ni != NULL, ("null node"));
 	ni->ni_inact = ni->ni_inact_reload;
 
 	type = -1;			/* undefined */
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields past outside
 	 * ieee80211_frame_min w/o first validating the data is
 	 * present.
 	 */
 	wh = mtod(m, struct ieee80211_frame *);
 
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version, fc %02x:%02x",
 		    wh->i_fc[0], wh->i_fc[1]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		/* Locate the bssid; its position depends on dir and type. */
 		if (dir != IEEE80211_FC1_DIR_NODS)
 			bssid = wh->i_addr1;
 		else if (type == IEEE80211_FC0_TYPE_CTL)
 			bssid = wh->i_addr1;
 		else {
 			if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 				IEEE80211_DISCARD_MAC(vap,
 				    IEEE80211_MSG_ANY, ni->ni_macaddr,
 				    NULL, "too short (2): len %u",
 				    m->m_pkthdr.len);
 				vap->iv_stats.is_rx_tooshort++;
 				goto out;
 			}
 			bssid = wh->i_addr3;
 		}
 		/*
 		 * Validate the bssid.
 		 */
 		if (!(type == IEEE80211_FC0_TYPE_MGT &&
 		     (subtype == IEEE80211_FC0_SUBTYPE_BEACON ||
 		      subtype == IEEE80211_FC0_SUBTYPE_PROBE_RESP)) &&
 		    !IEEE80211_ADDR_EQ(bssid, vap->iv_bss->ni_bssid) &&
 		    !IEEE80211_ADDR_EQ(bssid, ifp->if_broadcastaddr)) {
 			/* not interested in */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    bssid, NULL, "%s", "not to bss");
 			vap->iv_stats.is_rx_wrongbss++;
 			goto out;
 		}
 		/*
 		 * Data frame, cons up a node when it doesn't
 		 * exist. This should probably done after an ACL check.
 		 */
 		if (type == IEEE80211_FC0_TYPE_DATA &&
 		    ni == vap->iv_bss &&
 		    !IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) {
 			/*
 			 * Beware of frames that come in too early; we
 			 * can receive broadcast frames and creating sta
 			 * entries will blow up because there is no bss
 			 * channel yet.
 			 */
 			if (vap->iv_state != IEEE80211_S_RUN) {
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "data", "not in RUN state (%s)",
 				    ieee80211_state_name[vap->iv_state]);
 				vap->iv_stats.is_rx_badstate++;
 				goto err;
 			}
 			/*
 			 * Fake up a node for this newly discovered member
 			 * of the IBSS.
 			 *
 			 * Note: This doesn't "upgrade" the node to 11n;
 			 * that will happen after a probe request/response
 			 * exchange.
 			 */
 			ni = ieee80211_fakeup_adhoc_node(vap, wh->i_addr2);
 			if (ni == NULL) {
 				/* NB: stat kept for alloc failure */
 				goto err;
 			}
 		}
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		if (IEEE80211_HAS_SEQ(type, subtype) &&
 		    IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) {
 			uint8_t tid = ieee80211_gettid(wh);
 			if (IEEE80211_QOS_HAS_SEQ(wh) &&
 			    TID_TO_WME_AC(tid) >= WME_AC_VI)
 				ic->ic_wme.wme_hipri_traffic++;
 			if (! ieee80211_check_rxseq(ni, wh, bssid, rxs))
 				goto out;
 		}
 	}
 
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (m->m_len < hdrspace) {
 			m = m_pullup(m, hdrspace);
 			if (m == NULL) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 				    ni->ni_macaddr, NULL,
 				    "data too short: expecting %u", hdrspace);
 				vap->iv_stats.is_rx_tooshort++;
 				goto out;		/* XXX */
 			}
 			/*
 			 * m_pullup may have moved the header into a
 			 * different mbuf; reload wh so the uses below
 			 * do not reference the old storage.
 			 */
 			wh = mtod(m, struct ieee80211_frame *);
 		}
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto out;
 		}
 		/* XXX no power-save support */
 
 		/*
 		 * Handle A-MPDU re-ordering.  If the frame is to be
 		 * processed directly then ieee80211_ampdu_reorder
 		 * will return 0; otherwise it has consumed the mbuf
 		 * and we should do nothing more with it.
 		 */
 		if ((m->m_flags & M_AMPDU) &&
 		    ieee80211_ampdu_reorder(ni, m, rxs) != 0) {
 			m = NULL;
 			goto out;
 		}
 	resubmit_ampdu:
 
 		/*
 		 * Handle privacy requirements.  Note that we
 		 * must not be preempted from here until after
 		 * we (potentially) call ieee80211_crypto_demic;
 		 * otherwise we may violate assumptions in the
 		 * crypto cipher modules used to do delayed update
 		 * of replay sequence numbers.
 		 */
 		if (is_hw_decrypted || IEEE80211_IS_PROTECTED(wh)) {
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "WEP", "%s", "PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				IEEE80211_NODE_STAT(ni, rx_noprivacy);
 				goto out;
 			}
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				IEEE80211_NODE_STAT(ni, rx_wepfail);
 				goto out;
 			}
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 			has_decrypted = 1;
 		} else {
 			/* XXX M_WEP and IEEE80211_F_PRIVACY */
 			key = NULL;
 		}
 
 		/*
 		 * Save QoS bits for use below--before we strip the header.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_QOS_DATA)
 			qos = ieee80211_getqos(wh)[0];
 		else
 			qos = 0;
 
 		/*
 		 * Next up, any fragmentation.
 		 */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace, has_decrypted);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = NULL;		/* no longer valid, catch any uses */
 
 		/*
 		 * Next strip any MSDU crypto bits.
 		 */
 		if (!ieee80211_crypto_demic(vap, key, m, 0)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "demic error");
 			vap->iv_stats.is_rx_demicfail++;
 			IEEE80211_NODE_STAT(ni, rx_demicfail);
 			goto out;
 		}
 
 		/* copy to listener after decrypt */
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		need_tap = 0;
 
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = ieee80211_decap(vap, m, hdrspace, qos);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		/* An A-MSDU has no single ethernet header to inspect yet. */
 		if (!(qos & IEEE80211_QOS_AMSDU))
 			eh = mtod(m, struct ether_header *);
 		else
 			eh = NULL;
 		if (!ieee80211_node_is_authorized(ni)) {
 			/*
 			 * Deny any non-PAE frames received prior to
 			 * authorization.  For open/shared-key
 			 * authentication the port is mark authorized
 			 * after authentication completes.  For 802.1x
 			 * the port is not marked authorized by the
 			 * authenticator until the handshake has completed.
 			 */
 			if (eh == NULL ||
 			    eh->ether_type != htons(ETHERTYPE_PAE)) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 				    ni->ni_macaddr, "data", "unauthorized or "
 				    "unknown port: ether type 0x%x len %u",
 				    eh == NULL ? -1 : eh->ether_type,
 				    m->m_pkthdr.len);
 				vap->iv_stats.is_rx_unauth++;
 				IEEE80211_NODE_STAT(ni, rx_unauth);
 				goto err;
 			}
 		} else {
 			/*
 			 * When denying unencrypted frames, discard
 			 * any non-PAE frames received without encryption.
 			 */
 			if ((vap->iv_flags & IEEE80211_F_DROPUNENC) &&
 			    ((has_decrypted == 0) && (m->m_flags & M_WEP) == 0) &&
 			    (is_hw_decrypted == 0) &&
 			    (eh == NULL ||
 			     eh->ether_type != htons(ETHERTYPE_PAE))) {
 				/*
 				 * Drop unencrypted frames.
 				 */
 				vap->iv_stats.is_rx_unencrypted++;
 				IEEE80211_NODE_STAT(ni, rx_unencrypted);
 				goto out;
 			}
 		}
 		/* XXX require HT? */
 		if (qos & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 			m = ieee80211_decap_fastframe(vap, ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 #endif
 		}
 		if (dir == IEEE80211_FC1_DIR_DSTODS && ni->ni_wdsvap != NULL)
 			ieee80211_deliver_data(ni->ni_wdsvap, ni, m);
 		else
 			ieee80211_deliver_data(vap, ni, m);
 		return IEEE80211_FC0_TYPE_DATA;
 
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if ((ieee80211_msg_debug(vap) && doprint(vap, subtype)) ||
 		    ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 		if (IEEE80211_IS_PROTECTED(wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "WEP set but not permitted");
 			vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 			goto out;
 		}
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		goto out;
 
 	case IEEE80211_FC0_TYPE_CTL:
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		vap->iv_recv_ctl(ni, m, subtype);
 		goto out;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "bad", "frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	/* Anything still owned here is tapped (if not already) and freed. */
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 }
 
 /*
  * Decide whether a probe request came from a legacy 11b-only client.
  *
  * rates and xrates point at the supported-rates and extended-rates
  * elements (id octet, length octet, payload) or are NULL when absent.
  * The client qualifies only when there is a rates element, no xrates
  * element, and every listed rate code (after masking with
  * IEEE80211_RATE_VAL) is one of 2, 4, 11 or 22.
  *
  * Returns 1 for such a client, 0 otherwise.
  */
 static int
 is11bclient(const uint8_t *rates, const uint8_t *xrates)
 {
 	/* One bit per accepted rate code. */
 	static const uint32_t b_rate_mask =
 	    (1 << 2) | (1 << 4) | (1 << 11) | (1 << 22);
 	const uint8_t *rp, *end;
 
 	/* NB: the 11b clients we care about will not have xrates */
 	if (rates == NULL || xrates != NULL)
 		return 0;
 
 	end = rates + 2 + rates[1];
 	for (rp = rates + 2; rp < end; rp++) {
 		int code = *rp & IEEE80211_RATE_VAL;
 
 		if (code > 22 || (b_rate_mask & (1 << code)) == 0)
 			return 0;
 	}
 	return 1;
 }
 
 /*
  * Management frame handler for IBSS vaps (installed as iv_recv_mgmt).
  *
  *  - Beacons / probe responses are parsed; while scanning they are
  *    handed to the scan module, otherwise frames with the IBSS
  *    capability bit are used to create or refresh neighbor entries.
  *  - Probe requests are validated and answered with a probe response.
  *  - Action frames are checked and passed to ic_recv_action.
  *  - Every other subtype is counted and discarded.
  *
  * The mbuf m0 is only read here; the caller (adhoc_input) frees it.
  * rxs may be NULL.
  */
 static void
 adhoc_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
 	int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_channel *rxchan = ic->ic_curchan;
 	struct ieee80211_frame *wh;
 	uint8_t *frm, *efrm;
 	uint8_t *ssid, *rates, *xrates;
 #if 0
 	int ht_state_change = 0;
 #endif
 
 	/* frm/efrm bracket the frame body that follows the 802.11 header. */
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (uint8_t *)&wh[1];
 	efrm = mtod(m0, uint8_t *) + m0->m_len;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_INPUT | IEEE80211_MSG_DEBUG,
 	    "%s: recv mgmt frame, addr2=%6D, ni=%p (%6D) fc=%.02x %.02x\n",
 	    __func__,
 	    wh->i_addr2, ":",
 	    ni,
 	    ni->ni_macaddr, ":",
 	    wh->i_fc[0],
 	    wh->i_fc[1]);
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_BEACON: {
 		struct ieee80211_scanparams scan;
 		struct ieee80211_channel *c;
 		/*
 		 * We process beacon/probe response
 		 * frames to discover neighbors.
 		 */
 		if (rxs != NULL) {
 			/* Prefer the channel reported by the driver. */
 			c = ieee80211_lookup_channel_rxstatus(vap, rxs);
 			if (c != NULL)
 				rxchan = c;
 		}
 		if (ieee80211_parse_beacon(ni, m0, rxchan, &scan) != 0)
 			return;
 		/*
 		 * Count frame now that we know it's to be processed.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_BEACON) {
 			vap->iv_stats.is_rx_beacon++;		/* XXX remove */
 			IEEE80211_NODE_STAT(ni, rx_beacons);
 		} else
 			IEEE80211_NODE_STAT(ni, rx_proberesp);
 		/*
 		 * If scanning, just pass information to the scan module.
 		 */
 		if (ic->ic_flags & IEEE80211_F_SCAN) {
 			if (ic->ic_flags_ext & IEEE80211_FEXT_PROBECHAN) {
 				/*
 				 * Actively scanning a channel marked passive;
 				 * send a probe request now that we know there
 				 * is 802.11 traffic present.
 				 *
 				 * XXX check if the beacon we recv'd gives
 				 * us what we need and suppress the probe req
 				 */
 				ieee80211_probe_curchan(vap, 1);
 				ic->ic_flags_ext &= ~IEEE80211_FEXT_PROBECHAN;
 			}
 			ieee80211_add_scan(vap, rxchan, &scan, wh,
 			    subtype, rssi, nf);
 			return;
 		}
 		if (scan.capinfo & IEEE80211_CAPINFO_IBSS) {
 			if (!IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) {
 				/*
 				 * Create a new entry in the neighbor table.
 				 *
 				 * XXX TODO:
 				 *
 				 * Here we're not scanning; so if we have an
 				 * SSID then make sure it matches our SSID.
 				 * Otherwise this code will match on all IBSS
 				 * beacons/probe requests for all SSIDs,
 				 * filling the node table with nodes that
 				 * aren't ours.
 				 */
 				if (ieee80211_ibss_node_check_new(ni, &scan))
 					ni = ieee80211_add_neighbor(vap, wh, &scan);
 				else
 					ni = NULL;
 
 				/*
 				 * Send a probe request so we announce 11n
 				 * capabilities.
 				 *
 				 * Don't do this if we're scanning, and don't
 				 * do it without a node: ni is NULL when the
 				 * check above rejected the sender, and the
 				 * result of ieee80211_add_neighbor is not
 				 * otherwise validated, so guard against NULL
 				 * rather than dereference it.  Exactly one
 				 * request is sent per new neighbor.
 				 */
 				if (ni != NULL &&
 				    !(ic->ic_flags & IEEE80211_F_SCAN))
 					ieee80211_send_probereq(ni, /* node */
 						vap->iv_myaddr, /* SA */
 						ni->ni_macaddr, /* DA */
 						vap->iv_bss->ni_bssid, /* BSSID */
 						vap->iv_bss->ni_essid,
 						vap->iv_bss->ni_esslen); /* SSID */
 
 			} else if (ni->ni_capinfo == 0) {
 				/*
 				 * Update faked node created on transmit.
 				 * Note this also updates the tsf.
 				 */
 				ieee80211_init_neighbor(ni, wh, &scan);
 
 				/*
 				 * Send a probe request so we announce 11n
 				 * capabilities.
 				 */
 				ieee80211_send_probereq(ni, /* node */
 					vap->iv_myaddr, /* SA */
 					ni->ni_macaddr, /* DA */
 					vap->iv_bss->ni_bssid, /* BSSID */
 					vap->iv_bss->ni_essid,
 					vap->iv_bss->ni_esslen); /* SSID */
 			} else {
 				/*
 				 * Record tsf for potential resync.
 				 */
 				memcpy(ni->ni_tstamp.data, scan.tstamp,
 					sizeof(ni->ni_tstamp));
 			}
 			/*
 			 * This isn't enabled yet - otherwise it would
 			 * update the HT parameters and channel width
 			 * from any node, which could lead to lots of
 			 * strange behaviour if the 11n nodes aren't
 			 * exactly configured to match.
 			 */
 #if 0
 			if (scan.htcap != NULL && scan.htinfo != NULL &&
 			    (vap->iv_flags_ht & IEEE80211_FHT_HT)) {
 				ieee80211_ht_updateparams(ni,
 				    scan.htcap, scan.htinfo));
 				if (ieee80211_ht_updateparams_final(ni,
 				    scan.htcap, scan.htinfo))
 					ht_state_change = 1;
 			}
 
 			/* XXX same for VHT? */
 #endif
 			if (ni != NULL) {
 				IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 				ni->ni_noise = nf;
 			}
 			/*
 			 * Same here - the channel width change should
 			 * be applied to the specific peer node, not
 			 * to the ic.  Ie, the interface configuration
 			 * should stay in its current channel width;
 			 * but it should change the rate control and
 			 * any queued frames for the given node only.
 			 *
 			 * Since there's no (current) way to inform
 			 * the driver that a channel width change has
 			 * occurred for a single node, just stub this
 			 * out.
 			 */
 #if 0
 			if (ht_state_change)
 				ieee80211_update_chw(ic);
 #endif
 		}
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 		if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (IEEE80211_IS_MULTICAST(wh->i_addr2)) {
 			/* frame must be directed */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not unicast");
 			vap->iv_stats.is_rx_mgtdiscard++;	/* XXX stat */
 			return;
 		}
 
 		/*
 		 * prreq frame format
 		 *	[tlv] ssid
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 */
 		ssid = rates = xrates = NULL;
 		/* Walk the elements, remembering the three we care about. */
 		while (efrm - frm > 1) {
 			IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return);
 			switch (*frm) {
 			case IEEE80211_ELEMID_SSID:
 				ssid = frm;
 				break;
 			case IEEE80211_ELEMID_RATES:
 				rates = frm;
 				break;
 			case IEEE80211_ELEMID_XRATES:
 				xrates = frm;
 				break;
 			}
 			frm += frm[1] + 2;
 		}
 		IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return);
 		if (xrates != NULL)
 			IEEE80211_VERIFY_ELEMENT(xrates,
 				IEEE80211_RATE_MAXSIZE - rates[1], return);
 		IEEE80211_VERIFY_ELEMENT(ssid, IEEE80211_NWID_LEN, return);
 		IEEE80211_VERIFY_SSID(vap->iv_bss, ssid, return);
 		if ((vap->iv_flags & IEEE80211_F_HIDESSID) && ssid[1] == 0) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL,
 			    "%s", "no ssid with ssid suppression enabled");
 			vap->iv_stats.is_rx_ssidmismatch++; /*XXX*/
 			return;
 		}
 
 		/* XXX find a better class or define it's own */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_INPUT, wh->i_addr2,
 		    "%s", "recv probe req");
 		/*
 		 * Some legacy 11b clients cannot hack a complete
 		 * probe response frame.  When the request includes
 		 * only a bare-bones rate set, communicate this to
 		 * the transmit side.
 		 */
 		ieee80211_send_proberesp(vap, wh->i_addr2,
 		    is11bclient(rates, xrates) ? IEEE80211_SEND_LEGACY_11B : 0);
 
 		/*
 		 * Note: we don't benefit from stashing the probe request
 		 * IEs away to use for IBSS negotiation, because we
 		 * typically don't get all of the IEs.
 		 */
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if ((ni == vap->iv_bss) &&
 		    !IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "unknown node");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1) &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT | IEEE80211_MSG_DEBUG,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT | IEEE80211_MSG_DEBUG,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
 #undef IEEE80211_VERIFY_LENGTH
 #undef IEEE80211_VERIFY_ELEMENT
 
 /*
  * Management frame handler for adhoc-demo vaps (installed as
  * iv_recv_mgmt when the opmode is not IBSS).
  *
  * While the device is scanning, frames are forwarded to
  * adhoc_recv_mgmt so they can feed a site survey.  Otherwise every
  * management frame is counted and discarded.
  */
 static void
 ahdemo_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
 	int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 #ifdef IEEE80211_DEBUG
 	struct ieee80211_frame *wh;
 #endif
 
 	/* Scanning: let the IBSS handler do the work. */
 	if (ic->ic_flags & IEEE80211_F_SCAN) {
 		adhoc_recv_mgmt(ni, m0, subtype, rxs, rssi, nf);
 		return;
 	}
 
 #ifdef IEEE80211_DEBUG
 	/* The header is only needed for the discard messages below. */
 	wh = mtod(m0, struct ieee80211_frame *);
 #endif
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		/* Known subtype, deliberately ignored in this mode. */
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		     wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		     wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
 
 /*
  * Control frame handler (installed as iv_recv_ctl).  Only BAR frames
  * are acted upon, by passing them to ieee80211_recv_bar; every other
  * control subtype is ignored.
  */
 static void
 adhoc_recv_ctl(struct ieee80211_node *ni, struct mbuf *m, int subtype)
 {
 
 	if (subtype == IEEE80211_FC0_SUBTYPE_BAR)
 		ieee80211_recv_bar(ni, m);
 }
diff --git a/sys/net80211/ieee80211_ddb.c b/sys/net80211/ieee80211_ddb.c
index af4f8ea74799..b0e037d492fd 100644
--- a/sys/net80211/ieee80211_ddb.c
+++ b/sys/net80211/ieee80211_ddb.c
@@ -1,927 +1,927 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_wlan.h"
 
 #ifdef DDB
 /*
  * IEEE 802.11 DDB support
  */
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/ethernet.h>
 #include <net/vnet.h>
 
 #include <net80211/ieee80211_var.h>
 #ifdef IEEE80211_SUPPORT_TDMA
 #include <net80211/ieee80211_tdma.h>
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 #include <net80211/ieee80211_mesh.h>
 #endif
 
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 
 /*
  * Print one "<prefix><name> : <symbol>" line.  The name is left-justified
  * in a 25-column field and addr is rendered through db_printsym() using
  * DB_STGY_ANY, followed by a newline.
  */
 #define DB_PRINTSYM(prefix, name, addr) do { \
 	db_printf("%s%-25s : ",  prefix, name); \
 	db_printsym((db_addr_t) addr, DB_STGY_ANY); \
 	db_printf("\n"); \
 } while (0)
 
 static void _db_show_sta(const struct ieee80211_node *);
 static void _db_show_vap(const struct ieee80211vap *, int, int);
 static void _db_show_com(const struct ieee80211com *,
 	int showvaps, int showsta, int showmesh, int showprocs);
 
 static void _db_show_all_vaps(void *, struct ieee80211com *);
 
 static void _db_show_node_table(const char *tag,
 	const struct ieee80211_node_table *);
 static void _db_show_channel(const char *tag, const struct ieee80211_channel *);
 static void _db_show_ssid(const char *tag, int ix, int len, const uint8_t *);
 static void _db_show_appie(const char *tag, const struct ieee80211_appie *);
 static void _db_show_key(const char *tag, int ix, const struct ieee80211_key *);
 static void _db_show_roamparams(const char *tag, const void *arg,
 	const struct ieee80211_roamparam *rp);
 static void _db_show_txparams(const char *tag, const void *arg,
 	const struct ieee80211_txparam *tp);
 static void _db_show_ageq(const char *tag, const struct ieee80211_ageq *q);
 static void _db_show_stats(const struct ieee80211_stats *);
 #ifdef IEEE80211_SUPPORT_MESH
 static void _db_show_mesh(const struct ieee80211_mesh_state *);
 #endif
 
 /* "show sta <addr>": dump the ieee80211_node located at addr. */
 DB_SHOW_COMMAND(sta, db_show_sta)
 {
 	if (have_addr)
 		_db_show_sta((const struct ieee80211_node *) addr);
 	else
 		db_printf("usage: show sta <addr>\n");
 }
 
 /* "show statab <addr>": dump the ieee80211_node_table located at addr. */
 DB_SHOW_COMMAND(statab, db_show_statab)
 {
 	if (have_addr)
 		_db_show_node_table("",
 		    (const struct ieee80211_node_table *) addr);
 	else
 		db_printf("usage: show statab <addr>\n");
 }
 
 /*
  * "show vap[/amp] <addr>": dump the ieee80211vap located at addr.
  * Modifier 'm' adds mesh state, 'p' adds method pointers, 'a' enables both;
  * unrecognized modifier characters are ignored.
  */
 DB_SHOW_COMMAND(vap, db_show_vap)
 {
 	const char *opt;
 	int showmesh = 0, showprocs = 0;
 
 	if (!have_addr) {
 		db_printf("usage: show vap <addr>\n");
 		return;
 	}
 	for (opt = modif; *opt != '\0'; opt++) {
 		if (*opt == 'a' || *opt == 'm')
 			showmesh = 1;
 		if (*opt == 'a' || *opt == 'p')
 			showprocs = 1;
 	}
 	_db_show_vap((const struct ieee80211vap *) addr, showmesh, showprocs);
 }
 
 /*
  * "show com[/asmvp] <addr>": dump the ieee80211com located at addr.
  * Modifiers: 's' stations, 'm' mesh, 'v' vaps, 'p' method pointers,
  * 'a' all of the above; unrecognized modifier characters are ignored.
  */
 DB_SHOW_COMMAND(com, db_show_com)
 {
 	const char *opt;
 	int showprocs = 0, showvaps = 0, showsta = 0, showmesh = 0;
 
 	if (!have_addr) {
 		db_printf("usage: show com <addr>\n");
 		return;
 	}
 	for (opt = modif; *opt != '\0'; opt++) {
 		const int all = (*opt == 'a');
 
 		if (all || *opt == 's')
 			showsta = 1;
 		if (all || *opt == 'm')
 			showmesh = 1;
 		if (all || *opt == 'v')
 			showvaps = 1;
 		if (all || *opt == 'p')
 			showprocs = 1;
 	}
 
 	_db_show_com((const struct ieee80211com *) addr,
 	    showvaps, showsta, showmesh, showprocs);
 }
 
 /*
  * "show all vaps[/a]": visit every ieee80211com via
  * ieee80211_iterate_coms().  The 'a' modifier sets the flag handed to
  * _db_show_all_vaps() through the iterator argument.
  */
 DB_SHOW_ALL_COMMAND(vaps, db_show_all_vaps)
 {
 	const char *opt;
 	int showall = 0;
 
 	for (opt = modif; *opt != '\0'; opt++) {
 		if (*opt == 'a')
 			showall = 1;
 	}
 
 	ieee80211_iterate_coms(_db_show_all_vaps, &showall);
 }
 
 #ifdef IEEE80211_SUPPORT_MESH
 /*
  * Dump the ieee80211_mesh_state located at addr.  The command is declared
  * with DB_SHOW_ALL_COMMAND, so the usage text names it "show all mesh".
  */
 DB_SHOW_ALL_COMMAND(mesh, db_show_mesh)
 {
 	const struct ieee80211_mesh_state *ms;
 
 	if (!have_addr) {
 		db_printf("usage: show all mesh <addr>\n");
 		return;
 	}
 	ms = (const struct ieee80211_mesh_state *) addr;
 	_db_show_mesh(ms);
 }
 #endif /* IEEE80211_SUPPORT_MESH */
 
 /*
  * Print the transmit A-MPDU state in *tap, each line prefixed with sep.
  * ix is printed as the array index; the access category name is looked up
  * in ieee80211_wme_acnames[] from the entry's txa_tid.
  */
 static void
 _db_show_txampdu(const char *sep, int ix, const struct ieee80211_tx_ampdu *tap)
 {
 	db_printf("%stxampdu[%d]: %p flags %b %s\n",
 		sep, ix, tap, tap->txa_flags, IEEE80211_AGGR_BITS,
 		ieee80211_wme_acnames[TID_TO_WME_AC(tap->txa_tid)]);
 	db_printf("%s  token %u lastsample %d pkts %d avgpps %d qbytes %d qframes %d\n",
 		sep, tap->txa_token, tap->txa_lastsample, tap->txa_pkts,
 		tap->txa_avgpps, tap->txa_qbytes, tap->txa_qframes);
 	db_printf("%s  start %u seqpending %u wnd %u attempts %d nextrequest %d\n",
 		sep, tap->txa_start, tap->txa_seqpending, tap->txa_wnd,
 		tap->txa_attempts, tap->txa_nextrequest);
 	/* XXX timer */
 }
 
 static void
 _db_show_rxampdu(const char *sep, int ix, const struct ieee80211_rx_ampdu *rap)
 {
 	struct mbuf *m;
 	int i;
 
 	db_printf("%srxampdu[%d]: %p flags 0x%x tid %u\n",
 		sep, ix, rap, rap->rxa_flags, ix /*XXX */);
 	db_printf("%s  qbytes %d qframes %d seqstart %u start %u wnd %u\n",
 		sep, rap->rxa_qbytes, rap->rxa_qframes,
 		rap->rxa_seqstart, rap->rxa_start, rap->rxa_wnd);
 	db_printf("%s  age %d nframes %d\n", sep,
 		rap->rxa_age, rap->rxa_nframes);
 	for (i = 0; i < IEEE80211_AGGR_BAWMAX; i++)
 		if (mbufq_len(&rap->rxa_mq[i]) > 0) {
 			db_printf("%s  m[%2u:%4u] ", sep, i,
 			    IEEE80211_SEQ_ADD(rap->rxa_start, i));
 			STAILQ_FOREACH(m, &rap->rxa_mq[i].mq_head,
 			    m_stailqpkt) {
 				db_printf(" %p", m);
 			}
 			db_printf("\n");
 		}
 }
 
 /*
  * Dump the contents of a station (node) structure: identity and back
  * pointers, flags, information element pointers, sequence numbers, keys,
  * signal data, channel, HT/VHT parameters, any set-up A-MPDU sessions and,
  * when mesh support is compiled in, the mesh link state.
  */
 static void
 _db_show_sta(const struct ieee80211_node *ni)
 {
 	int i;
 
 	db_printf("STA: %p: mac %s refcnt %d\n", ni,
 		ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni));
 	db_printf("\tvap %p wdsvap %p ic %p table %p\n",
 		ni->ni_vap, ni->ni_wdsvap, ni->ni_ic, ni->ni_table);
 	db_printf("\tflags=%b\n", ni->ni_flags, IEEE80211_NODE_BITS);
 	db_printf("\tauthmode %u ath_flags 0x%x ath_defkeyix %u\n",
 		ni->ni_authmode, ni->ni_ath_flags, ni->ni_ath_defkeyix);
 	db_printf("\tassocid 0x%x txpower %u vlan %u\n",
 		ni->ni_associd, ni->ni_txpower, ni->ni_vlan);
 	db_printf("\tjointime %d (%lu secs) challenge %p\n",
 		ni->ni_jointime, (unsigned long)(time_uptime - ni->ni_jointime),
 		ni->ni_challenge);
 	db_printf("\ties: data %p len %d\n", ni->ni_ies.data, ni->ni_ies.len);
 	/*
 	 * The IE pointer list spans three lines inside one pair of brackets:
 	 * opened on the first line, closed on the last.
 	 */
 	db_printf("\t[wpa_ie %p rsn_ie %p wme_ie %p ath_ie %p\n",
 		ni->ni_ies.wpa_ie, ni->ni_ies.rsn_ie, ni->ni_ies.wme_ie,
 		ni->ni_ies.ath_ie);
 	db_printf("\t htcap_ie %p htinfo_ie %p\n",
 		ni->ni_ies.htcap_ie, ni->ni_ies.htinfo_ie);
 	db_printf("\t vhtcap_ie %p vhtopmode_ie %p vhtpwrenv_ie %p]\n",
 		ni->ni_ies.vhtcap_ie, ni->ni_ies.vhtopmode_ie,
 		ni->ni_ies.vhtpwrenv_ie);
 	if (ni->ni_flags & IEEE80211_NODE_QOS) {
 		/* Per-TID sequence state; all-zero entries are skipped. */
 		for (i = 0; i < WME_NUM_TID; i++) {
 			if (ni->ni_txseqs[i] || ni->ni_rxseqs[i])
 				db_printf("\t[%u] txseq %u rxseq %u fragno %u\n",
 				    i, ni->ni_txseqs[i],
 				    ni->ni_rxseqs[i] >> IEEE80211_SEQ_SEQ_SHIFT,
 				    ni->ni_rxseqs[i] & IEEE80211_SEQ_FRAG_MASK);
 		}
 	}
 
 	db_printf("\ttxseq %u rxseq %u fragno %u rxfragstamp %u\n",
 		ni->ni_txseqs[IEEE80211_NONQOS_TID],
 		ni->ni_rxseqs[IEEE80211_NONQOS_TID] >> IEEE80211_SEQ_SEQ_SHIFT,
 		ni->ni_rxseqs[IEEE80211_NONQOS_TID] & IEEE80211_SEQ_FRAG_MASK,
 		ni->ni_rxfragstamp);
 	db_printf("\trxfrag[0] %p rxfrag[1] %p rxfrag[2] %p\n",
 		ni->ni_rxfrag[0], ni->ni_rxfrag[1], ni->ni_rxfrag[2]);
 	_db_show_key("\tucastkey", 0, &ni->ni_ucastkey);
 	db_printf("\tavgrssi 0x%x (rssi %d) noise %d\n",
 		ni->ni_avgrssi, IEEE80211_RSSI_GET(ni->ni_avgrssi),
 		ni->ni_noise);
 	db_printf("\tintval %u capinfo %b\n",
 		ni->ni_intval, ni->ni_capinfo, IEEE80211_CAPINFO_BITS);
 	db_printf("\tbssid %s", ether_sprintf(ni->ni_bssid));
 	_db_show_ssid(" essid ", 0, ni->ni_esslen, ni->ni_essid);
 	db_printf("\n");
 	_db_show_channel("\tchannel", ni->ni_chan);
 	db_printf("\n");
 	db_printf("\terp %b dtim_period %u dtim_count %u\n",
 		ni->ni_erp, IEEE80211_ERP_BITS,
 		ni->ni_dtim_period, ni->ni_dtim_count);
 
 	db_printf("\thtcap %b htparam 0x%x htctlchan %u ht2ndchan %u\n",
 		ni->ni_htcap, IEEE80211_HTCAP_BITS,
 		ni->ni_htparam, ni->ni_htctlchan, ni->ni_ht2ndchan);
 	db_printf("\thtopmode 0x%x htstbc 0x%x chw %u\n",
 		ni->ni_htopmode, ni->ni_htstbc, ni->ni_chw);
 
 	/* XXX ampdu state */
 	/* TX sessions are shown only once set up; RX ones when any flag is set. */
 	for (i = 0; i < WME_NUM_TID; i++)
 		if (ni->ni_tx_ampdu[i].txa_flags & IEEE80211_AGGR_SETUP)
 			_db_show_txampdu("\t", i, &ni->ni_tx_ampdu[i]);
 	for (i = 0; i < WME_NUM_TID; i++)
 		if (ni->ni_rx_ampdu[i].rxa_flags)
 			_db_show_rxampdu("\t", i, &ni->ni_rx_ampdu[i]);
 
 	db_printf("\tinact %u inact_reload %u txrate %u\n",
 		ni->ni_inact, ni->ni_inact_reload, ni->ni_txrate);
 #ifdef IEEE80211_SUPPORT_MESH
 	_db_show_ssid("\tmeshid ", 0, ni->ni_meshidlen, ni->ni_meshid);
 	db_printf(" mlstate %b mllid 0x%x mlpid 0x%x mlrcnt %u mltval %u\n",
 	    ni->ni_mlstate, IEEE80211_MESH_MLSTATE_BITS,
 	    ni->ni_mllid, ni->ni_mlpid, ni->ni_mlrcnt, ni->ni_mltval);
 #endif
 
 	/* VHT state */
 	db_printf("\tvhtcap %b vht_basicmcs %#06x vht_pad2 %#06x\n",
 	    ni->ni_vhtcap, IEEE80211_VHTCAP_BITS,
 	    ni->ni_vht_basicmcs, ni->ni_vht_pad2);
 	db_printf("\tvht_mcsinfo: { rx_mcs_map %#06x rx_highest %#06x "
 	    "tx_mcs_map %#06x tx_highest %#06x }\n",
 	    ni->ni_vht_mcsinfo.rx_mcs_map, ni->ni_vht_mcsinfo.rx_highest,
 	    ni->ni_vht_mcsinfo.tx_mcs_map, ni->ni_vht_mcsinfo.tx_highest);
 	db_printf("\tvht_chan1/chan2 %u/%u vht_chanwidth %#04x\n",
 	    ni->ni_vht_chan1, ni->ni_vht_chan2, ni->ni_vht_chanwidth);
 	db_printf("\tvht_pad1 %#04x vht_spare { %#x %#x %#x %#x %#x %#x %#x %#x }\n",
 	    ni->ni_vht_pad1, ni->ni_vht_spare[0], ni->ni_vht_spare[1],
 	    ni->ni_vht_spare[2], ni->ni_vht_spare[3], ni->ni_vht_spare[4],
 	    ni->ni_vht_spare[5], ni->ni_vht_spare[6], ni->ni_vht_spare[7]);
 }
 
 #ifdef IEEE80211_SUPPORT_TDMA
 /*
  * Print the TDMA state in *ts, each line prefixed with sep.  The method
  * pointers are resolved to symbols only when showprocs is non-zero.
  */
 static void
 _db_show_tdma(const char *sep, const struct ieee80211_tdma_state *ts, int showprocs)
 {
 	db_printf("%stdma %p:\n", sep, ts);
 	db_printf("%s  version %u slot %u bintval %u peer %p\n", sep,
 	    ts->tdma_version, ts->tdma_slot, ts->tdma_bintval, ts->tdma_peer);
 	db_printf("%s  slotlen %u slotcnt %u", sep,
 	    ts->tdma_slotlen, ts->tdma_slotcnt);
 	db_printf(" inuse 0x%x active 0x%x count %d\n",
 	    ts->tdma_inuse[0], ts->tdma_active[0], ts->tdma_count);
 
 	if (!showprocs)
 		return;
 	DB_PRINTSYM(sep, "  tdma_newstate", ts->tdma_newstate);
 	DB_PRINTSYM(sep, "  tdma_recv_mgmt", ts->tdma_recv_mgmt);
 	DB_PRINTSYM(sep, "  tdma_opdetach", ts->tdma_opdetach);
 }
 #endif /* IEEE80211_SUPPORT_TDMA */
 
 /*
  * Dump the contents of a vap.
  *
  * showmesh:  when non-zero (and mesh support is compiled in) also dump the
  *            attached mesh state, if any.
  * showprocs: when non-zero, resolve and print the vap method pointers; the
  *            flag is also passed on to the TDMA dump.
  */
 static void
 _db_show_vap(const struct ieee80211vap *vap, int showmesh, int showprocs)
 {
 	const struct ieee80211com *ic = vap->iv_ic;
 	int i;
 
 	db_printf("VAP %p:", vap);
 	db_printf(" bss %p", vap->iv_bss);
 	db_printf(" myaddr %s", ether_sprintf(vap->iv_myaddr));
 	db_printf("\n");
 
 	db_printf("\topmode %s", ieee80211_opmode_name[vap->iv_opmode]);
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS)
 		db_printf("(%p)", vap->iv_mesh);
 #endif
 	db_printf(" state %s", ieee80211_state_name[vap->iv_state]);
-	db_printf(" ifp %p(%s)", vap->iv_ifp, vap->iv_ifp->if_xname);
+	db_printf(" ifp %p(%s)", vap->iv_ifp, if_name(vap->iv_ifp));
 	db_printf("\n");
 
 	db_printf("\tic %p", vap->iv_ic);
 	db_printf(" media %p", &vap->iv_media);
 	db_printf(" bpf_if %p", vap->iv_rawbpf);
 	db_printf(" mgtsend %p", &vap->iv_mgtsend);
 #if 0
 	struct sysctllog	*iv_sysctl;	/* dynamic sysctl context */
 #endif
 	db_printf("\n");
 	db_printf("\tdebug=%b\n", vap->iv_debug, IEEE80211_MSG_BITS);
 
 	db_printf("\tflags=%b\n", vap->iv_flags, IEEE80211_F_BITS);
 	db_printf("\tflags_ext=%b\n", vap->iv_flags_ext, IEEE80211_FEXT_BITS);
 	db_printf("\tflags_ht=%b\n", vap->iv_flags_ht, IEEE80211_FHT_BITS);
 	db_printf("\tflags_ven=%b\n", vap->iv_flags_ven, IEEE80211_FVEN_BITS);
 	db_printf("\tcaps=%b\n", vap->iv_caps, IEEE80211_C_BITS);
 	db_printf("\thtcaps=%b\n", vap->iv_htcaps, IEEE80211_C_HTCAP_BITS);
 	db_printf("\tvhtcaps=%b\n", vap->iv_vhtcaps, IEEE80211_VHTCAP_BITS);
 
 	_db_show_stats(&vap->iv_stats);
 
 	db_printf("\tinact_init %d", vap->iv_inact_init);
 	db_printf(" inact_auth %d", vap->iv_inact_auth);
 	db_printf(" inact_run %d", vap->iv_inact_run);
 	db_printf(" inact_probe %d", vap->iv_inact_probe);
 	db_printf("\n");
 
 	db_printf("\tdes_nssid %d", vap->iv_des_nssid);
 	/* Only the first desired SSID is shown. */
 	if (vap->iv_des_nssid)
 		_db_show_ssid(" des_ssid[%u] ", 0,
 		    vap->iv_des_ssid[0].len, vap->iv_des_ssid[0].ssid);
 	db_printf(" des_bssid %s", ether_sprintf(vap->iv_des_bssid));
 	db_printf("\n");
 	db_printf("\tdes_mode %d", vap->iv_des_mode);
 	_db_show_channel(" des_chan", vap->iv_des_chan);
 	db_printf("\n");
 #if 0
 	int			iv_nicknamelen;	/* XXX junk */
 	uint8_t			iv_nickname[IEEE80211_NWID_LEN];
 #endif
 	db_printf("\tbgscanidle %u", vap->iv_bgscanidle);
 	db_printf(" bgscanintvl %u", vap->iv_bgscanintvl);
 	db_printf(" scanvalid %u", vap->iv_scanvalid);
 	db_printf("\n");
 	db_printf("\tscanreq_duration %u", vap->iv_scanreq_duration);
 	db_printf(" scanreq_mindwell %u", vap->iv_scanreq_mindwell);
 	db_printf(" scanreq_maxdwell %u", vap->iv_scanreq_maxdwell);
 	db_printf("\n");
 	db_printf("\tscanreq_flags 0x%x", vap->iv_scanreq_flags);
 	db_printf(" scanreq_nssid %d", vap->iv_scanreq_nssid);
 	for (i = 0; i < vap->iv_scanreq_nssid; i++)
 		_db_show_ssid(" scanreq_ssid[%u]", i,
 		    vap->iv_scanreq_ssid[i].len, vap->iv_scanreq_ssid[i].ssid);
 	db_printf(" roaming %d", vap->iv_roaming);
 	db_printf("\n");
 	/* Roaming parameters, one line per PHY mode set in ic_modecaps. */
 	for (i = IEEE80211_MODE_11A; i < IEEE80211_MODE_MAX; i++)
 		if (isset(ic->ic_modecaps, i)) {
 			_db_show_roamparams("\troamparms[%s]",
 			    ieee80211_phymode_name[i], &vap->iv_roamparms[i]);
 			db_printf("\n");
 		}
 
 	db_printf("\tbmissthreshold %u", vap->iv_bmissthreshold);
 	/* Label matches the field printed (iv_bmiss_count). */
 	db_printf(" bmiss_count %u", vap->iv_bmiss_count);
 	db_printf(" bmiss_max %d", vap->iv_bmiss_max);
 	db_printf("\n");
 	db_printf("\tswbmiss_count %u", vap->iv_swbmiss_count);
 	db_printf(" swbmiss_period %u", vap->iv_swbmiss_period);
 	db_printf(" swbmiss %p", &vap->iv_swbmiss);
 	db_printf("\n");
 
 	db_printf("\tampdu_rxmax %d", vap->iv_ampdu_rxmax);
 	db_printf(" ampdu_density %d", vap->iv_ampdu_density);
 	db_printf(" ampdu_limit %d", vap->iv_ampdu_limit);
 	db_printf(" amsdu_limit %d", vap->iv_amsdu_limit);
 	db_printf("\n");
 
 	db_printf("\tmax_aid %u", vap->iv_max_aid);
 	db_printf(" aid_bitmap %p", vap->iv_aid_bitmap);
 	db_printf("\n");
 	db_printf("\tsta_assoc %u", vap->iv_sta_assoc);
 	db_printf(" ps_sta %u", vap->iv_ps_sta);
 	db_printf(" ps_pending %u", vap->iv_ps_pending);
 	db_printf(" tim_len %u", vap->iv_tim_len);
 	db_printf(" tim_bitmap %p", vap->iv_tim_bitmap);
 	db_printf("\n");
 	db_printf("\tdtim_period %u", vap->iv_dtim_period);
 	db_printf(" dtim_count %u", vap->iv_dtim_count);
 	db_printf(" set_tim %p", vap->iv_set_tim);
 	db_printf(" csa_count %d", vap->iv_csa_count);
 	db_printf("\n");
 
 	db_printf("\trtsthreshold %u", vap->iv_rtsthreshold);
 	db_printf(" fragthreshold %u", vap->iv_fragthreshold);
 	db_printf(" inact_timer %d", vap->iv_inact_timer);
 	db_printf("\n");
 	/* Transmit parameters, one line per PHY mode set in ic_modecaps. */
 	for (i = IEEE80211_MODE_11A; i < IEEE80211_MODE_MAX; i++)
 		if (isset(ic->ic_modecaps, i)) {
 			_db_show_txparams("\ttxparms[%s]",
 			    ieee80211_phymode_name[i], &vap->iv_txparms[i]);
 			db_printf("\n");
 		}
 
 	/* application-specified IE's to attach to mgt frames */
 	_db_show_appie("\tappie_beacon", vap->iv_appie_beacon);
 	_db_show_appie("\tappie_probereq", vap->iv_appie_probereq);
 	_db_show_appie("\tappie_proberesp", vap->iv_appie_proberesp);
 	_db_show_appie("\tappie_assocreq", vap->iv_appie_assocreq);
 	_db_show_appie("\tappie_assocresp", vap->iv_appie_assocresp);
 	_db_show_appie("\tappie_wpa", vap->iv_appie_wpa);
 	if (vap->iv_wpa_ie != NULL || vap->iv_rsn_ie != NULL) {
 		if (vap->iv_wpa_ie != NULL)
 			db_printf("\twpa_ie %p", vap->iv_wpa_ie);
 		if (vap->iv_rsn_ie != NULL)
 			db_printf("\trsn_ie %p", vap->iv_rsn_ie);
 		db_printf("\n");
 	}
 	db_printf("\tmax_keyix %u", vap->iv_max_keyix);
 	db_printf(" def_txkey %d", vap->iv_def_txkey);
 	db_printf("\n");
 	for (i = 0; i < IEEE80211_WEP_NKID; i++)
 		_db_show_key("\tnw_keys[%u]", i, &vap->iv_nw_keys[i]);
 
 	db_printf("\tauth %p(%s)", vap->iv_auth, vap->iv_auth->ia_name);
 	db_printf(" ec %p", vap->iv_ec);
 
 	db_printf(" acl %p", vap->iv_acl);
 	db_printf(" as %p", vap->iv_as);
 	db_printf("\n");
 #ifdef IEEE80211_SUPPORT_MESH
 	if (showmesh && vap->iv_mesh != NULL)
 		_db_show_mesh(vap->iv_mesh);
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 	if (vap->iv_tdma != NULL)
 		_db_show_tdma("\t", vap->iv_tdma, showprocs);
 #endif /* IEEE80211_SUPPORT_TDMA */
 
 	db_printf("\tsta_assoc %u", vap->iv_sta_assoc);
 	db_printf(" ht_sta_assoc %u", vap->iv_ht_sta_assoc);
 	db_printf(" ht40_sta_assoc %u", vap->iv_ht40_sta_assoc);
 	db_printf("\n");
 	/* Starts a new output line, so it is tab-indented like the others. */
 	db_printf("\tnonerpsta %u", vap->iv_nonerpsta);
 	db_printf(" longslotsta %u", vap->iv_longslotsta);
 	db_printf(" lastnonerp %d", vap->iv_lastnonerp);
 	db_printf(" lastnonht %d", vap->iv_lastnonht);
 	db_printf("\n");
 
 	if (showprocs) {
 		DB_PRINTSYM("\t", "iv_key_alloc", vap->iv_key_alloc);
 		DB_PRINTSYM("\t", "iv_key_delete", vap->iv_key_delete);
 		DB_PRINTSYM("\t", "iv_key_set", vap->iv_key_set);
 		DB_PRINTSYM("\t", "iv_key_update_begin", vap->iv_key_update_begin);
 		DB_PRINTSYM("\t", "iv_key_update_end", vap->iv_key_update_end);
 		DB_PRINTSYM("\t", "iv_opdetach", vap->iv_opdetach);
 		DB_PRINTSYM("\t", "iv_input", vap->iv_input);
 		DB_PRINTSYM("\t", "iv_recv_mgmt", vap->iv_recv_mgmt);
 		DB_PRINTSYM("\t", "iv_deliver_data", vap->iv_deliver_data);
 		DB_PRINTSYM("\t", "iv_bmiss", vap->iv_bmiss);
 		DB_PRINTSYM("\t", "iv_reset", vap->iv_reset);
 		DB_PRINTSYM("\t", "iv_update_beacon", vap->iv_update_beacon);
 		DB_PRINTSYM("\t", "iv_newstate", vap->iv_newstate);
 		DB_PRINTSYM("\t", "iv_output", vap->iv_output);
 	}
 }
 
 /*
  * Dump the contents of an ieee80211com.
  *
  * showvaps:  also dump every vap on ic_vaps via _db_show_vap().
  * showsta:   also dump the station table and every node on it; when zero
  *            only the address of the table is printed.
  * showmesh:  passed through to _db_show_vap().
  * showprocs: resolve and print the driver method pointers; also passed
  *            through to _db_show_vap().
  */
 static void
 _db_show_com(const struct ieee80211com *ic, int showvaps, int showsta,
     int showmesh, int showprocs)
 {
 	struct ieee80211vap *vap;
 
 	db_printf("COM: %p:", ic);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
-		db_printf(" %s(%p)", vap->iv_ifp->if_xname, vap);
+		db_printf(" %s(%p)", if_name(vap->iv_ifp), vap);
 	db_printf("\n");
 	db_printf("\tsoftc %p", ic->ic_softc);
 	db_printf("\tname %s", ic->ic_name);
 	db_printf(" comlock %p", &ic->ic_comlock);
 	db_printf(" txlock %p", &ic->ic_txlock);
 	db_printf(" fflock %p", &ic->ic_fflock);
 	db_printf("\n");
 	db_printf("\theadroom %d", ic->ic_headroom);
 	db_printf(" phytype %d", ic->ic_phytype);
 	db_printf(" opmode %s", ieee80211_opmode_name[ic->ic_opmode]);
 	db_printf("\n");
 	/* Starts a new output line, so it is tab-indented like the others. */
 	db_printf("\tinact %p", &ic->ic_inact);
 	db_printf("\n");
 
 	db_printf("\tflags=%b\n", ic->ic_flags, IEEE80211_F_BITS);
 	db_printf("\tflags_ext=%b\n", ic->ic_flags_ext, IEEE80211_FEXT_BITS);
 	db_printf("\tflags_ht=%b\n", ic->ic_flags_ht, IEEE80211_FHT_BITS);
 	db_printf("\tflags_ven=%b\n", ic->ic_flags_ven, IEEE80211_FVEN_BITS);
 	db_printf("\tcaps=%b\n", ic->ic_caps, IEEE80211_C_BITS);
 	db_printf("\tcryptocaps=%b\n",
 	    ic->ic_cryptocaps, IEEE80211_CRYPTO_BITS);
 	db_printf("\thtcaps=%b\n", ic->ic_htcaps, IEEE80211_HTCAP_BITS);
 	db_printf("\tvhtcaps=%b\n", ic->ic_vhtcaps, IEEE80211_VHTCAP_BITS);
 
 #if 0
 	uint8_t			ic_modecaps[2];	/* set of mode capabilities */
 #endif
 	db_printf("\tcurmode %u", ic->ic_curmode);
 	db_printf(" promisc %u", ic->ic_promisc);
 	db_printf(" allmulti %u", ic->ic_allmulti);
 	db_printf(" nrunning %u", ic->ic_nrunning);
 	db_printf("\n");
 	db_printf("\tbintval %u", ic->ic_bintval);
 	db_printf(" lintval %u", ic->ic_lintval);
 	db_printf(" holdover %u", ic->ic_holdover);
 	db_printf(" txpowlimit %u", ic->ic_txpowlimit);
 	db_printf("\n");
 #if 0
 	struct ieee80211_rateset ic_sup_rates[IEEE80211_MODE_MAX];
 #endif
 	/*
 	 * Channel state:
 	 *
 	 * ic_channels is the set of available channels for the device;
 	 *    it is setup by the driver
 	 * ic_nchans is the number of valid entries in ic_channels
 	 * ic_chan_avail is a bit vector of these channels used to check
 	 *    whether a channel is available w/o searching the channel table.
 	 * ic_chan_active is a (potentially) constrained subset of
 	 *    ic_chan_avail that reflects any mode setting or user-specified
 	 *    limit on the set of channels to use/scan
 	 * ic_curchan is the current channel the device is set to; it may
 	 *    be different from ic_bsschan when we are off-channel scanning
 	 *    or otherwise doing background work
 	 * ic_bsschan is the channel selected for operation; it may
 	 *    be undefined (IEEE80211_CHAN_ANYC)
 	 * ic_prevchan is a cached ``previous channel'' used to optimize
 	 *    lookups when switching back+forth between two channels
 	 *    (e.g. for dynamic turbo)
 	 */
 	db_printf("\tnchans %d", ic->ic_nchans);
 #if 0
 	struct ieee80211_channel ic_channels[IEEE80211_CHAN_MAX];
 	uint8_t			ic_chan_avail[IEEE80211_CHAN_BYTES];
 	uint8_t			ic_chan_active[IEEE80211_CHAN_BYTES];
 	uint8_t			ic_chan_scan[IEEE80211_CHAN_BYTES];
 #endif
 	db_printf("\n");
 	_db_show_channel("\tcurchan", ic->ic_curchan);
 	db_printf("\n");
 	_db_show_channel("\tbsschan", ic->ic_bsschan);
 	db_printf("\n");
 	_db_show_channel("\tprevchan", ic->ic_prevchan);
 	db_printf("\n");
 	db_printf("\tregdomain %p", &ic->ic_regdomain);
 	db_printf("\n");
 
 	_db_show_channel("\tcsa_newchan", ic->ic_csa_newchan);
 	db_printf(" csa_count %d", ic->ic_csa_count);
 	/* Leading space keeps this field apart from the csa_count value. */
 	db_printf(" dfs %p", &ic->ic_dfs);
 	db_printf("\n");
 
 	db_printf("\tscan %p", ic->ic_scan);
 	db_printf(" lastdata %d", ic->ic_lastdata);
 	db_printf(" lastscan %d", ic->ic_lastscan);
 	db_printf("\n");
 
 	db_printf("\tmax_keyix %d", ic->ic_max_keyix);
 	db_printf(" hash_key 0x%x", ic->ic_hash_key);
 	db_printf(" wme %p", &ic->ic_wme);
 	/* The table address is enough when the table itself is not dumped. */
 	if (!showsta)
 		db_printf(" sta %p", &ic->ic_sta);
 	db_printf("\n");
 	db_printf("\tstageq@%p:\n", &ic->ic_stageq);
 	_db_show_ageq("\t", &ic->ic_stageq);
 	if (showsta)
 		_db_show_node_table("\t", &ic->ic_sta);
 
 	db_printf("\tprotmode %d", ic->ic_protmode);
 	db_printf("\tcurhtprotmode 0x%x", ic->ic_curhtprotmode);
 	db_printf(" htprotmode %d", ic->ic_htprotmode);
 	db_printf("\n");
 
 	db_printf("\tsuperg %p\n", ic->ic_superg);
 
 	db_printf("\tmontaps %d th %p txchan %p rh %p rxchan %p\n",
 	    ic->ic_montaps, ic->ic_th, ic->ic_txchan, ic->ic_rh, ic->ic_rxchan);
 
 	if (showprocs) {
 		DB_PRINTSYM("\t", "ic_vap_create", ic->ic_vap_create);
 		DB_PRINTSYM("\t", "ic_vap_delete", ic->ic_vap_delete);
 #if 0
 		/* operating mode attachment */
 		ieee80211vap_attach	ic_vattach[IEEE80211_OPMODE_MAX];
 #endif
 		DB_PRINTSYM("\t", "ic_newassoc", ic->ic_newassoc);
 		DB_PRINTSYM("\t", "ic_getradiocaps", ic->ic_getradiocaps);
 		DB_PRINTSYM("\t", "ic_setregdomain", ic->ic_setregdomain);
 		DB_PRINTSYM("\t", "ic_send_mgmt", ic->ic_send_mgmt);
 		DB_PRINTSYM("\t", "ic_raw_xmit", ic->ic_raw_xmit);
 		DB_PRINTSYM("\t", "ic_updateslot", ic->ic_updateslot);
 		DB_PRINTSYM("\t", "ic_update_mcast", ic->ic_update_mcast);
 		DB_PRINTSYM("\t", "ic_update_promisc", ic->ic_update_promisc);
 		DB_PRINTSYM("\t", "ic_node_alloc", ic->ic_node_alloc);
 		DB_PRINTSYM("\t", "ic_node_free", ic->ic_node_free);
 		DB_PRINTSYM("\t", "ic_node_cleanup", ic->ic_node_cleanup);
 		DB_PRINTSYM("\t", "ic_node_getrssi", ic->ic_node_getrssi);
 		DB_PRINTSYM("\t", "ic_node_getsignal", ic->ic_node_getsignal);
 		DB_PRINTSYM("\t", "ic_node_getmimoinfo", ic->ic_node_getmimoinfo);
 		DB_PRINTSYM("\t", "ic_scan_start", ic->ic_scan_start);
 		DB_PRINTSYM("\t", "ic_scan_end", ic->ic_scan_end);
 		DB_PRINTSYM("\t", "ic_set_channel", ic->ic_set_channel);
 		DB_PRINTSYM("\t", "ic_scan_curchan", ic->ic_scan_curchan);
 		DB_PRINTSYM("\t", "ic_scan_mindwell", ic->ic_scan_mindwell);
 		DB_PRINTSYM("\t", "ic_recv_action", ic->ic_recv_action);
 		DB_PRINTSYM("\t", "ic_send_action", ic->ic_send_action);
 		DB_PRINTSYM("\t", "ic_addba_request", ic->ic_addba_request);
 		DB_PRINTSYM("\t", "ic_addba_response", ic->ic_addba_response);
 		DB_PRINTSYM("\t", "ic_addba_stop", ic->ic_addba_stop);
 	}
 	if (showvaps && !TAILQ_EMPTY(&ic->ic_vaps)) {
 		db_printf("\n");
 		TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 			_db_show_vap(vap, showmesh, showprocs);
 	}
 	if (showsta && !TAILQ_EMPTY(&ic->ic_sta.nt_node)) {
 		const struct ieee80211_node_table *nt = &ic->ic_sta;
 		const struct ieee80211_node *ni;
 
 		TAILQ_FOREACH(ni, &nt->nt_node, ni_list) {
 			db_printf("\n");
 			_db_show_sta(ni);
 		}
 	}
 }
 
 /*
  * Per-com callback for "show all vaps".  arg points at an int: when it is
  * non-zero the com is dumped with every option enabled, otherwise a single
  * summary line naming the com and each of its vaps is printed.
  */
 static void
 _db_show_all_vaps(void *arg, struct ieee80211com *ic)
 {
 	const int verbose = *(int *)arg;
 
 	if (verbose) {
 		_db_show_com(ic, 1, 1, 1, 1);
 	} else {
 		const struct ieee80211vap *vap;
 
 		db_printf("%s: com %p vaps:", ic->ic_name, ic);
 		TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
-			db_printf(" %s(%p)", vap->iv_ifp->if_xname, vap);
+			db_printf(" %s(%p)", if_name(vap->iv_ifp), vap);
 		db_printf("\n");
 	}
 }
 
 /*
  * Print a node table header (name, lock address, inact_init, key index map
  * size and address) followed by one line for every non-NULL entry in the
  * key index map.  Every line is prefixed with tag.
  */
 static void
 _db_show_node_table(const char *tag, const struct ieee80211_node_table *nt)
 {
 	int i;
 
 	db_printf("%s%s@%p:\n", tag, nt->nt_name, nt);
 	db_printf("%s nodelock %p", tag, &nt->nt_nodelock);
 	/* End the line here; the next printf begins with the tag prefix. */
 	db_printf(" inact_init %d\n", nt->nt_inact_init);
 	db_printf("%s keyixmax %d keyixmap %p\n",
 	    tag, nt->nt_keyixmax, nt->nt_keyixmap);
 	for (i = 0; i < nt->nt_keyixmax; i++) {
 		const struct ieee80211_node *ni = nt->nt_keyixmap[i];
 		if (ni != NULL)
 			db_printf("%s [%3u] %p %s\n", tag, i, ni,
 			    ether_sprintf(ni->ni_macaddr));
 	}
 }
 
 /*
  * Print "tag " followed by a description of channel c: "<NULL>" for a
  * NULL pointer, "<ANY>" for IEEE80211_CHAN_ANYC, otherwise the bracketed
  * channel fields.  No trailing newline is emitted.
  */
 static void
 _db_show_channel(const char *tag, const struct ieee80211_channel *c)
 {
 	db_printf("%s ", tag);
 
 	if (c == NULL) {
 		db_printf("<NULL>");
 		return;
 	}
 	if (c == IEEE80211_CHAN_ANYC) {
 		db_printf("<ANY>");
 		return;
 	}
 	db_printf("[%u (%u) flags=%b maxreg %d maxpow %d minpow %d state 0x%x extieee %u]",
 	    c->ic_freq, c->ic_ieee,
 	    c->ic_flags, IEEE80211_CHAN_BITS,
 	    c->ic_maxregpower, c->ic_maxpower, c->ic_minpower,
 	    c->ic_state, c->ic_extieee);
 }
 
 /*
  * Print an SSID.  tag is used as a printf format with ix as its argument.
  * At most IEEE80211_NWID_LEN bytes are shown: as a quoted string when
  * every byte lies in the range ' '..0x7e, otherwise as "0x" plus hex.
  */
 static void
 _db_show_ssid(const char *tag, int ix, int len, const uint8_t *ssid)
 {
 	int n;
 
 	db_printf(tag, ix);
 
 	if (len > IEEE80211_NWID_LEN)
 		len = IEEE80211_NWID_LEN;
 
 	/* Count the leading run of printable bytes. */
 	n = 0;
 	while (n < len && ssid[n] >= ' ' && ssid[n] <= 0x7e)
 		n++;
 
 	if (n != len) {
 		/* Hit a non-printable byte: fall back to hex. */
 		db_printf("0x");
 		for (n = 0; n < len; n++)
 			db_printf("%02x", ssid[n]);
 		return;
 	}
 	db_printf("\"");
 	for (n = 0; n < len; n++)
 		db_printf("%c", ssid[n]);
 	db_printf("\"");
 }
 
 /*
  * Print "tag [0x<hex bytes>]" for an application IE blob followed by a
  * newline.  Nothing at all is printed when ie is NULL.
  */
 static void
 _db_show_appie(const char *tag, const struct ieee80211_appie *ie)
 {
 	const uint8_t *cur, *end;
 
 	if (ie == NULL)
 		return;
 
 	db_printf("%s [0x", tag);
 	end = ie->ie_data + ie->ie_len;
 	for (cur = ie->ie_data; cur < end; cur++)
 		db_printf("%02x", *cur);
 	db_printf("]\n");
 }
 
 /*
  * Print one line describing key wk.  tag is used as a printf format with
  * ix as its argument.  The cipher name, key index and key size come first;
  * the key bytes, rsc/tsc counters and flags follow only when the first
  * keylen bytes of the key are not all zero.
  */
 static void
 _db_show_key(const char *tag, int ix, const struct ieee80211_key *wk)
 {
 	static const uint8_t zerodata[IEEE80211_KEYBUF_SIZE];
 	const struct ieee80211_cipher *cip = wk->wk_cipher;
 	const char *ciphername = NULL;
 	int keylen = wk->wk_keylen;
 	int i;
 
 	db_printf(tag, ix);
 	/*
 	 * WEP and unknown ciphers use their own format; the rest share the
 	 * "NAME keyix:bits" layout printed after the switch.
 	 */
 	switch (cip->ic_cipher) {
 	case IEEE80211_CIPHER_WEP:
 		/* compatibility */
 		db_printf(" wepkey %u:%s", wk->wk_keyix,
 		    keylen <= 5 ? "40-bit" :
 		    keylen <= 13 ? "104-bit" : "128-bit");
 		break;
 	case IEEE80211_CIPHER_TKIP:
 		if (keylen > 128/8)
 			keylen -= 128/8;	/* ignore MIC for now */
 		ciphername = "TKIP";
 		break;
 	case IEEE80211_CIPHER_AES_OCB:
 		ciphername = "AES-OCB";
 		break;
 	case IEEE80211_CIPHER_AES_CCM:
 		ciphername = "AES-CCM";
 		break;
 	case IEEE80211_CIPHER_CKIP:
 		ciphername = "CKIP";
 		break;
 	case IEEE80211_CIPHER_NONE:
 		ciphername = "NULL";
 		break;
 	default:
 		db_printf(" UNKNOWN (0x%x) %u:%u-bit",
 			cip->ic_cipher, wk->wk_keyix, 8*keylen);
 		break;
 	}
 	if (ciphername != NULL)
 		db_printf(" %s %u:%u-bit", ciphername, wk->wk_keyix, 8*keylen);
 
 	if (wk->wk_rxkeyix != wk->wk_keyix)
 		db_printf(" rxkeyix %u", wk->wk_rxkeyix);
 
 	/* An all-zero key gets no further detail. */
 	if (memcmp(wk->wk_key, zerodata, keylen) == 0) {
 		db_printf("\n");
 		return;
 	}
 
 	db_printf(" <");
 	for (i = 0; i < keylen; i++)
 		db_printf("%02x", wk->wk_key[i]);
 	db_printf(">");
 	/* The counters are shown for non-WEP ciphers and only when non-zero. */
 	if (cip->ic_cipher != IEEE80211_CIPHER_WEP) {
 		if (wk->wk_keyrsc[IEEE80211_NONQOS_TID] != 0)
 			db_printf(" rsc %ju", (uintmax_t)wk->wk_keyrsc[IEEE80211_NONQOS_TID]);
 		if (wk->wk_keytsc != 0)
 			db_printf(" tsc %ju", (uintmax_t)wk->wk_keytsc);
 	}
 	db_printf(" flags=%b", wk->wk_flags, IEEE80211_KEY_BITS);
 	db_printf("\n");
 }
 
 /*
  * Print " tag <rate>" for rate value v.  The checks are made in this
  * order: IEEE80211_FIXED_RATE_NONE prints "<none>", 11 prints "5.5", a
  * value with IEEE80211_RATE_MCS set prints "MCS<n>", anything else v/2.
  */
 static void
 printrate(const char *tag, int v)
 {
 	if (v == IEEE80211_FIXED_RATE_NONE) {
 		db_printf(" %s <none>", tag);
 		return;
 	}
 	if (v == 11) {
 		db_printf(" %s 5.5", tag);
 		return;
 	}
 	if ((v & IEEE80211_RATE_MCS) != 0) {
 		db_printf(" %s MCS%d", tag, v &~ IEEE80211_RATE_MCS);
 		return;
 	}
 	db_printf(" %s %d", tag, v/2);
 }
 
 /*
  * Print roaming parameters.  tag is used as a printf format with arg as
  * its argument.  rssi is shown as rssi/2, with ".5" appended when the low
  * bit is set, followed by the rate.
  */
 static void
 _db_show_roamparams(const char *tag, const void *arg,
     const struct ieee80211_roamparam *rp)
 {
 
 	db_printf(tag, arg);
 	db_printf(" rssi %u%s", rp->rssi/2, (rp->rssi & 1) ? ".5" : "");
 	printrate("rate", rp->rate);
 }
 
 /*
  * Print transmit parameters.  tag is used as a printf format with arg as
  * its argument; the unicast, multicast and management rates follow, then
  * the retry limit.
  */
 static void
 _db_show_txparams(const char *tag, const void *arg,
     const struct ieee80211_txparam *tp)
 {
 	const struct {
 		const char *label;
 		int rate;
 	} rates[] = {
 		{ "ucastrate", tp->ucastrate },
 		{ "mcastrate", tp->mcastrate },
 		{ "mgmtrate", tp->mgmtrate },
 	};
 	size_t k;
 
 	db_printf(tag, arg);
 	for (k = 0; k < sizeof(rates) / sizeof(rates[0]); k++)
 		printrate(rates[k].label, rates[k].rate);
 	db_printf(" maxretry %d", tp->maxretry);
 }
 
 /*
  * Print the age queue header (lock address, lengths, drops, head and tail)
  * and then one line per mbuf found by following m_nextpkt from the head.
  * Every line is prefixed with tag.
  */
 static void
 _db_show_ageq(const char *tag, const struct ieee80211_ageq *q)
 {
 	const struct mbuf *pkt;
 
 	db_printf("%s lock %p len %d maxlen %d drops %d head %p tail %p\n",
 	    tag, &q->aq_lock, q->aq_len, q->aq_maxlen, q->aq_drops,
 	    q->aq_head, q->aq_tail);
 
 	pkt = q->aq_head;
 	while (pkt != NULL) {
 		/* XXX could be either TX or RX but is mostly TX */
 		db_printf("%s %p (len %d, %b)\n", tag, pkt, pkt->m_len,
 		    pkt->m_flags, IEEE80211_MBUF_TX_FLAG_BITS);
 		pkt = pkt->m_nextpkt;
 	}
 }
 
 /*
  * Statistics dump hook, called from _db_show_vap().  The body is empty,
  * so nothing is printed for the statistics block at present.
  */
 static void
 _db_show_stats(const struct ieee80211_stats *is)
 {
 }
 
 #ifdef IEEE80211_SUPPORT_MESH
 /*
  * Print the mesh state: mesh id, sequence/ttl/flags, then one numbered
  * entry for every route on ms_routes.  The lifetime column is the value
  * returned by ieee80211_mesh_rt_update(rt, 0).
  */
 static void
 _db_show_mesh(const struct ieee80211_mesh_state *ms)
 {
 	struct ieee80211_mesh_route *rt;
 	int entry = 0;
 
 	_db_show_ssid(" meshid ", 0, ms->ms_idlen, ms->ms_id);
 	db_printf("nextseq %u ttl %u flags 0x%x\n", ms->ms_seq,
 	    ms->ms_ttl, ms->ms_flags);
 	db_printf("routing table:\n");
 	TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 		db_printf("entry %d:\tdest: %6D nexthop: %6D metric: %u",
 		    entry, rt->rt_dest, ":", rt->rt_nexthop, ":",
 		    rt->rt_metric);
 
 		db_printf("\tlifetime: %u lastseq: %u priv: %p\n",
 		    ieee80211_mesh_rt_update(rt, 0),
 		    rt->rt_lastmseq, rt->rt_priv);
 		entry++;
 	}
 }
 #endif /* IEEE80211_SUPPORT_MESH */
 #endif /* DDB */
diff --git a/sys/net80211/ieee80211_freebsd.c b/sys/net80211/ieee80211_freebsd.c
index 7158ada291ab..93b01af1d222 100644
--- a/sys/net80211/ieee80211_freebsd.c
+++ b/sys/net80211/ieee80211_freebsd.c
@@ -1,1194 +1,1195 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 support (FreeBSD-specific code)
  */
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>   
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <sys/socket.h>
 
 #include <net/bpf.h>
 #include <net/debugnet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_clone.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_input.h>
 
 DEBUGNET_DEFINE(ieee80211);
 SYSCTL_NODE(_net, OID_AUTO, wlan, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "IEEE 80211 parameters");
 
 #ifdef IEEE80211_DEBUG
 static int	ieee80211_debug = 0;
 SYSCTL_INT(_net_wlan, OID_AUTO, debug, CTLFLAG_RW, &ieee80211_debug,
 	    0, "debugging printfs");
 #endif
 
 static const char wlanname[] = "wlan";
 static struct if_clone *wlan_cloner;
 
 /*
  * priv(9) NET80211 checks.
  * Return 0 if operation is allowed, E* (usually EPERM) otherwise.
  */
 /*
  * Check whether the current thread holds PRIV_NET80211_VAP_GETKEY.
  * All three arguments are unused.
  */
 int
 ieee80211_priv_check_vap_getkey(u_long cmd __unused,
      struct ieee80211vap *vap __unused, struct ifnet *ifp __unused)
 {
 	int error;
 
 	error = priv_check(curthread, PRIV_NET80211_VAP_GETKEY);
 	return (error);
 }
 
 /*
  * Check whether the current thread holds PRIV_NET80211_VAP_MANAGE.
  * All three arguments are unused.
  */
 int
 ieee80211_priv_check_vap_manage(u_long cmd __unused,
      struct ieee80211vap *vap __unused, struct ifnet *ifp __unused)
 {
 	int error;
 
 	error = priv_check(curthread, PRIV_NET80211_VAP_MANAGE);
 	return (error);
 }
 
 /*
  * Check whether the current thread holds PRIV_NET80211_VAP_SETMAC.
  * All three arguments are unused.
  */
 int
 ieee80211_priv_check_vap_setmac(u_long cmd __unused,
      struct ieee80211vap *vap __unused, struct ifnet *ifp __unused)
 {
 	int error;
 
 	error = priv_check(curthread, PRIV_NET80211_VAP_SETMAC);
 	return (error);
 }
 
 /*
  * Check whether the current thread holds PRIV_NET80211_CREATE_VAP.
  * All three arguments are unused.
  */
 int
 ieee80211_priv_check_create_vap(u_long cmd __unused,
     struct ieee80211vap *vap __unused, struct ifnet *ifp __unused)
 {
 	int error;
 
 	error = priv_check(curthread, PRIV_NET80211_CREATE_VAP);
 	return (error);
 }
 
 /*
  * Cloner "create" callback for wlan interfaces.
  *
  * Copies the clone parameters in from "ifd", looks up the parent
  * device named there, validates the requested operating mode and
  * flags against the parent's capabilities, and asks the parent's
  * ic_vap_create method to build the vap.  On success the vap's ifnet
  * is stored in *ifpp and 0 is returned.
  *
  * Errors: the privilege check or ifc_copyin() result, ENXIO when the
  * parent is not found, EINVAL for an out-of-range opmode, EOPNOTSUPP
  * for an unsupported opmode or TDMA request, EIO when the driver
  * returns no vap.
  */
 static int
 wlan_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct ieee80211_clone_params cp;
 	struct ieee80211vap *vap;
 	struct ieee80211com *ic;
 	int error;
 
 	error = ieee80211_priv_check_create_vap(0, NULL, NULL);
 	if (error)
 		return error;
 
 	error = ifc_copyin(ifd, &cp, sizeof(cp));
 	if (error)
 		return error;
 	ic = ieee80211_find_com(cp.icp_parent);
 	if (ic == NULL)
 		return ENXIO;
 	/* Range-check the opmode before it indexes the tables below. */
 	if (cp.icp_opmode >= IEEE80211_OPMODE_MAX) {
 		ic_printf(ic, "%s: invalid opmode %d\n", __func__,
 		    cp.icp_opmode);
 		return EINVAL;
 	}
 	if ((ic->ic_caps & ieee80211_opcap[cp.icp_opmode]) == 0) {
 		ic_printf(ic, "%s mode not supported\n",
 		    ieee80211_opmode_name[cp.icp_opmode]);
 		return EOPNOTSUPP;
 	}
 	/*
 	 * A TDMA request is rejected when the parent lacks the TDMA
 	 * capability, or unconditionally when TDMA support is not
 	 * compiled in.
 	 */
 	if ((cp.icp_flags & IEEE80211_CLONE_TDMA) &&
 #ifdef IEEE80211_SUPPORT_TDMA
 	    (ic->ic_caps & IEEE80211_C_TDMA) == 0
 #else
 	    (1)
 #endif
 	) {
 		ic_printf(ic, "TDMA not supported\n");
 		return EOPNOTSUPP;
 	}
 	/* Use the caller's MAC address only if one was supplied. */
 	vap = ic->ic_vap_create(ic, wlanname, ifd->unit,
 			cp.icp_opmode, cp.icp_flags, cp.icp_bssid,
 			cp.icp_flags & IEEE80211_CLONE_MACADDR ?
 			    cp.icp_macaddr : ic->ic_macaddr);
 
 	if (vap == NULL)
 		return (EIO);
 
 #ifdef DEBUGNET
 	if (ic->ic_debugnet_meth != NULL)
 		DEBUGNET_SET(vap->iv_ifp, ieee80211);
 #endif
 	*ifpp = vap->iv_ifp;
 
 	return (0);
 }
 
 /*
  * Cloner "destroy" callback: hand the vap stored in the ifnet's softc
  * to its parent device's ic_vap_delete method.  Always returns 0.
  */
 static int
 wlan_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	struct ieee80211vap *vap;
 	struct ieee80211com *parent;
 
 	vap = ifp->if_softc;
 	parent = vap->iv_ic;
 	parent->ic_vap_delete(vap);
 
 	return (0);
 }
 
 /*
  * Destroy a vap by destroying its ifnet through the wlan cloner, with
  * the current vnet switched to the one the ifnet belongs to for the
  * duration of the call.
  */
 void
 ieee80211_vap_destroy(struct ieee80211vap *vap)
 {
 	CURVNET_SET(vap->iv_ifp->if_vnet);
 	if_clone_destroyif(wlan_cloner, vap->iv_ifp);
 	CURVNET_RESTORE();
 }
 
 /*
  * Sysctl handler for an int stored in ticks (arg1) but presented to
  * the user in milliseconds.  Reads convert ticks to msecs; writes
  * convert the new msecs value back to ticks.
  */
 int
 ieee80211_sysctl_msecs_ticks(SYSCTL_HANDLER_ARGS)
 {
 	int *ticksp = (int *)arg1;
 	int error, msecs;
 
 	msecs = ticks_to_msecs(*ticksp);
 	error = sysctl_handle_int(oidp, &msecs, 0, req);
 	/* Done on error or when no new value was supplied. */
 	if (error || !req->newptr)
 		return (error);
 
 	*ticksp = msecs_to_ticks(msecs);
 	return (0);
 }
 
 /*
  * Sysctl handler for an inactivity counter (arg1).  The stored value
  * is multiplied by IEEE80211_INACT_WAIT when read and a new value is
  * divided by IEEE80211_INACT_WAIT before being stored.
  */
 static int
 ieee80211_sysctl_inact(SYSCTL_HANDLER_ARGS)
 {
 	int *countp = (int *)arg1;
 	int error, inact;
 
 	inact = (*countp) * IEEE80211_INACT_WAIT;
 	error = sysctl_handle_int(oidp, &inact, 0, req);
 	/* Done on error or when no new value was supplied. */
 	if (error || !req->newptr)
 		return (error);
 
 	*countp = inact / IEEE80211_INACT_WAIT;
 	return (0);
 }
 
 /*
  * Read-only sysctl handler that returns the name (ic_name) of the
  * device passed as arg1.
  */
 static int
 ieee80211_sysctl_parent(SYSCTL_HANDLER_ARGS)
 {
 	struct ieee80211com *ic = arg1;
 
 	return SYSCTL_OUT_STR(req, ic->ic_name);
 }
 
 static int
 ieee80211_sysctl_radar(SYSCTL_HANDLER_ARGS)
 {
 	struct ieee80211com *ic = arg1;
 	int t = 0, error;
 
 	error = sysctl_handle_int(oidp, &t, 0, req);
 	if (error || !req->newptr)
 		return error;
 	IEEE80211_LOCK(ic);
 	ieee80211_dfs_notify_radar(ic, ic->ic_curchan);
 	IEEE80211_UNLOCK(ic);
 	return 0;
 }
 
 /*
  * For now, just restart everything.
  *
  * Later on, it'd be nice to have a separate VAP restart to
  * full-device restart.
  */
 static int
 ieee80211_sysctl_vap_restart(SYSCTL_HANDLER_ARGS)
 {
 	struct ieee80211vap *vap = arg1;
 	int t = 0, error;
 
 	error = sysctl_handle_int(oidp, &t, 0, req);
 	if (error || !req->newptr)
 		return error;
 
 	ieee80211_restart_all(vap->iv_ic);
 	return 0;
 }
 
 /*
  * Per-device sysctl attach hook; intentionally empty here.
  */
 void
 ieee80211_sysctl_attach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Per-device sysctl detach hook; intentionally empty here.
  */
 void
 ieee80211_sysctl_detach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Create the per-vap sysctl tree under net.wlan.<unit>.
  *
  * A sysctl context is allocated without sleeping; if that fails a
  * message is printed and the vap gets no sysctl nodes.  Otherwise the
  * nodes are added and the context and top-level oid are recorded in
  * vap->iv_sysctl and vap->iv_oid.
  */
 void
 ieee80211_sysctl_vattach(struct ieee80211vap *vap)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *oid;
 	char num[14];			/* sufficient for 32 bits */
 
 	ctx = (struct sysctl_ctx_list *) IEEE80211_MALLOC(sizeof(struct sysctl_ctx_list),
 		M_DEVBUF, IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (ctx == NULL) {
 		if_printf(ifp, "%s: cannot allocate sysctl context!\n",
 			__func__);
 		return;
 	}
 	sysctl_ctx_init(ctx);
 	/* The node is named after the interface's unit number. */
 	snprintf(num, sizeof(num), "%u", ifp->if_dunit);
 	oid = SYSCTL_ADD_NODE(ctx, &SYSCTL_NODE_CHILDREN(_net, wlan),
 	    OID_AUTO, num, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "%parent", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
 	    vap->iv_ic, 0, ieee80211_sysctl_parent, "A", "parent device");
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"driver_caps", CTLFLAG_RW, &vap->iv_caps, 0,
 		"driver capabilities");
 #ifdef IEEE80211_DEBUG
 	/* Seed the per-vap debug mask from the global setting. */
 	vap->iv_debug = ieee80211_debug;
 	SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"debug", CTLFLAG_RW, &vap->iv_debug, 0,
 		"control debugging printfs");
 #endif
 	SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		"bmiss_max", CTLFLAG_RW, &vap->iv_bmiss_max, 0,
 		"consecutive beacon misses before scanning");
 	/* XXX inherit from tunables */
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "inact_run", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	    &vap->iv_inact_run, 0, ieee80211_sysctl_inact, "I",
 	    "station inactivity timeout (sec)");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "inact_probe", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	    &vap->iv_inact_probe, 0, ieee80211_sysctl_inact, "I",
 	    "station inactivity probe timeout (sec)");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "inact_auth", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	    &vap->iv_inact_auth, 0, ieee80211_sysctl_inact, "I",
 	    "station authentication timeout (sec)");
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "inact_init", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	    &vap->iv_inact_init, 0, ieee80211_sysctl_inact, "I",
 	    "station initial state timeout (sec)");
 	/* The A-MPDU thresholds are only exposed when the HT flag is set. */
 	if (vap->iv_htcaps & IEEE80211_HTC_HT) {
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_bk", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_BK], 0,
 			"BK traffic tx aggr threshold (pps)");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_be", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_BE], 0,
 			"BE traffic tx aggr threshold (pps)");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_vo", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_VO], 0,
 			"VO traffic tx aggr threshold (pps)");
 		SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 			"ampdu_mintraffic_vi", CTLFLAG_RW,
 			&vap->iv_ampdu_mintraffic[WME_AC_VI], 0,
 			"VI traffic tx aggr threshold (pps)");
 	}
 
 	SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 	    "force_restart", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	    vap, 0, ieee80211_sysctl_vap_restart, "I", "force a VAP restart");
 
 	/* The radar simulation knob only exists on DFS-capable vaps. */
 	if (vap->iv_caps & IEEE80211_C_DFS) {
 		SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    "radar", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 		    vap->iv_ic, 0, ieee80211_sysctl_radar, "I",
 		    "simulate radar event");
 	}
 	vap->iv_sysctl = ctx;
 	vap->iv_oid = oid;
 }
 
 /*
  * Tear down the per-vap sysctl tree, if one was created: free the
  * sysctl context contents, release the context itself and clear the
  * vap's pointer to it.
  */
 void
 ieee80211_sysctl_vdetach(struct ieee80211vap *vap)
 {
 
 	/* Nothing to do when no context is recorded on the vap. */
 	if (vap->iv_sysctl == NULL)
 		return;
 
 	sysctl_ctx_free(vap->iv_sysctl);
 	IEEE80211_FREE(vap->iv_sysctl, M_DEVBUF);
 	vap->iv_sysctl = NULL;
 }
 
 /*
  * Take a reference in the vap's iv_com_state word.
  *
  * The reference is added optimistically and backed out again when the
  * state observed before the add shows the DETACHED flag (ENETDOWN) or
  * a reference field already at IEEE80211_COM_REF_MAX (EOVERFLOW).
  * Returns 0 when the reference was taken.
  */
 int
 ieee80211_com_vincref(struct ieee80211vap *vap)
 {
 	uint32_t ostate;
 
 	/* ostate is the value from before the increment. */
 	ostate = atomic_fetchadd_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD);
 
 	if (ostate & IEEE80211_COM_DETACHED) {
 		atomic_subtract_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD);
 		return (ENETDOWN);
 	}
 
 	if (_IEEE80211_MASKSHIFT(ostate, IEEE80211_COM_REF) ==
 	    IEEE80211_COM_REF_MAX) {
 		atomic_subtract_32(&vap->iv_com_state, IEEE80211_COM_REF_ADD);
 		return (EOVERFLOW);
 	}
 
 	return (0);
 }
 
 /*
  * Drop a reference from the vap's iv_com_state word.  Asserts that the
  * reference field was non-zero before the decrement.
  */
 void
 ieee80211_com_vdecref(struct ieee80211vap *vap)
 {
 	uint32_t ostate;
 
 	ostate = atomic_fetchadd_32(&vap->iv_com_state, -IEEE80211_COM_REF_ADD);
 
 	KASSERT(_IEEE80211_MASKSHIFT(ostate, IEEE80211_COM_REF) != 0,
 	    ("com reference counter underflow"));
 
 	/* Keeps ostate referenced; presumably for builds where KASSERT expands to nothing. */
 	(void) ostate;
 }
 
 /*
  * Mark the vap's iv_com_state as DETACHED and then wait, sleeping in
  * 250 ms steps, until the reference field of that word reads zero.
  */
 void
 ieee80211_com_vdetach(struct ieee80211vap *vap)
 {
 	int sleep_time;
 
 	sleep_time = msecs_to_ticks(250);
 	atomic_set_32(&vap->iv_com_state, IEEE80211_COM_DETACHED);
 	while (_IEEE80211_MASKSHIFT(atomic_load_32(&vap->iv_com_state),
 	    IEEE80211_COM_REF) != 0)
 		pause("comref", sleep_time);
 }
 
 /*
  * Drop one node reference and return the result of a compare-and-set
  * that swaps ni_refcnt from 0 to 1.
  *
  * NOTE(review): the decrement and the compare-and-set are two separate
  * atomic operations, so the pair is not atomic as a whole; confirm
  * callers tolerate that before relying on the return value.
  */
 int
 ieee80211_node_dectestref(struct ieee80211_node *ni)
 {
 	/* XXX need equivalent of atomic_dec_and_test */
 	atomic_subtract_int(&ni->ni_refcnt, 1);
 	return atomic_cmpset_int(&ni->ni_refcnt, 0, 1);
 }
 
 /*
  * Empty an interface queue.  Each queued mbuf carries a node pointer
  * in m_pkthdr.rcvif; that node reference is released and the mbuf is
  * freed.
  */
 void
 ieee80211_drain_ifq(struct ifqueue *ifq)
 {
 	struct ieee80211_node *node;
 	struct mbuf *pkt;
 
 	IF_DEQUEUE(ifq, pkt);
 	while (pkt != NULL) {
 		node = (struct ieee80211_node *)pkt->m_pkthdr.rcvif;
 		KASSERT(node != NULL, ("frame w/o node"));
 		ieee80211_free_node(node);
 		/* Clear the stashed pointer before the mbuf is freed. */
 		pkt->m_pkthdr.rcvif = NULL;
 
 		m_freem(pkt);
 
 		IF_DEQUEUE(ifq, pkt);
 	}
 }
 
 /*
  * Remove from "ifq" every mbuf whose stashed node (m_pkthdr.rcvif)
  * belongs to "vap".  Each removed mbuf is freed and its node reference
  * released; mbufs for other vaps stay queued in order.  The whole
  * operation runs with the queue lock held.
  */
 void
 ieee80211_flush_ifq(struct ifqueue *ifq, struct ieee80211vap *vap)
 {
 	struct ieee80211_node *ni;
 	struct mbuf *m, **mprev;
 
 	IF_LOCK(ifq);
 	/* mprev always points at the link that leads to the current mbuf. */
 	mprev = &ifq->ifq_head;
 	while ((m = *mprev) != NULL) {
 		ni = (struct ieee80211_node *)m->m_pkthdr.rcvif;
 		if (ni != NULL && ni->ni_vap == vap) {
 			*mprev = m->m_nextpkt;		/* remove from list */
 			ifq->ifq_len--;
 
 			m_freem(m);
 			ieee80211_free_node(ni);	/* reclaim ref */
 		} else
 			mprev = &m->m_nextpkt;
 	}
 	/* recalculate tail ptr */
 	m = ifq->ifq_head;
 	for (; m != NULL && m->m_nextpkt != NULL; m = m->m_nextpkt)
 		;
 	/* m is now the last mbuf, or NULL when the queue is empty. */
 	ifq->ifq_tail = m;
 	IF_UNLOCK(ifq);
 }
 
 /*
  * Advance the data pointer of mbuf "m" so that an object of "len"
  * bytes sits at the end of an MCLBYTES-sized buffer, with the offset
  * rounded down to a multiple of sizeof(long).  In this file it is
  * applied to mbufs obtained from m_getcl().
  */
 #define	MC_ALIGN(m, len)						\
 do {									\
 	(m)->m_data += rounddown2(MCLBYTES - (len), sizeof(long));	\
 } while (/* CONSTCOND */ 0)
 
 /*
  * Allocate and setup a management frame of the specified
  * size.  We return the mbuf and a pointer to the start
  * of the contiguous data area that's been reserved based
  * on the packet length.  The data area is forced to 32-bit
  * alignment and the buffer length to a multiple of 4 bytes.
  * This is done mainly so beacon frames (that require this)
  * can use this interface too.
  *
  * "headroom" bytes are skipped in front of the returned data pointer.
  * On allocation failure NULL is returned and *frm is left untouched.
  */
 struct mbuf *
 ieee80211_getmgtframe(uint8_t **frm, int headroom, int pktlen)
 {
 	struct mbuf *m;
 	u_int len;
 
 	/*
 	 * NB: we know the mbuf routines will align the data area
 	 *     so we don't need to do anything special.
 	 */
 	len = roundup2(headroom + pktlen, 4);
 	KASSERT(len <= MCLBYTES, ("802.11 mgt frame too large: %u", len));
 	/* Small frames use a plain header mbuf, larger ones a cluster. */
 	if (len < MINCLSIZE) {
 		m = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA);
 		/*
 		 * Align the data in case additional headers are added.
 		 * This should only happen when a WEP header is added
 		 * which only happens for shared key authentication mgt
 		 * frames which all fit in MHLEN.
 		 */
 		if (m != NULL)
 			M_ALIGN(m, len);
 	} else {
 		m = m_getcl(IEEE80211_M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m != NULL)
 			MC_ALIGN(m, len);
 	}
 	if (m != NULL) {
 		m->m_data += headroom;
 		*frm = m->m_data;
 	}
 	return m;
 }
 
 #ifndef __NO_STRICT_ALIGNMENT
 /*
  * Re-align the payload in the mbuf.  This is mainly used (right now)
  * to handle IP header alignment requirements on certain architectures.
  *
  * The packet is copied into a freshly allocated mbuf sized to hold
  * the packet plus "align" bytes, and the original mbuf is always
  * freed.  Returns the new mbuf, or NULL when none could be allocated
  * (in which case the drop is logged and is_rx_badalign is bumped).
  */
 struct mbuf *
 ieee80211_realign(struct ieee80211vap *vap, struct mbuf *m, size_t align)
 {
 	int pktlen, space;
 	struct mbuf *n;
 
 	pktlen = m->m_pkthdr.len;
 	space = pktlen + align;
 	if (space < MINCLSIZE)
 		n = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA);
 	else {
 		/* Pick the smallest cluster size that fits "space". */
 		n = m_getjcl(IEEE80211_M_NOWAIT, MT_DATA, M_PKTHDR,
 		    space <= MCLBYTES ?     MCLBYTES :
 #if MJUMPAGESIZE != MCLBYTES
 		    space <= MJUMPAGESIZE ? MJUMPAGESIZE :
 #endif
 		    space <= MJUM9BYTES ?   MJUM9BYTES : MJUM16BYTES);
 	}
 	if (__predict_true(n != NULL)) {
 		m_move_pkthdr(n, m);
 		/* Offset the data start so that (data + align) is ALIGN()ed. */
 		n->m_data = (caddr_t)(ALIGN(n->m_data + align) - align);
 		m_copydata(m, 0, pktlen, mtod(n, caddr_t));
 		n->m_len = pktlen;
 	} else {
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    mtod(m, const struct ieee80211_frame *), NULL,
 		    "%s", "no mbuf to realign");
 		vap->iv_stats.is_rx_badalign++;
 	}
 	m_freem(m);
 	return n;
 }
 #endif /* !__NO_STRICT_ALIGNMENT */
 
 /*
  * Attach a completion callback to an mbuf.  The function pointer and
  * its argument are stored in a NET80211_TAG_CALLBACK tag and M_TXCB is
  * set in the mbuf flags.  Returns 1 on success, 0 when no tag could be
  * allocated.
  */
 int
 ieee80211_add_callback(struct mbuf *m,
 	void (*func)(struct ieee80211_node *, void *, int), void *arg)
 {
 	struct ieee80211_cb *txcb;
 	struct m_tag *tag;
 
 	tag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_CALLBACK,
 			sizeof(struct ieee80211_cb), IEEE80211_M_NOWAIT);
 	if (tag != NULL) {
 		/* The payload is addressed directly behind the tag header. */
 		txcb = (struct ieee80211_cb *)(tag + 1);
 		txcb->func = func;
 		txcb->arg = arg;
 		m_tag_prepend(m, tag);
 		m->m_flags |= M_TXCB;
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Attach a copy of the given transmit parameters to an mbuf in a
  * NET80211_TAG_XMIT_PARAMS tag.  Returns 1 on success, 0 when no tag
  * could be allocated.
  */
 int
 ieee80211_add_xmit_params(struct mbuf *m,
     const struct ieee80211_bpf_params *params)
 {
 	struct ieee80211_tx_params *txp;
 	struct m_tag *tag;
 
 	tag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_XMIT_PARAMS,
 	    sizeof(struct ieee80211_tx_params), IEEE80211_M_NOWAIT);
 	if (tag != NULL) {
 		txp = (struct ieee80211_tx_params *)(tag + 1);
 		memcpy(&txp->params, params,
 		    sizeof(struct ieee80211_bpf_params));
 		m_tag_prepend(m, tag);
 		return (1);
 	}
 	return (0);
 }
 
 int
 ieee80211_get_xmit_params(struct mbuf *m,
     struct ieee80211_bpf_params *params)
 {
 	struct m_tag *mtag;
 	struct ieee80211_tx_params *tx;
 
 	mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_XMIT_PARAMS,
 	    NULL);
 	if (mtag == NULL)
 		return (-1);
 	tx = (struct ieee80211_tx_params *)(mtag + 1);
 	memcpy(params, &tx->params, sizeof(struct ieee80211_bpf_params));
 	return (0);
 }
 
 /*
  * Invoke the completion callback stored on an mbuf, if any, passing
  * it the node, the stored argument and "status".
  */
 void
 ieee80211_process_callback(struct ieee80211_node *ni,
 	struct mbuf *m, int status)
 {
 	struct ieee80211_cb *txcb;
 	struct m_tag *tag;
 
 	tag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_CALLBACK, NULL);
 	if (tag == NULL)
 		return;
 
 	txcb = (struct ieee80211_cb *)(tag + 1);
 	txcb->func(ni, txcb->arg, status);
 }
 
 /*
  * Add RX parameters to the given mbuf.
  *
  * Returns 1 if OK, 0 on error.
  */
 int
 ieee80211_add_rx_params(struct mbuf *m, const struct ieee80211_rx_stats *rxs)
 {
 	struct m_tag *mtag;
 	struct ieee80211_rx_params *rx;
 
 	mtag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_RECV_PARAMS,
 	    sizeof(struct ieee80211_rx_stats), IEEE80211_M_NOWAIT);
 	if (mtag == NULL)
 		return (0);
 
 	rx = (struct ieee80211_rx_params *)(mtag + 1);
 	memcpy(&rx->params, rxs, sizeof(*rxs));
 	m_tag_prepend(m, mtag);
 	return (1);
 }
 
 int
 ieee80211_get_rx_params(struct mbuf *m, struct ieee80211_rx_stats *rxs)
 {
 	struct m_tag *mtag;
 	struct ieee80211_rx_params *rx;
 
 	mtag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_RECV_PARAMS,
 	    NULL);
 	if (mtag == NULL)
 		return (-1);
 	rx = (struct ieee80211_rx_params *)(mtag + 1);
 	memcpy(rxs, &rx->params, sizeof(*rxs));
 	return (0);
 }
 
 /*
  * Return a pointer to the RX parameters stored on an mbuf, or NULL
  * when the mbuf has no NET80211_TAG_RECV_PARAMS tag.  The pointer
  * refers to the tag's own storage; nothing is copied.
  */
 const struct ieee80211_rx_stats *
 ieee80211_get_rx_params_ptr(struct mbuf *m)
 {
 	struct ieee80211_rx_params *stored;
 	struct m_tag *tag;
 
 	tag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_RECV_PARAMS,
 	    NULL);
 	if (tag != NULL) {
 		stored = (struct ieee80211_rx_params *)(tag + 1);
 		return (&stored->params);
 	}
 	return (NULL);
 }
 
 /*
  * Add TOA parameters to the given mbuf.
  *
  * A copy of *p is stored in a NET80211_TAG_TOA_PARAMS tag.  Returns 1
  * on success, 0 when no tag could be allocated.
  */
 int
 ieee80211_add_toa_params(struct mbuf *m, const struct ieee80211_toa_params *p)
 {
 	struct ieee80211_toa_params *stored;
 	struct m_tag *tag;
 
 	tag = m_tag_alloc(MTAG_ABI_NET80211, NET80211_TAG_TOA_PARAMS,
 	    sizeof(struct ieee80211_toa_params), IEEE80211_M_NOWAIT);
 	if (tag != NULL) {
 		stored = (struct ieee80211_toa_params *)(tag + 1);
 		memcpy(stored, p, sizeof(*stored));
 		m_tag_prepend(m, tag);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Test for TOA parameters on an mbuf and optionally fetch them.
  * Returns 1 when a NET80211_TAG_TOA_PARAMS tag is present, 0
  * otherwise.  When present and "p" is not NULL, the stored
  * parameters are copied into *p.
  */
 int
 ieee80211_get_toa_params(struct mbuf *m, struct ieee80211_toa_params *p)
 {
 	const struct ieee80211_toa_params *stored;
 	struct m_tag *tag;
 
 	tag = m_tag_locate(m, MTAG_ABI_NET80211, NET80211_TAG_TOA_PARAMS,
 	    NULL);
 	if (tag != NULL) {
 		stored = (const struct ieee80211_toa_params *)(tag + 1);
 		if (p != NULL)
 			memcpy(p, stored, sizeof(*p));
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Transmit a frame to the parent interface.
  *
  * Returns the result of the device's ic_transmit method.  On failure
  * the output error counter of the owning vap's ifnet is bumped, the
  * node reference carried in m_pkthdr.rcvif is released and the mbuf
  * is freed here.
  */
 int
 ieee80211_parent_xmitpkt(struct ieee80211com *ic, struct mbuf *m)
 {
 	struct ieee80211_node *ni;
 	int error;
 
 	/*
 	 * Assert the IC TX lock is held - this enforces the
 	 * processing -> queuing order is maintained
 	 */
 	IEEE80211_TX_LOCK_ASSERT(ic);
 
 	error = ic->ic_transmit(ic, m);
 	if (error == 0)
 		return (error);
 
 	ni = (struct ieee80211_node *)m->m_pkthdr.rcvif;
 
 	/* XXX number of fragments */
 	if_inc_counter(ni->ni_vap->iv_ifp, IFCOUNTER_OERRORS, 1);
 	ieee80211_free_node(ni);
 	ieee80211_free_mbuf(m);
 
 	return (error);
 }
 
 /*
  * Transmit a frame to the VAP interface.
  *
  * Returns whatever the vap ifnet's if_transmit method returns.
  */
 int
 ieee80211_vap_xmitpkt(struct ieee80211vap *vap, struct mbuf *m)
 {
 	struct ifnet *vap_ifp;
 
 	vap_ifp = vap->iv_ifp;
 
 	/*
 	 * When transmitting via the VAP, we shouldn't hold
 	 * any IC TX lock as the VAP TX path will acquire it.
 	 */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	return (vap_ifp->if_transmit(vap_ifp, m));
 }
 
 #include <sys/libkern.h>
 
 /*
  * Fill the n-byte buffer at p with random data, consuming the output
  * of arc4random() one 32-bit word at a time.  The last chunk is
  * shorter than a word when n is not a multiple of four.
  */
 void
 net80211_get_random_bytes(void *p, size_t n)
 {
 	uint8_t *dp = p;
 
 	while (n > 0) {
 		uint32_t v = arc4random();
 		size_t nb = n > sizeof(v) ? sizeof(v) : n;
 
 		bcopy(&v, dp, nb);
 		/*
 		 * Advance by the number of bytes actually copied so that
 		 * dp never moves beyond the end of the caller's buffer.
 		 */
 		dp += nb;
 		n -= nb;
 	}
 }
 
 /*
  * Helper function for events that pass just a single mac address.
  *
  * Builds a zeroed join event carrying "mac" and posts it as routing
  * message "op" for "ifp", with the current vnet switched to the
  * ifnet's vnet for the duration of the call.
  */
 static void
 notify_macaddr(struct ifnet *ifp, int op, const uint8_t mac[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_join_event iev;
 
 	CURVNET_SET(ifp->if_vnet);
 	memset(&iev, 0, sizeof(iev));
 	IEEE80211_ADDR_COPY(iev.iev_addr, mac);
 	rt_ieee80211msg(ifp, op, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Announce that a node joined.  For the vap's bss node an ASSOC or
  * REASSOC message is posted and the link is marked up; for any other
  * node a JOIN or REJOIN message is posted.  "newassoc" selects the
  * first message of each pair.
  */
 void
 ieee80211_notify_node_join(struct ieee80211_node *ni, int newassoc)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 	int op;
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode join",
 	    (ni == vap->iv_bss) ? "bss " : "");
 
 	if (ni != vap->iv_bss) {
 		if (newassoc)
 			op = RTM_IEEE80211_JOIN;
 		else
 			op = RTM_IEEE80211_REJOIN;
 		notify_macaddr(ifp, op, ni->ni_macaddr);
 	} else {
 		if (newassoc)
 			op = RTM_IEEE80211_ASSOC;
 		else
 			op = RTM_IEEE80211_REASSOC;
 		notify_macaddr(ifp, op, ni->ni_bssid);
 		if_link_state_change(ifp, LINK_STATE_UP);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Announce that a node left.  For the vap's bss node a DISASSOC
  * message is posted and the link is marked down; for any other node a
  * LEAVE message carrying the node's MAC address is posted.
  */
 void
 ieee80211_notify_node_leave(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%snode leave",
 	    (ni == vap->iv_bss) ? "bss " : "");
 
 	if (ni == vap->iv_bss) {
 		rt_ieee80211msg(ifp, RTM_IEEE80211_DISASSOC, NULL, 0);
 		if_link_state_change(ifp, LINK_STATE_DOWN);
 	} else {
 		/* fire off wireless event station leaving */
 		notify_macaddr(ifp, RTM_IEEE80211_LEAVE, ni->ni_macaddr);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Post an RTM_IEEE80211_SCAN routing message (with no payload) for
  * the vap's ifnet to signal that a scan has completed.
  */
 void
 ieee80211_notify_scan_done(struct ieee80211vap *vap)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n", "notify scan done");
 
 	/* dispatch wireless event indicating scan completed */
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_SCAN, NULL, 0);
 	CURVNET_RESTORE();
 }
 
 /*
  * Report a replay failure on a received frame.
  *
  * The event is logged and, when the vap has an ifnet, posted as an
  * RTM_IEEE80211_REPLAY routing message carrying the frame's
  * addresses, the cipher, the key index (the rx key index when one is
  * set, otherwise the key index) and the two sequence counters.
  *
  * "rsc" is the sequence counter reported for the frame and "tid"
  * indexes the key's wk_keyrsc[] array.
  */
 void
 ieee80211_notify_replay_failure(struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh, const struct ieee80211_key *k,
 	u_int64_t rsc, int tid)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	/* %ju/%jx take uintmax_t, so cast the counters accordingly. */
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2,
 	    "%s replay detected tid %d <rsc %ju (%jx), csc %ju (%jx), keyix %u rxkeyix %u>",
 	    k->wk_cipher->ic_name, tid,
 	    (uintmax_t) rsc,
 	    (uintmax_t) rsc,
 	    (uintmax_t) k->wk_keyrsc[tid],
 	    (uintmax_t) k->wk_keyrsc[tid],
 	    k->wk_keyix, k->wk_rxkeyix);
 
 	if (ifp != NULL) {		/* NB: for cipher test modules */
 		struct ieee80211_replay_event iev;
 
 		/*
 		 * Zero the event first, as the other notify routines in
 		 * this file do, so that no uninitialized stack bytes
 		 * (e.g. structure padding) are handed to
 		 * rt_ieee80211msg().
 		 */
 		memset(&iev, 0, sizeof(iev));
 		IEEE80211_ADDR_COPY(iev.iev_dst, wh->i_addr1);
 		IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2);
 		iev.iev_cipher = k->wk_cipher->ic_cipher;
 		if (k->wk_rxkeyix != IEEE80211_KEYIX_NONE)
 			iev.iev_keyix = k->wk_rxkeyix;
 		else
 			iev.iev_keyix = k->wk_keyix;
 		iev.iev_keyrsc = k->wk_keyrsc[tid];
 		iev.iev_rsc = rsc;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_REPLAY, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Report a Michael MIC verification failure on a received frame.
  *
  * The event is logged, the is_rx_tkipmic statistic is bumped and,
  * when the vap has an ifnet, an RTM_IEEE80211_MICHAEL routing message
  * is posted carrying the frame's addresses, the TKIP cipher id and
  * "keyix".
  */
 void
 ieee80211_notify_michael_failure(struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh, u_int keyix)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO, wh->i_addr2,
 	    "michael MIC verification failed <keyix %u>", keyix);
 	vap->iv_stats.is_rx_tkipmic++;
 
 	if (ifp != NULL) {		/* NB: for cipher test modules */
 		struct ieee80211_michael_event iev;
 
 		/*
 		 * Zero the event first, as the other notify routines in
 		 * this file do, so that no uninitialized stack bytes are
 		 * handed to rt_ieee80211msg().
 		 */
 		memset(&iev, 0, sizeof(iev));
 		IEEE80211_ADDR_COPY(iev.iev_dst, wh->i_addr1);
 		IEEE80211_ADDR_COPY(iev.iev_src, wh->i_addr2);
 		iev.iev_cipher = IEEE80211_CIPHER_TKIP;
 		iev.iev_keyix = keyix;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_MICHAEL, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Post an RTM_IEEE80211_WDS message carrying the node's MAC address
  * on the ifnet of the node's vap.
  */
 void
 ieee80211_notify_wds_discover(struct ieee80211_node *ni)
 {
 
 	notify_macaddr(ni->ni_vap->iv_ifp, RTM_IEEE80211_WDS,
 	    ni->ni_macaddr);
 }
 
 /*
  * Post an RTM_IEEE80211_CSA routing message on every vap of the
  * device.  The event carries the flags, frequency and IEEE number of
  * channel "c" together with "mode" and "count".
  */
 void
 ieee80211_notify_csa(struct ieee80211com *ic,
 	const struct ieee80211_channel *c, int mode, int count)
 {
 	struct ieee80211_csa_event iev;
 	struct ieee80211vap *vap;
 	struct ifnet *ifp;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_flags = c->ic_flags;
 	iev.iev_freq = c->ic_freq;
 	iev.iev_ieee = c->ic_ieee;
 	iev.iev_mode = mode;
 	iev.iev_count = count;
 	/* The same event is delivered once per vap, in that vap's vnet. */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		ifp = vap->iv_ifp;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_CSA, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Post an RTM_IEEE80211_RADAR routing message on every vap of the
  * device.  The event carries the flags, frequency and IEEE number of
  * channel "c".
  */
 void
 ieee80211_notify_radar(struct ieee80211com *ic,
 	const struct ieee80211_channel *c)
 {
 	struct ieee80211_radar_event iev;
 	struct ieee80211vap *vap;
 	struct ifnet *ifp;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_flags = c->ic_flags;
 	iev.iev_freq = c->ic_freq;
 	iev.iev_ieee = c->ic_ieee;
 	/* The same event is delivered once per vap, in that vap's vnet. */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		ifp = vap->iv_ifp;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_RADAR, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Post an RTM_IEEE80211_CAC routing message on every vap of the
  * device.  The event carries the flags, frequency and IEEE number of
  * channel "c" together with the event "type".
  */
 void
 ieee80211_notify_cac(struct ieee80211com *ic,
 	const struct ieee80211_channel *c, enum ieee80211_notify_cac_event type)
 {
 	struct ieee80211_cac_event iev;
 	struct ieee80211vap *vap;
 	struct ifnet *ifp;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_flags = c->ic_flags;
 	iev.iev_freq = c->ic_freq;
 	iev.iev_ieee = c->ic_ieee;
 	iev.iev_type = type;
 	/* The same event is delivered once per vap, in that vap's vnet. */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		ifp = vap->iv_ifp;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_CAC, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Log a node deauthentication and post an RTM_IEEE80211_DEAUTH
  * message carrying the node's MAC address on its vap's ifnet.
  */
 void
 ieee80211_notify_node_deauth(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%s", "node deauth");
 
 	notify_macaddr(ifp, RTM_IEEE80211_DEAUTH, ni->ni_macaddr);
 }
 
 /*
  * Log a node authentication and post an RTM_IEEE80211_AUTH message
  * carrying the node's MAC address on its vap's ifnet.
  */
 void
 ieee80211_notify_node_auth(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_NODE, ni, "%s", "node auth");
 
 	notify_macaddr(ifp, RTM_IEEE80211_AUTH, ni->ni_macaddr);
 }
 
 /*
  * Post an RTM_IEEE80211_COUNTRY routing message on the vap's ifnet.
  * The event carries "bssid" and the two-byte country code "cc".
  */
 void
 ieee80211_notify_country(struct ieee80211vap *vap,
 	const uint8_t bssid[IEEE80211_ADDR_LEN], const uint8_t cc[2])
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_country_event iev;
 
 	memset(&iev, 0, sizeof(iev));
 	IEEE80211_ADDR_COPY(iev.iev_addr, bssid);
 	iev.iev_cc[0] = cc[0];
 	iev.iev_cc[1] = cc[1];
 	CURVNET_SET(ifp->if_vnet);
 	rt_ieee80211msg(ifp, RTM_IEEE80211_COUNTRY, &iev, sizeof(iev));
 	CURVNET_RESTORE();
 }
 
 /*
  * Post an RTM_IEEE80211_RADIO routing message carrying "state" on
  * every vap of the device.
  */
 void
 ieee80211_notify_radio(struct ieee80211com *ic, int state)
 {
 	struct ieee80211_radio_event iev;
 	struct ieee80211vap *vap;
 	struct ifnet *ifp;
 
 	memset(&iev, 0, sizeof(iev));
 	iev.iev_state = state;
 	/* The same event is delivered once per vap, in that vap's vnet. */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		ifp = vap->iv_ifp;
 		CURVNET_SET(ifp->if_vnet);
 		rt_ieee80211msg(ifp, RTM_IEEE80211_RADIO, &iev, sizeof(iev));
 		CURVNET_RESTORE();
 	}
 }
 
 /*
  * Log an interface state change and call rt_ifmsg() for the vap's
  * ifnet, passing "if_flags_mask" through unchanged.
  */
 void
 ieee80211_notify_ifnet_change(struct ieee80211vap *vap, int if_flags_mask)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG, "%s\n",
 	    "interface state change");
 
 	CURVNET_SET(ifp->if_vnet);
 	rt_ifmsg(ifp, if_flags_mask);
 	CURVNET_RESTORE();
 }
 
 /*
  * Request that the named module be loaded.  Automatic loading is
  * compiled out (the "notyet" branch), so this currently only prints a
  * message asking for the module to be loaded by hand.
  */
 void
 ieee80211_load_module(const char *modname)
 {
 
 #ifdef notyet
 	(void)kern_kldload(curthread, modname, NULL);
 #else
 	printf("%s: load the %s module by hand for now.\n", __func__, modname);
 #endif
 }
 
 static eventhandler_tag wlan_bpfevent;
 static eventhandler_tag wlan_ifllevent;
 
 /*
  * bpf_track event handler: keeps the vap's IEEE80211_FEXT_BPF flag
  * and, for monitor-mode vaps, the device's ic_montaps counter in step
  * with radiotap (DLT_IEEE802_11_RADIO) listeners attaching to and
  * detaching from wlan interfaces.  Events for other interfaces or
  * link types are ignored.
  */
 static void
 bpf_track(void *arg, struct ifnet *ifp, int dlt, int attach)
 {
 	/* NB: identify vap's by if_init */
 	if (dlt == DLT_IEEE802_11_RADIO &&
 	    ifp->if_init == ieee80211_init) {
 		struct ieee80211vap *vap = ifp->if_softc;
 		/*
 		 * Track bpf radiotap listener state.  We mark the vap
 		 * to indicate if any listener is present and the com
 		 * to indicate if any listener exists on any associated
 		 * vap.  This flag is used by drivers to prepare radiotap
 		 * state only when needed.
 		 */
 		if (attach) {
 			ieee80211_syncflag_ext(vap, IEEE80211_FEXT_BPF);
 			if (vap->iv_opmode == IEEE80211_M_MONITOR)
 				atomic_add_int(&vap->iv_ic->ic_montaps, 1);
 		} else if (!bpf_peers_present(vap->iv_rawbpf)) {
 			/* Only undo the marking once no peers are present. */
 			ieee80211_syncflag_ext(vap, -IEEE80211_FEXT_BPF);
 			if (vap->iv_opmode == IEEE80211_M_MONITOR)
 				atomic_subtract_int(&vap->iv_ic->ic_montaps, 1);
 		}
 	}
 }
 
 /*
  * Change MAC address on the vap (if was not started).
  *
  * iflladdr_event handler: for a wlan interface that is not marked
  * IFF_UP, copy the ifnet's link-level address into the vap's
  * iv_myaddr.  Other interfaces are ignored.
  */
 static void
 wlan_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	struct ieee80211vap *vap;
 
 	/* NB: identify vap's by if_init */
 	if (ifp->if_init != ieee80211_init)
 		return;
 	/* Leave the address alone once the interface is up. */
 	if ((ifp->if_flags & IFF_UP) != 0)
 		return;
 
 	vap = ifp->if_softc;
 	IEEE80211_ADDR_COPY(vap->iv_myaddr, IF_LLADDR(ifp));
 }
 
 /*
  * Fetch the VAP name.
  *
  * This returns a const char pointer suitable for debugging,
  * but don't expect it to stick around for much longer.
  * A vap without an ifnet yields the string "(none)".
  */
 const char *
 ieee80211_get_vap_ifname(struct ieee80211vap *vap)
 {
 	const char *ifname = "(none)";
 
 	if (vap->iv_ifp != NULL)
 		ifname = vap->iv_ifp->if_xname;
 	return (ifname);
 }
 
 #ifdef DEBUGNET
 /*
  * debugnet "init" method for wlan interfaces: forward the call to the
  * parent device's dn8_init method with the com lock held.
  */
 static void
 ieee80211_debugnet_init(struct ifnet *ifp, int *nrxr, int *ncl, int *clsize)
 {
 	struct ieee80211vap *vap = if_getsoftc(ifp);
 	struct ieee80211com *parent = vap->iv_ic;
 
 	IEEE80211_LOCK(parent);
 	parent->ic_debugnet_meth->dn8_init(parent, nrxr, ncl, clsize);
 	IEEE80211_UNLOCK(parent);
 }
 
 /*
  * debugnet "event" method for wlan interfaces: forward the event to
  * the parent device's dn8_event method with the com lock held.
  */
 static void
 ieee80211_debugnet_event(struct ifnet *ifp, enum debugnet_ev ev)
 {
 	struct ieee80211vap *vap = if_getsoftc(ifp);
 	struct ieee80211com *parent = vap->iv_ic;
 
 	IEEE80211_LOCK(parent);
 	parent->ic_debugnet_meth->dn8_event(parent, ev);
 	IEEE80211_UNLOCK(parent);
 }
 
 /*
  * debugnet "transmit" method for wlan interfaces: hands the mbuf to
  * ieee80211_vap_transmit() and returns its result.
  */
 static int
 ieee80211_debugnet_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	return (ieee80211_vap_transmit(ifp, m));
 }
 
 /*
  * debugnet "poll" method for wlan interfaces: forward the call to the
  * parent device's dn8_poll method and return its result.  No lock is
  * taken here.
  */
 static int
 ieee80211_debugnet_poll(struct ifnet *ifp, int count)
 {
 	struct ieee80211vap *vap = if_getsoftc(ifp);
 	struct ieee80211com *parent = vap->iv_ic;
 
 	return (parent->ic_debugnet_meth->dn8_poll(parent, count));
 }
 #endif
 
 /*
  * Module glue.
  *
  * NB: the module name is "wlan" for compatibility with NetBSD.
  *
  * On MOD_LOAD the bpf_track and iflladdr_event handlers are
  * registered and the "wlan" interface cloner is attached; on
  * MOD_UNLOAD the cloner is detached and both handlers are
  * deregistered.  Any other event type yields EINVAL.
  */
 static int
 wlan_modevent(module_t mod, int type, void *unused)
 {
 	switch (type) {
 	case MOD_LOAD:
 		if (bootverbose)
 			printf("wlan: <802.11 Link Layer>\n");
 		wlan_bpfevent = EVENTHANDLER_REGISTER(bpf_track,
 		    bpf_track, 0, EVENTHANDLER_PRI_ANY);
 		wlan_ifllevent = EVENTHANDLER_REGISTER(iflladdr_event,
 		    wlan_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		/* Cloner callbacks; unit numbers are assigned automatically. */
 		struct if_clone_addreq req = {
 			.create_f = wlan_clone_create,
 			.destroy_f = wlan_clone_destroy,
 			.flags = IFC_F_AUTOUNIT,
 		};
 		wlan_cloner = ifc_attach_cloner(wlanname, &req);
 		return 0;
 	case MOD_UNLOAD:
 		ifc_detach_cloner(wlan_cloner);
 		EVENTHANDLER_DEREGISTER(bpf_track, wlan_bpfevent);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, wlan_ifllevent);
 		return 0;
 	}
 	return EINVAL;
 }
 
 static moduledata_t wlan_mod = {
 	wlanname,
 	wlan_modevent,
 	0
 };
 DECLARE_MODULE(wlan, wlan_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
 MODULE_VERSION(wlan, 1);
 MODULE_DEPEND(wlan, ether, 1, 1, 1);
 #ifdef	IEEE80211_ALQ
 MODULE_DEPEND(wlan, alq, 1, 1, 1);
 #endif	/* IEEE80211_ALQ */
diff --git a/sys/net80211/ieee80211_hostap.c b/sys/net80211/ieee80211_hostap.c
index 6c3bb44053f5..12b34b2e0509 100644
--- a/sys/net80211/ieee80211_hostap.c
+++ b/sys/net80211/ieee80211_hostap.c
@@ -1,2480 +1,2481 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2008 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11 HOSTAP mode support.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net/bpf.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_hostap.h>
 #include <net80211/ieee80211_input.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #include <net80211/ieee80211_wds.h>
 #include <net80211/ieee80211_vht.h>
 #include <net80211/ieee80211_sta.h> /* for parse_wmeie */
 
 #define	IEEE80211_RATE2MBS(r)	(((r) & IEEE80211_RATE_VAL) / 2)
 
 static	void hostap_vattach(struct ieee80211vap *);
 static	int hostap_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static	int hostap_input(struct ieee80211_node *ni, struct mbuf *m,
 	    const struct ieee80211_rx_stats *,
 	    int rssi, int nf);
 static void hostap_deliver_data(struct ieee80211vap *,
 	    struct ieee80211_node *, struct mbuf *);
 static void hostap_recv_mgmt(struct ieee80211_node *, struct mbuf *,
 	    int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf);
 static void hostap_recv_ctl(struct ieee80211_node *, struct mbuf *, int);
 
 /*
  * Install hostap_vattach as the per-vap attach hook used for vaps
  * created in IEEE80211_M_HOSTAP mode on this device.
  */
 void
 ieee80211_hostap_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_HOSTAP] = hostap_vattach;
 }
 
 /*
  * Counterpart of ieee80211_hostap_attach; currently a no-op.
  */
 void
 ieee80211_hostap_detach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Per-vap detach hook (installed as iv_opdetach by hostap_vattach);
  * currently a no-op.
  */
 static void
 hostap_vdetach(struct ieee80211vap *vap)
 {
 }
 
 static void
 hostap_vattach(struct ieee80211vap *vap)
 {
 	/* State machine and teardown hooks. */
 	vap->iv_newstate = hostap_newstate;
 	vap->iv_opdetach = hostap_vdetach;
 
 	/* Receive-path hooks. */
 	vap->iv_input = hostap_input;
 	vap->iv_deliver_data = hostap_deliver_data;
 	vap->iv_recv_mgmt = hostap_recv_mgmt;
 	vap->iv_recv_ctl = hostap_recv_ctl;
 	vap->iv_recv_pspoll = ieee80211_recv_pspoll;
 }
 
 static void
 sta_disassoc(void *arg, struct ieee80211_node *ni)
 {
 
 	/* Nodes without an association id are left alone. */
 	if (ni->ni_associd == 0)
 		return;
 
 	/* Send a disassociation frame, then remove the station. */
 	IEEE80211_SEND_MGMT(ni, IEEE80211_FC0_SUBTYPE_DISASSOC,
 	    IEEE80211_REASON_ASSOC_LEAVE);
 	ieee80211_node_leave(ni);
 }
 
 static void
 sta_csa(void *arg, struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 
 	/*
 	 * Only associated stations whose inactivity counter is above
 	 * the vap's initial value are adjusted.
 	 */
 	if (ni->ni_associd == 0 || ni->ni_inact <= vap->iv_inact_init)
 		return;
 
 	/* Clamp the counter down to the initial value. */
 	ni->ni_inact = vap->iv_inact_init;
 	IEEE80211_NOTE(vap, IEEE80211_MSG_INACT, ni,
 	    "%s: inact %u", __func__, ni->ni_inact);
 }
 
 static void
 sta_drop(void *arg, struct ieee80211_node *ni)
 {
 
 	/* Nothing to do for nodes without an association id. */
 	if (ni->ni_associd == 0)
 		return;
 
 	/* Remove the station without sending it a management frame. */
 	ieee80211_node_leave(ni);
 }
 
 /*
  * Does a channel change require associated stations to re-associate
  * so protocol state is correct.  This is used when doing CSA across
  * bands or similar (e.g. HT -> legacy).
  */
 static int
 isbandchange(struct ieee80211com *ic)
 {
 	/*
 	 * XOR the flags of the current bss channel with those of the
 	 * pending CSA channel; any difference in the band, half/quarter
 	 * rate or HT bits counts as a band change.
 	 */
 	if (((ic->ic_bsschan->ic_flags ^ ic->ic_csa_newchan->ic_flags) &
 	    (IEEE80211_CHAN_2GHZ | IEEE80211_CHAN_5GHZ | IEEE80211_CHAN_HALF |
 	     IEEE80211_CHAN_QUARTER | IEEE80211_CHAN_HT)) == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * IEEE80211_M_HOSTAP vap state machine handler.
  *
  * Records nstate as the vap's current state and then performs the work
  * for that transition, selected by the new state and, within it, by the
  * previous state (ostate).  The com lock must be held by the caller
  * (checked with IEEE80211_LOCK_ASSERT).  arg is only used in the debug
  * message.  Always returns 0.
  */
 static int
 hostap_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	enum ieee80211_state ostate;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n",
 	    __func__, ieee80211_state_name[ostate],
 	    ieee80211_state_name[nstate], arg);
 	vap->iv_state = nstate;			/* state transition */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		/* Undo whatever the previous state had in progress. */
 		switch (ostate) {
 		case IEEE80211_S_SCAN:
 			ieee80211_cancel_scan(vap);
 			break;
 		case IEEE80211_S_CAC:
 			ieee80211_dfs_cac_stop(vap);
 			break;
 		case IEEE80211_S_RUN:
 			/* Disassociate every associated station of this vap. */
 			ieee80211_iterate_nodes_vap(&ic->ic_sta, vap,
 			    sta_disassoc, NULL);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 		}
 		if (vap->iv_auth->ia_detach != NULL)
 			vap->iv_auth->ia_detach(vap);
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_CSA:
 		case IEEE80211_S_RUN:
 			ieee80211_iterate_nodes_vap(&ic->ic_sta, vap,
 			    sta_disassoc, NULL);
 			/*
 			 * Clear overlapping BSS state; the beacon frame
 			 * will be reconstructed on transition to the RUN
 			 * state and the timeout routines check if the flag
 			 * is set before doing anything so this is sufficient.
 			 */
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_NONERP_PR;
 			vap->iv_flags_ht &= ~IEEE80211_FHT_NONHT_PR;
 			/* XXX TODO: schedule deferred update? */
 			/* fall thru... */
 		case IEEE80211_S_CAC:
 			/*
 			 * NB: We may get here because of a manual channel
 			 *     change in which case we need to stop CAC
 			 * XXX no need to stop if ostate RUN but it's ok
 			 */
 			ieee80211_dfs_cac_stop(vap);
 			/* fall thru... */
 		case IEEE80211_S_INIT:
 			if (vap->iv_des_chan != IEEE80211_CHAN_ANYC &&
 			    !IEEE80211_IS_CHAN_RADAR(vap->iv_des_chan)) {
 				/*
 				 * Already have a channel; bypass the
 				 * scan and startup immediately.
 				 * ieee80211_create_ibss will call back to
 				 * move us to RUN state.
 				 */
 				ieee80211_create_ibss(vap, vap->iv_des_chan);
 				break;
 			}
 			/*
 			 * Initiate a scan.  We can come here as a result
 			 * of an IEEE80211_IOC_SCAN_REQ too in which case
 			 * the vap will be marked with IEEE80211_FEXT_SCANREQ
 			 * and the scan request parameters will be present
 			 * in iv_scanreq.  Otherwise we do the default.
 			 */
 			if (vap->iv_flags_ext & IEEE80211_FEXT_SCANREQ) {
 				ieee80211_check_scan(vap,
 				    vap->iv_scanreq_flags,
 				    vap->iv_scanreq_duration,
 				    vap->iv_scanreq_mindwell,
 				    vap->iv_scanreq_maxdwell,
 				    vap->iv_scanreq_nssid, vap->iv_scanreq_ssid);
 				/* The pending request has been consumed. */
 				vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ;
 			} else
 				ieee80211_check_scan_current(vap);
 			break;
 		case IEEE80211_S_SCAN:
 			/*
 			 * A state change requires a reset; scan.
 			 */
 			ieee80211_check_scan_current(vap);
 			break;
 		default:
 			break;
 		}
 		break;
 	case IEEE80211_S_CAC:
 		/*
 		 * Start CAC on a DFS channel.  We come here when starting
 		 * a bss on a DFS channel (see ieee80211_create_ibss).
 		 */
 		ieee80211_dfs_cac_start(vap);
 		break;
 	case IEEE80211_S_RUN:
 		if (vap->iv_flags & IEEE80211_F_WPA) {
 			/* XXX validate prerequisites */
 		}
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			/*
 			 * Already have a channel; bypass the
 			 * scan and startup immediately.
 			 * Note that ieee80211_create_ibss will call
 			 * back to do a RUN->RUN state change.
 			 */
 			ieee80211_create_ibss(vap,
 			    ieee80211_ht_adjust_channel(ic,
 				ic->ic_curchan, vap->iv_flags_ht));
 			/* NB: iv_bss is changed on return */
 			break;
 		case IEEE80211_S_CAC:
 			/*
 			 * NB: This is the normal state change when CAC
 			 * expires and no radar was detected; no need to
 			 * clear the CAC timer as it's already expired.
 			 */
 			/* fall thru... */
 		case IEEE80211_S_CSA:
 			/*
 			 * Shorten inactivity timer of associated stations
 			 * to weed out sta's that don't follow a CSA.
 			 */
 			ieee80211_iterate_nodes_vap(&ic->ic_sta, vap,
 			    sta_csa, NULL);
 			/*
 			 * Update bss node channel to reflect where
 			 * we landed after CSA.
 			 */
 			ieee80211_node_set_chan(vap->iv_bss,
 			    ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 				ieee80211_htchanflags(vap->iv_bss->ni_chan)));
 			/* XXX bypass debug msgs */
 			break;
 		case IEEE80211_S_SCAN:
 		case IEEE80211_S_RUN:
 #ifdef IEEE80211_DEBUG
 			/* Debug-only: log the bssid, ssid, channel and rate. */
 			if (ieee80211_msg_debug(vap)) {
 				struct ieee80211_node *ni = vap->iv_bss;
 				ieee80211_note(vap,
 				    "synchronized with %s ssid ",
 				    ether_sprintf(ni->ni_bssid));
 				ieee80211_print_essid(ni->ni_essid,
 				    ni->ni_esslen);
 				/* XXX MCS/HT */
 				printf(" channel %d start %uMb\n",
 				    ieee80211_chan2ieee(ic, ic->ic_curchan),
 				    IEEE80211_RATE2MBS(ni->ni_txrate));
 			}
 #endif
 			break;
 		default:
 			break;
 		}
 		/*
 		 * Start/stop the authenticator.  We delay until here
 		 * to allow configuration to happen out of order.
 		 */
 		if (vap->iv_auth->ia_attach != NULL) {
 			/* XXX check failure */
 			vap->iv_auth->ia_attach(vap);
 		} else if (vap->iv_auth->ia_detach != NULL) {
 			vap->iv_auth->ia_detach(vap);
 		}
 		/* Mark the bss node itself as authorized. */
 		ieee80211_node_authorize(vap->iv_bss);
 		break;
 	case IEEE80211_S_CSA:
 		if (ostate == IEEE80211_S_RUN && isbandchange(ic)) {
 			/*
 			 * On a ``band change'' silently drop associated
 			 * stations as they must re-associate before they
 			 * can pass traffic (as otherwise protocol state
 			 * such as capabilities and the negotiated rate
 			 * set may/will be wrong).
 			 */
 			ieee80211_iterate_nodes_vap(&ic->ic_sta, vap,
 			    sta_drop, NULL);
 		}
 		break;
 	default:
 		break;
 	}
 	return 0;
 }
 
 /*
  * Deliver a received data frame (already converted to an Ethernet
  * frame) for a hostap vap.
  *
  * Updates the receive counters, then, unless IEEE80211_F_NOBRIDGE is
  * set, bridges the frame within the AP: multicast frames are duplicated
  * and the copy is transmitted, while unicast frames addressed to another
  * authorized station on this vap are transmitted instead of being passed
  * up.  Whatever mbuf remains afterwards is handed to the interface's
  * input routine.
  */
 static void
 hostap_deliver_data(struct ieee80211vap *vap,
 	struct ieee80211_node *ni, struct mbuf *m)
 {
 	struct ether_header *eh = mtod(m, struct ether_header *);
 	struct ifnet *ifp = vap->iv_ifp;
 
 	/* clear driver/net80211 flags before passing up */
 	m->m_flags &= ~(M_MCAST | M_BCAST);
 	m_clrprotoflags(m);
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_HOSTAP,
 	    ("gack, opmode %d", vap->iv_opmode));
 	/*
 	 * Do accounting.
 	 */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	IEEE80211_NODE_STAT(ni, rx_data);
 	IEEE80211_NODE_STAT_ADD(ni, rx_bytes, m->m_pkthdr.len);
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		m->m_flags |= M_MCAST;		/* XXX M_BCAST? */
 		IEEE80211_NODE_STAT(ni, rx_mcast);
 	} else
 		IEEE80211_NODE_STAT(ni, rx_ucast);
 
 	/* perform as a bridge within the AP */
 	if ((vap->iv_flags & IEEE80211_F_NOBRIDGE) == 0) {
 		/* mcopy, when set, is the mbuf to transmit back out. */
 		struct mbuf *mcopy = NULL;
 
 		if (m->m_flags & M_MCAST) {
 			/*
 			 * Multicast: transmit a duplicate and still pass
 			 * the original up.  A failed duplication is
 			 * counted as an output error.
 			 */
 			mcopy = m_dup(m, IEEE80211_M_NOWAIT);
 			if (mcopy == NULL)
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			else
 				mcopy->m_flags |= M_MCAST;
 		} else {
 			/*
 			 * Check if the destination is associated with the
 			 * same vap and authorized to receive traffic.
 			 * Beware of traffic destined for the vap itself;
 			 * sending it will not work; just let it be delivered
 			 * normally.
 			 */
 			struct ieee80211_node *sta = ieee80211_find_vap_node(
 			     &vap->iv_ic->ic_sta, vap, eh->ether_dhost);
 			if (sta != NULL) {
 				if (ieee80211_node_is_authorized(sta)) {
 					/*
 					 * Beware of sending to ourself; this
 					 * needs to happen via the normal
 					 * input path.
 					 */
 					if (sta != vap->iv_bss) {
 						/*
 						 * Transmit the original; nothing
 						 * is left to pass up.
 						 */
 						mcopy = m;
 						m = NULL;
 					}
 				} else {
 					vap->iv_stats.is_rx_unauth++;
 					IEEE80211_NODE_STAT(sta, rx_unauth);
 				}
 				/* Release the node found by the lookup above. */
 				ieee80211_free_node(sta);
 			}
 		}
 		if (mcopy != NULL)
 			(void) ieee80211_vap_xmitpkt(vap, mcopy);
 	}
 	if (m != NULL) {
 		/*
 		 * Mark frame as coming from vap's interface.
 		 */
 		m->m_pkthdr.rcvif = ifp;
 		if (m->m_flags & M_MCAST) {
 			/*
 			 * Spam DWDS vap's w/ multicast traffic.
 			 */
 			/* XXX only if dwds in use? */
 			ieee80211_dwds_mcast(vap, m);
 		}
 		if (ni->ni_vlan != 0) {
 			/* attach vlan tag */
 			m->m_pkthdr.ether_vtag = ni->ni_vlan;
 			m->m_flags |= M_VLANTAG;
 		}
 		ifp->if_input(ifp, m);
 	}
 }
 
 /*
  * Decide if a received management frame should be
  * printed when debugging is enabled.  This filters some
  * of the less interesting frames that come frequently
  * (e.g. beacons).
  */
 static __inline int
 doprint(struct ieee80211vap *vap, int subtype)
 {
 	/* Beacons are only worth printing while a scan is in progress. */
 	if (subtype == IEEE80211_FC0_SUBTYPE_BEACON)
 		return (vap->iv_ic->ic_flags & IEEE80211_F_SCAN);
 	/* Probe requests are never printed. */
 	if (subtype == IEEE80211_FC0_SUBTYPE_PROBE_REQ)
 		return (0);
 	/* Everything else is printed. */
 	return (1);
 }
 
 /*
  * Process a received frame.  The node associated with the sender
  * should be supplied.  If nothing was found in the node table then
  * the caller is assumed to supply a reference to iv_bss instead.
  * The RSSI and a timestamp are also supplied.  The RSSI data is used
  * during AP scanning to select a AP to associate with; it can have
  * any units so long as values have consistent units and higher values
  * mean ``better signal''.  The receive timestamp is currently not used
  * by the 802.11 layer.
  *
  * Returns the frame type taken from the frame control field, or the
  * initial "undefined" value when the frame is rejected before the type
  * has been parsed.  Every path that reaches the out label with a
  * non-NULL mbuf frees it there (after tapping it to radiotap if that
  * has not happened yet); paths that return earlier, or that set m to
  * NULL, have passed the mbuf on to another routine.
  */
 static int
 hostap_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct ether_header *eh;
 	int hdrspace, need_tap = 1;	/* mbuf need to be tapped. */
 	uint8_t dir, type, subtype, qos;
 	uint8_t *bssid;
 	int is_hw_decrypted = 0;
 	int has_decrypted = 0;
 
 	/*
 	 * Some devices do hardware decryption all the way through
 	 * to pretending the frame wasn't encrypted in the first place.
 	 * So, tag it appropriately so it isn't discarded inappropriately.
 	 */
 	if ((rxs != NULL) && (rxs->c_pktflags & IEEE80211_RX_F_DECRYPTED))
 		is_hw_decrypted = 1;
 
 	if (m->m_flags & M_AMPDU_MPDU) {
 		/*
 		 * Fastpath for A-MPDU reorder q resubmission.  Frames
 		 * w/ M_AMPDU_MPDU marked have already passed through
 		 * here but were received out of order and been held on
 		 * the reorder queue.  When resubmitted they are marked
 		 * with the M_AMPDU_MPDU flag and we can bypass most of
 		 * the normal processing.
 		 */
 		wh = mtod(m, struct ieee80211_frame *);
 		type = IEEE80211_FC0_TYPE_DATA;
 		dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 		subtype = IEEE80211_FC0_SUBTYPE_QOS_DATA;
 		hdrspace = ieee80211_hdrspace(ic, wh);	/* XXX optimize? */
 		/* NB: this jumps into the data case of the switch below. */
 		goto resubmit_ampdu;
 	}
 
 	KASSERT(ni != NULL, ("null node"));
 	/* Any received frame reloads the sender's inactivity counter. */
 	ni->ni_inact = ni->ni_inact_reload;
 
 	type = -1;			/* undefined */
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields past outside
 	 * ieee80211_frame_min w/o first validating the data is
 	 * present.
 	 */
 	wh = mtod(m, struct ieee80211_frame *);
 
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version, fc %02x:%02x",
 		    wh->i_fc[0], wh->i_fc[1]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 	/* The bssid and sequence checks are skipped while scanning. */
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		/*
 		 * Pick the address to compare against our bssid: addr1
 		 * when a DS direction bit is set or for control frames,
 		 * otherwise addr3, once the frame is known to be long
 		 * enough to hold a full 3-address header.
 		 */
 		if (dir != IEEE80211_FC1_DIR_NODS)
 			bssid = wh->i_addr1;
 		else if (type == IEEE80211_FC0_TYPE_CTL)
 			bssid = wh->i_addr1;
 		else {
 			if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 				IEEE80211_DISCARD_MAC(vap,
 				    IEEE80211_MSG_ANY, ni->ni_macaddr,
 				    NULL, "too short (2): len %u",
 				    m->m_pkthdr.len);
 				vap->iv_stats.is_rx_tooshort++;
 				goto out;
 			}
 			bssid = wh->i_addr3;
 		}
 		/*
 		 * Validate the bssid.
 		 */
 		if (!(type == IEEE80211_FC0_TYPE_MGT &&
 		      subtype == IEEE80211_FC0_SUBTYPE_BEACON) &&
 		    !IEEE80211_ADDR_EQ(bssid, vap->iv_bss->ni_bssid) &&
 		    !IEEE80211_ADDR_EQ(bssid, ifp->if_broadcastaddr)) {
 			/* not interested in */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    bssid, NULL, "%s", "not to bss");
 			vap->iv_stats.is_rx_wrongbss++;
 			goto out;
 		}
 
 		/* Fold this frame's rssi and noise floor into the node. */
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		if (IEEE80211_HAS_SEQ(type, subtype)) {
 			uint8_t tid = ieee80211_gettid(wh);
 			if (IEEE80211_QOS_HAS_SEQ(wh) &&
 			    TID_TO_WME_AC(tid) >= WME_AC_VI)
 				ic->ic_wme.wme_hipri_traffic++;
 			/* Drop the frame if the rx sequence check rejects it. */
 			if (! ieee80211_check_rxseq(ni, wh, bssid, rxs))
 				goto out;
 		}
 	}
 
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (m->m_len < hdrspace &&
 		    (m = m_pullup(m, hdrspace)) == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		/*
 		 * Accept only ToDS frames, plus 4-address (DSToDS)
 		 * frames when DWDS is enabled on the vap.
 		 */
 		if (!(dir == IEEE80211_FC1_DIR_TODS ||
 		     (dir == IEEE80211_FC1_DIR_DSTODS &&
 		      (vap->iv_flags & IEEE80211_F_DWDS)))) {
 			if (dir != IEEE80211_FC1_DIR_DSTODS) {
 				IEEE80211_DISCARD(vap,
 				    IEEE80211_MSG_INPUT, wh, "data",
 				    "incorrect dir 0x%x", dir);
 			} else {
 				IEEE80211_DISCARD(vap,
 				    IEEE80211_MSG_INPUT |
 				    IEEE80211_MSG_WDS, wh,
 				    "4-address data",
 				    "%s", "DWDS not enabled");
 			}
 			vap->iv_stats.is_rx_wrongdir++;
 			goto out;
 		}
 		/* check if source STA is associated */
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "%s", "unknown src");
 			ieee80211_send_error(ni, wh->i_addr2,
 			    IEEE80211_FC0_SUBTYPE_DEAUTH,
 			    IEEE80211_REASON_NOT_AUTHED);
 			vap->iv_stats.is_rx_notassoc++;
 			goto err;
 		}
 		if (ni->ni_associd == 0) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "%s", "unassoc src");
 			IEEE80211_SEND_MGMT(ni,
 			    IEEE80211_FC0_SUBTYPE_DISASSOC,
 			    IEEE80211_REASON_NOT_ASSOCED);
 			vap->iv_stats.is_rx_notassoc++;
 			goto err;
 		}
 
 		/*
 		 * Check for power save state change.
 		 * XXX out-of-order A-MPDU frames?
 		 */
 		if (((wh->i_fc[1] & IEEE80211_FC1_PWR_MGT) ^
 		    (ni->ni_flags & IEEE80211_NODE_PWR_MGT)))
 			vap->iv_node_ps(ni,
 				wh->i_fc[1] & IEEE80211_FC1_PWR_MGT);
 		/*
 		 * For 4-address packets handle WDS discovery
 		 * notifications.  Once a WDS link is setup frames
 		 * are just delivered to the WDS vap (see below).
 		 */
 		if (dir == IEEE80211_FC1_DIR_DSTODS && ni->ni_wdsvap == NULL) {
 			if (!ieee80211_node_is_authorized(ni)) {
 				IEEE80211_DISCARD(vap,
 				    IEEE80211_MSG_INPUT |
 				    IEEE80211_MSG_WDS, wh,
 				    "4-address data",
 				    "%s", "unauthorized port");
 				vap->iv_stats.is_rx_unauth++;
 				IEEE80211_NODE_STAT(ni, rx_unauth);
 				goto err;
 			}
 			/* NB: returns without freeing m here. */
 			ieee80211_dwds_discover(ni, m);
 			return type;
 		}
 
 		/*
 		 * Handle A-MPDU re-ordering.  If the frame is to be
 		 * processed directly then ieee80211_ampdu_reorder
 		 * will return 0; otherwise it has consumed the mbuf
 		 * and we should do nothing more with it.
 		 */
 		if ((m->m_flags & M_AMPDU) &&
 		    ieee80211_ampdu_reorder(ni, m, rxs) != 0) {
 			m = NULL;
 			goto out;
 		}
 	resubmit_ampdu:
 
 		/*
 		 * Handle privacy requirements.  Note that we
 		 * must not be preempted from here until after
 		 * we (potentially) call ieee80211_crypto_demic;
 		 * otherwise we may violate assumptions in the
 		 * crypto cipher modules used to do delayed update
 		 * of replay sequence numbers.
 		 */
 		if (is_hw_decrypted || IEEE80211_IS_PROTECTED(wh)) {
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "WEP", "%s", "PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				IEEE80211_NODE_STAT(ni, rx_noprivacy);
 				goto out;
 			}
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				IEEE80211_NODE_STAT(ni, rx_wepfail);
 				goto out;
 			}
 			/* Re-fetch the header pointer and clear the protected bit. */
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 			has_decrypted = 1;
 		} else {
 			/* XXX M_WEP and IEEE80211_F_PRIVACY */
 			key = NULL;
 		}
 
 		/*
 		 * Save QoS bits for use below--before we strip the header.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_QOS_DATA)
 			qos = ieee80211_getqos(wh)[0];
 		else
 			qos = 0;
 
 		/*
 		 * Next up, any fragmentation.
 		 */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace, has_decrypted);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = NULL;		/* no longer valid, catch any uses */
 
 		/*
 		 * Next strip any MSDU crypto bits.
 		 */
 		if (key != NULL && !ieee80211_crypto_demic(vap, key, m, 0)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "demic error");
 			vap->iv_stats.is_rx_demicfail++;
 			IEEE80211_NODE_STAT(ni, rx_demicfail);
 			goto out;
 		}
 		/* copy to listener after decrypt */
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		need_tap = 0;
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = ieee80211_decap(vap, m, hdrspace, qos);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		/*
 		 * The Ethernet header is only inspected for non-A-MSDU
 		 * frames; for A-MSDU frames eh stays NULL.
 		 */
 		if (!(qos & IEEE80211_QOS_AMSDU))
 			eh = mtod(m, struct ether_header *);
 		else
 			eh = NULL;
 		if (!ieee80211_node_is_authorized(ni)) {
 			/*
 			 * Deny any non-PAE frames received prior to
 			 * authorization.  For open/shared-key
 			 * authentication the port is mark authorized
 			 * after authentication completes.  For 802.1x
 			 * the port is not marked authorized by the
 			 * authenticator until the handshake has completed.
 			 */
 			if (eh == NULL ||
 			    eh->ether_type != htons(ETHERTYPE_PAE)) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 				    ni->ni_macaddr, "data", "unauthorized or "
 				    "unknown port: ether type 0x%x len %u",
 				    eh == NULL ? -1 : eh->ether_type,
 				    m->m_pkthdr.len);
 				vap->iv_stats.is_rx_unauth++;
 				IEEE80211_NODE_STAT(ni, rx_unauth);
 				goto err;
 			}
 		} else {
 			/*
 			 * When denying unencrypted frames, discard
 			 * any non-PAE frames received without encryption.
 			 */
 			if ((vap->iv_flags & IEEE80211_F_DROPUNENC) &&
 			    ((has_decrypted == 0) && (m->m_flags & M_WEP) == 0) &&
 			    (is_hw_decrypted == 0) &&
 			    (eh == NULL ||
 			     eh->ether_type != htons(ETHERTYPE_PAE))) {
 				/*
 				 * Drop unencrypted frames.
 				 */
 				vap->iv_stats.is_rx_unencrypted++;
 				IEEE80211_NODE_STAT(ni, rx_unencrypted);
 				goto out;
 			}
 		}
 		/* XXX require HT? */
 		if (qos & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 			m = ieee80211_decap_fastframe(vap, ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 #endif
 		}
 		/*
 		 * 4-address frames on an established WDS link go to the
 		 * WDS vap; everything else is delivered on this vap.
 		 */
 		if (dir == IEEE80211_FC1_DIR_DSTODS && ni->ni_wdsvap != NULL)
 			ieee80211_deliver_data(ni->ni_wdsvap, ni, m);
 		else
 			hostap_deliver_data(vap, ni, m);
 		return IEEE80211_FC0_TYPE_DATA;
 
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "mgt", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 		if (IEEE80211_IS_MULTICAST(wh->i_addr2)) {
 			/* ensure return frames are unicast */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    wh, NULL, "source is multicast: %s",
 			    ether_sprintf(wh->i_addr2));
 			vap->iv_stats.is_rx_mgtdiscard++;	/* XXX stat */
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if ((ieee80211_msg_debug(vap) && doprint(vap, subtype)) ||
 		    ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 		if (IEEE80211_IS_PROTECTED(wh)) {
 			if (subtype != IEEE80211_FC0_SUBTYPE_AUTH) {
 				/*
 				 * Only shared key auth frames with a challenge
 				 * should be encrypted, discard all others.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, NULL,
 				    "%s", "WEP set but not permitted");
 				vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 				goto out;
 			}
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, NULL, "%s", "WEP set but PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				goto out;
 			}
 			hdrspace = ieee80211_hdrspace(ic, wh);
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				goto out;
 			}
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 			has_decrypted = 1;
 		}
 		/*
 		 * Pass the packet to radiotap before calling iv_recv_mgmt().
 		 * Otherwise iv_recv_mgmt() might pass another packet to
 		 * radiotap, resulting in out of order packet captures.
 		 */
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		need_tap = 0;
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		/* The mbuf is still freed at out after the handler returns. */
 		goto out;
 
 	case IEEE80211_FC0_TYPE_CTL:
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		vap->iv_recv_ctl(ni, m, subtype);
 		goto out;
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "bad", "frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	/* Count an input error, then fall into the common cleanup. */
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	/* Tap the frame if that has not happened yet, then free it. */
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 }
 
 static void
 hostap_auth_open(struct ieee80211_node *ni, struct ieee80211_frame *wh,
     int rssi, int nf, uint16_t seq, uint16_t status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	int defer_to_agent;
 
 	KASSERT(vap->iv_state == IEEE80211_S_RUN, ("state %d", vap->iv_state));
 
 	/*
 	 * A node whose auth mode is shared-key may not authenticate
 	 * with the open algorithm; answer with an algorithm error.
 	 */
 	if (ni->ni_authmode == IEEE80211_AUTH_SHARED) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "open auth",
 		    "bad sta auth mode %u", ni->ni_authmode);
 		vap->iv_stats.is_rx_bad_auth++;	/* XXX */
 		/*
 		 * Throw away challenge text left behind by an earlier
 		 * shared key attempt that failed before this open
 		 * auth was tried.
 		 */
 		if (ni->ni_challenge != NULL) {
 			IEEE80211_FREE(ni->ni_challenge, M_80211_NODE);
 			ni->ni_challenge = NULL;
 		}
 		/* XXX hack to workaround calling convention */
 		ieee80211_send_error(ni, wh->i_addr2,
 		    IEEE80211_FC0_SUBTYPE_AUTH,
 		    (seq + 1) | (IEEE80211_STATUS_ALG<<16));
 		return;
 	}
 
 	/* Anything but the initial open request is counted and ignored. */
 	if (seq != IEEE80211_AUTH_OPEN_REQUEST) {
 		vap->iv_stats.is_rx_bad_auth++;
 		return;
 	}
 
 	/*
 	 * Open authentication requests are always accepted.  A known
 	 * node gains a reference unless it already holds the auth one;
 	 * a sender still represented by the bss node gets its own node.
 	 */
 	if (ni != vap->iv_bss) {
 		if ((ni->ni_flags & IEEE80211_NODE_AREF) == 0)
 			(void) ieee80211_ref_node(ni);
 	} else {
 		ni = ieee80211_dup_bss(vap, wh->i_addr2);
 		if (ni == NULL)
 			return;
 	}
 	/*
 	 * AREF records that the reference count was bumped so the node
 	 * survives the transaction; ASSOCID requires a valid association
 	 * id before outbound traffic is permitted.
 	 */
 	ni->ni_flags |= IEEE80211_NODE_AREF | IEEE80211_NODE_ASSOCID;
 
 	defer_to_agent = vap->iv_acl != NULL &&
 	    vap->iv_acl->iac_getpolicy(vap) == IEEE80211_MACCMD_POLICY_RADIUS;
 	if (defer_to_agent) {
 		/*
 		 * With a RADIUS ACL policy the decision belongs to a
 		 * user agent: post an event and let a later MLME call
 		 * settle it.  Without an agent the node is eventually
 		 * reclaimed for inactivity.
 		 */
 		IEEE80211_NOTE_MAC(vap,
 		    IEEE80211_MSG_AUTH | IEEE80211_MSG_ACL, ni->ni_macaddr,
 		    "%s", "station authentication defered (radius acl)");
 		ieee80211_notify_node_auth(ni);
 		return;
 	}
 
 	IEEE80211_SEND_MGMT(ni, IEEE80211_FC0_SUBTYPE_AUTH, seq + 1);
 	IEEE80211_NOTE_MAC(vap,
 	    IEEE80211_MSG_DEBUG | IEEE80211_MSG_AUTH, ni->ni_macaddr,
 	    "%s", "station authenticated (open)");
 	/*
 	 * Unless 802.1x is in use, authorize the port right away so
 	 * traffic can flow.
 	 */
 	if (ni->ni_authmode != IEEE80211_AUTH_8021X)
 		ieee80211_node_authorize(ni);
 }
 
 /*
  * Process a shared key authentication frame.
  *
  * frm/efrm bound the bytes that follow the fixed part of the frame
  * handed in by the caller; only the first element found there is
  * inspected and it is remembered when it is a challenge element.
  * seq selects the step of the exchange:
  *
  *  o REQUEST: take a reference on the node (duplicating the bss node
  *    when the sender is not yet known), allocate and randomize a
  *    challenge and reply with seq + 1, unless the ACL policy is
  *    RADIUS in which case the decision is handed to a user agent.
  *  o RESPONSE: compare the returned challenge text with the one
  *    recorded on the node and authorize the node when they match.
  *
  * Failures that jump to "bad" send an error reply built from seq + 1
  * and estatus.  The status argument is not examined here.
  */
 static void
 hostap_auth_shared(struct ieee80211_node *ni, struct ieee80211_frame *wh,
     uint8_t *frm, uint8_t *efrm, int rssi, int nf,
     uint16_t seq, uint16_t status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	uint8_t *challenge;
 	int estatus;
 
 	KASSERT(vap->iv_state == IEEE80211_S_RUN, ("state %d", vap->iv_state));
 
 	/*
 	 * NB: this can happen as we allow pre-shared key
 	 * authentication to be enabled w/o wep being turned
 	 * on so that configuration of these can be done
 	 * in any order.  It may be better to enforce the
 	 * ordering in which case this check would just be
 	 * for sanity/consistency.
 	 */
 	if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "shared key auth",
 		    "%s", " PRIVACY is disabled");
 		estatus = IEEE80211_STATUS_ALG;
 		goto bad;
 	}
 	/*
 	 * Pre-shared key authentication is evil; accept
 	 * it only if explicitly configured (it is supported
 	 * mainly for compatibility with clients like Mac OS X).
 	 */
 	if (ni->ni_authmode != IEEE80211_AUTH_AUTO &&
 	    ni->ni_authmode != IEEE80211_AUTH_SHARED) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "shared key auth",
 		    "bad sta auth mode %u", ni->ni_authmode);
 		vap->iv_stats.is_rx_bad_auth++;	/* XXX maybe a unique error? */
 		estatus = IEEE80211_STATUS_ALG;
 		goto bad;
 	}
 
 	/*
 	 * Look at the (single) element following the fixed fields, if
 	 * any.  Its length must fit inside the frame before it is used.
 	 */
 	challenge = NULL;
 	if (frm + 1 < efrm) {
 		if ((frm[1] + 2) > (efrm - frm)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key auth",
 			    "ie %d/%d too long",
 			    frm[0], (frm[1] + 2) - (efrm - frm));
 			vap->iv_stats.is_rx_bad_auth++;
 			estatus = IEEE80211_STATUS_CHALLENGE;
 			goto bad;
 		}
 		if (*frm == IEEE80211_ELEMID_CHALLENGE)
 			challenge = frm;
 		frm += frm[1] + 2;
 	}
 	/*
 	 * Steps that must carry challenge text: require the element
 	 * and insist on the fixed challenge length so the comparison
 	 * done for a response below stays within both buffers.
 	 */
 	switch (seq) {
 	case IEEE80211_AUTH_SHARED_CHALLENGE:
 	case IEEE80211_AUTH_SHARED_RESPONSE:
 		if (challenge == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key auth",
 			    "%s", "no challenge");
 			vap->iv_stats.is_rx_bad_auth++;
 			estatus = IEEE80211_STATUS_CHALLENGE;
 			goto bad;
 		}
 		if (challenge[1] != IEEE80211_CHALLENGE_LEN) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key auth",
 			    "bad challenge len %d", challenge[1]);
 			vap->iv_stats.is_rx_bad_auth++;
 			estatus = IEEE80211_STATUS_CHALLENGE;
 			goto bad;
 		}
 		/* FALLTHROUGH */
 	default:
 		break;
 	}
 	switch (seq) {
 	case IEEE80211_AUTH_SHARED_REQUEST:
 	{
 #ifdef IEEE80211_DEBUG
 		/* NB: only consumed by the debug note below */
 		bool allocbs;
 #endif
 
 		if (ni == vap->iv_bss) {
 			/* sender has no node of its own yet; create one */
 			ni = ieee80211_dup_bss(vap, wh->i_addr2);
 			if (ni == NULL) {
 				/* NB: no way to return an error */
 				return;
 			}
 #ifdef IEEE80211_DEBUG
 			allocbs = 1;
 #endif
 		} else {
 			/* take the auth reference only once per node */
 			if ((ni->ni_flags & IEEE80211_NODE_AREF) == 0)
 				(void) ieee80211_ref_node(ni);
 #ifdef IEEE80211_DEBUG
 			allocbs = 0;
 #endif
 		}
 		/*
 		 * Mark the node as referenced to reflect that its
 		 * reference count has been bumped to ensure it remains
 		 * after the transaction completes.
 		 */
 		ni->ni_flags |= IEEE80211_NODE_AREF;
 		/*
 		 * Mark the node as requiring a valid association id
 		 * before outbound traffic is permitted.
 		 */
 		ni->ni_flags |= IEEE80211_NODE_ASSOCID;
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		if (!ieee80211_alloc_challenge(ni)) {
 			/* NB: don't return error so they rexmit */
 			return;
 		}
 		net80211_get_random_bytes(ni->ni_challenge,
 			IEEE80211_CHALLENGE_LEN);
 		IEEE80211_NOTE(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_AUTH,
 		    ni, "shared key %sauth request", allocbs ? "" : "re");
 		/*
 		 * When the ACL policy is set to RADIUS we defer the
 		 * authorization to a user agent.  Dispatch an event,
 		 * a subsequent MLME call will decide the fate of the
 		 * station.  If the user agent is not present then the
 		 * node will be reclaimed due to inactivity.
 		 */
 		if (vap->iv_acl != NULL &&
 		    vap->iv_acl->iac_getpolicy(vap) == IEEE80211_MACCMD_POLICY_RADIUS) {
 			IEEE80211_NOTE_MAC(vap,
 			    IEEE80211_MSG_AUTH | IEEE80211_MSG_ACL,
 			    ni->ni_macaddr,
 			    "%s", "station authentication defered (radius acl)");
 			ieee80211_notify_node_auth(ni);
 			return;
 		}
 		break;
 	}
 	case IEEE80211_AUTH_SHARED_RESPONSE:
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key response",
 			    "%s", "unknown station");
 			/* NB: don't send a response */
 			return;
 		}
 		if (ni->ni_challenge == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key response",
 			    "%s", "no challenge recorded");
 			vap->iv_stats.is_rx_bad_auth++;
 			estatus = IEEE80211_STATUS_CHALLENGE;
 			goto bad;
 		}
 		/* NB: challenge is non-NULL with a verified length here */
 		if (memcmp(ni->ni_challenge, &challenge[2],
 			   challenge[1]) != 0) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key response",
 			    "%s", "challenge mismatch");
 			vap->iv_stats.is_rx_auth_fail++;
 			estatus = IEEE80211_STATUS_CHALLENGE;
 			goto bad;
 		}
 		IEEE80211_NOTE(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_AUTH,
 		    ni, "%s", "station authenticated (shared key)");
 		ieee80211_node_authorize(ni);
 		break;
 	default:
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "shared key auth",
 		    "bad seq %d", seq);
 		vap->iv_stats.is_rx_bad_auth++;
 		estatus = IEEE80211_STATUS_SEQUENCE;
 		goto bad;
 	}
 	/* success: answer with the next sequence number */
 	IEEE80211_SEND_MGMT(ni, IEEE80211_FC0_SUBTYPE_AUTH, seq + 1);
 	return;
 bad:
 	/*
 	 * Send an error response; but only when operating as an AP.
 	 */
 	/* XXX hack to workaround calling convention */
 	/* NB: the status rides in the upper 16 bits of the argument */
 	ieee80211_send_error(ni, wh->i_addr2,
 	    IEEE80211_FC0_SUBTYPE_AUTH,
 	    (seq + 1) | (estatus<<16));
 }
 
 /*
  * Convert a WPA cipher selector OUI to an internal
  * cipher algorithm.  Where appropriate we also
  * record any key length.
  */
 static int
 wpa_cipher(const uint8_t *sel, uint8_t *keylen, uint8_t *cipher)
 {
 #define	WPA_SEL(x)	(((x)<<24)|WPA_OUI)
 	uint32_t w = le32dec(sel);
 
 	switch (w) {
 	case WPA_SEL(WPA_CSE_NULL):
 		*cipher = IEEE80211_CIPHER_NONE;
 		break;
 	case WPA_SEL(WPA_CSE_WEP40):
 		if (keylen)
 			*keylen = 40 / NBBY;
 		*cipher = IEEE80211_CIPHER_WEP;
 		break;
 	case WPA_SEL(WPA_CSE_WEP104):
 		if (keylen)
 			*keylen = 104 / NBBY;
 		*cipher = IEEE80211_CIPHER_WEP;
 		break;
 	case WPA_SEL(WPA_CSE_TKIP):
 		*cipher = IEEE80211_CIPHER_TKIP;
 		break;
 	case WPA_SEL(WPA_CSE_CCMP):
 		*cipher = IEEE80211_CIPHER_AES_CCM;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 #undef WPA_SEL
 }
 
 /*
  * Translate a WPA key management/authentication suite selector
  * into the matching WPA_ASE_* code.  Selectors that are not
  * recognized yield 0.
  */
 static int
 wpa_keymgmt(const uint8_t *sel)
 {
 #define	WPA_SEL(x)	(((x)<<24)|WPA_OUI)
 	int alg;
 
 	switch (le32dec(sel)) {
 	case WPA_SEL(WPA_ASE_8021X_UNSPEC):
 		alg = WPA_ASE_8021X_UNSPEC;
 		break;
 	case WPA_SEL(WPA_ASE_8021X_PSK):
 		alg = WPA_ASE_8021X_PSK;
 		break;
 	case WPA_SEL(WPA_ASE_NONE):
 		alg = WPA_ASE_NONE;
 		break;
 	default:
 		alg = 0;		/* NB: so is discarded */
 		break;
 	}
 	return alg;
 #undef WPA_SEL
 }
 
 /*
  * Parse a WPA information element to collect parameters.
  * Note that we do not validate security parameters; that
  * is handled by the authenticator; the parsing done here
  * is just for internal use in making operational decisions.
  *
  * frm points at the element id byte.  rsn is cleared and filled in
  * once the version has been accepted; it is left untouched when the
  * element is rejected earlier.  wh is used only for diagnostics.
  *
  * Returns 0 on success, otherwise an IEEE80211_REASON_* code that
  * says why the element was refused.
  */
 static int
 ieee80211_parse_wpa(struct ieee80211vap *vap, const uint8_t *frm,
 	struct ieee80211_rsnparms *rsn, const struct ieee80211_frame *wh)
 {
 	uint8_t len = frm[1];
 	uint32_t w;
 	int error, n;
 
 	if ((vap->iv_flags & IEEE80211_F_WPA1) == 0) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "not WPA, flags 0x%x", vap->iv_flags);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	/*
 	 * Check the length once for fixed parts: OUI, type,
 	 * version, mcast cipher, and 2 selector counts.
 	 * Other, variable-length data, must be checked separately.
 	 */
 	if (len < 14) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "too short, len %u", len);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	/* skip id, length, OUI and type */
 	frm += 6, len -= 4;		/* NB: len is payload only */
 	/* NB: iswpaoui already validated the OUI and type */
 	w = le16dec(frm);
 	if (w != WPA_VERSION) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "bad version %u", w);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	frm += 2, len -= 2;
 
 	memset(rsn, 0, sizeof(*rsn));
 
 	/* multicast/group cipher */
 	error = wpa_cipher(frm, &rsn->rsn_mcastkeylen, &rsn->rsn_mcastcipher);
 	if (error != 0) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "unknown mcast cipher suite %08X",
 		    le32dec(frm));
 		return IEEE80211_REASON_GROUP_CIPHER_INVALID;
 	}
 	frm += 4, len -= 4;
 
 	/* unicast ciphers */
 	n = le16dec(frm);
 	frm += 2, len -= 2;
 	/* NB: +2 reserves room for the key mgmt count that follows */
 	if (len < n*4+2) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "ucast cipher data too short; len %u, n %u",
 		    len, n);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	/* collect the recognized suites as a bitmask; skip unknown ones */
 	w = 0;
 	for (; n > 0; n--) {
 		uint8_t cipher;
 
 		error = wpa_cipher(frm, &rsn->rsn_ucastkeylen, &cipher);
 		if (error == 0)
 			w |= 1 << cipher;
 
 		frm += 4, len -= 4;
 	}
 	if (w == 0) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "no usable pairwise cipher suite found (w=%d)",
 		    w);
 		return IEEE80211_REASON_PAIRWISE_CIPHER_INVALID;
 	}
 	/* XXX other? */
 	if (w & (1 << IEEE80211_CIPHER_AES_CCM))
 		rsn->rsn_ucastcipher = IEEE80211_CIPHER_AES_CCM;
 	else
 		rsn->rsn_ucastcipher = IEEE80211_CIPHER_TKIP;
 
 	/* key management algorithms */
 	n = le16dec(frm);
 	frm += 2, len -= 2;
 	if (len < n*4) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "key mgmt alg data too short; len %u, n %u",
 		    len, n);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	w = 0;
 	for (; n > 0; n--) {
 		w |= wpa_keymgmt(frm);
 		frm += 4, len -= 4;
 	}
 	if (w & WPA_ASE_8021X_UNSPEC)
 		rsn->rsn_keymgmt = WPA_ASE_8021X_UNSPEC;
 	else
 		rsn->rsn_keymgmt = WPA_ASE_8021X_PSK;
 
 	/*
 	 * Optional capabilities.  len counts the bytes still unread,
 	 * so the 16-bit field is present as soon as two bytes remain;
 	 * testing for "more than two" dropped it whenever it was the
 	 * last thing in the element.
 	 */
 	if (len >= 2)
 		rsn->rsn_caps = le16dec(frm);
 
 	return 0;
 }
 
 /*
  * Convert an RSN cipher selector OUI to an internal
  * cipher algorithm.  Where appropriate we also
  * record any key length.
  *
  * The cipher is stored through cipher; for the WEP suites the key
  * length in bytes is stored through keylen when it is non-NULL.
  * Returns 0 on success or EINVAL when the selector is unknown, in
  * which case nothing is written.
  */
 static int
 rsn_cipher(const uint8_t *sel, uint8_t *keylen, uint8_t *cipher)
 {
 #define	RSN_SEL(x)	(((x)<<24)|RSN_OUI)
 	uint32_t w = le32dec(sel);
 
 	switch (w) {
 	case RSN_SEL(RSN_CSE_NULL):
 		*cipher = IEEE80211_CIPHER_NONE;
 		break;
 	case RSN_SEL(RSN_CSE_WEP40):
 		if (keylen)
 			*keylen = 40 / NBBY;
 		*cipher = IEEE80211_CIPHER_WEP;
 		break;
 	case RSN_SEL(RSN_CSE_WEP104):
 		if (keylen)
 			*keylen = 104 / NBBY;
 		*cipher = IEEE80211_CIPHER_WEP;
 		break;
 	case RSN_SEL(RSN_CSE_TKIP):
 		*cipher = IEEE80211_CIPHER_TKIP;
 		break;
 	case RSN_SEL(RSN_CSE_CCMP):
 		*cipher = IEEE80211_CIPHER_AES_CCM;
 		break;
 	case RSN_SEL(RSN_CSE_WRAP):
 		*cipher = IEEE80211_CIPHER_AES_OCB;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 	/* NB: retire the macro defined above, not the WPA one */
 #undef RSN_SEL
 }
 
 /*
  * Translate an RSN key management/authentication suite selector
  * into the matching RSN_ASE_* code.  Selectors that are not
  * recognized yield 0.
  */
 static int
 rsn_keymgmt(const uint8_t *sel)
 {
 #define	RSN_SEL(x)	(((x)<<24)|RSN_OUI)
 	int alg;
 
 	switch (le32dec(sel)) {
 	case RSN_SEL(RSN_ASE_8021X_UNSPEC):
 		alg = RSN_ASE_8021X_UNSPEC;
 		break;
 	case RSN_SEL(RSN_ASE_8021X_PSK):
 		alg = RSN_ASE_8021X_PSK;
 		break;
 	case RSN_SEL(RSN_ASE_NONE):
 		alg = RSN_ASE_NONE;
 		break;
 	default:
 		alg = 0;		/* NB: so is discarded */
 		break;
 	}
 	return alg;
 #undef RSN_SEL
 }
 
 /*
  * Parse a WPA/RSN information element to collect parameters
  * and validate the parameters against what has been
  * configured for the system.
  */
 static int
 ieee80211_parse_rsn(struct ieee80211vap *vap, const uint8_t *frm,
 	struct ieee80211_rsnparms *rsn, const struct ieee80211_frame *wh)
 {
 	uint8_t len = frm[1];
 	uint32_t w;
 	int error, n;
 
 	/*
 	 * Check the length once for fixed parts: 
 	 * version, mcast cipher, and 2 selector counts.
 	 * Other, variable-length data, must be checked separately.
 	 */
 	if ((vap->iv_flags & IEEE80211_F_WPA2) == 0) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "WPA", "not RSN, flags 0x%x", vap->iv_flags);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	/* XXX may be shorter */
 	if (len < 10) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "too short, len %u", len);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	frm += 2;
 	w = le16dec(frm);
 	if (w != RSN_VERSION) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "bad version %u", w);
 		return IEEE80211_REASON_UNSUPP_RSN_IE_VERSION;
 	}
 	frm += 2, len -= 2;
 
 	memset(rsn, 0, sizeof(*rsn));
 
 	/* multicast/group cipher */
 	error = rsn_cipher(frm, &rsn->rsn_mcastkeylen, &rsn->rsn_mcastcipher);
 	if (error != 0) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "unknown mcast cipher suite %08X",
 		    le32dec(frm));
 		return IEEE80211_REASON_GROUP_CIPHER_INVALID;
 	}
 	if (rsn->rsn_mcastcipher == IEEE80211_CIPHER_NONE) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "invalid mcast cipher suite %d",
 		    rsn->rsn_mcastcipher);
 		return IEEE80211_REASON_GROUP_CIPHER_INVALID;
 	}
 	frm += 4, len -= 4;
 
 	/* unicast ciphers */
 	n = le16dec(frm);
 	frm += 2, len -= 2;
 	if (len < n*4+2) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "ucast cipher data too short; len %u, n %u",
 		    len, n);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	w = 0;
 
 	for (; n > 0; n--) {
 		uint8_t cipher;
 
 		error = rsn_cipher(frm, &rsn->rsn_ucastkeylen, &cipher);
 		if (error == 0)
 			w |= 1 << cipher;
 
 		frm += 4, len -= 4;
 	}
         if (w & (1 << IEEE80211_CIPHER_AES_CCM))
                 rsn->rsn_ucastcipher = IEEE80211_CIPHER_AES_CCM;
 	else if (w & (1 << IEEE80211_CIPHER_AES_OCB))
 		rsn->rsn_ucastcipher = IEEE80211_CIPHER_AES_OCB;
 	else if (w & (1 << IEEE80211_CIPHER_TKIP))
 		rsn->rsn_ucastcipher = IEEE80211_CIPHER_TKIP;
 	else if ((w & (1 << IEEE80211_CIPHER_NONE)) &&
 	    (rsn->rsn_mcastcipher == IEEE80211_CIPHER_WEP ||
 	     rsn->rsn_mcastcipher == IEEE80211_CIPHER_TKIP))
 		rsn->rsn_ucastcipher = IEEE80211_CIPHER_NONE;
 	else {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "no usable pairwise cipher suite found (w=%d)",
 		    w);
 		return IEEE80211_REASON_PAIRWISE_CIPHER_INVALID;
 	}
 
 	/* key management algorithms */
 	n = le16dec(frm);
 	frm += 2, len -= 2;
 	if (len < n*4) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WPA,
 		    wh, "RSN", "key mgmt alg data too short; len %u, n %u",
 		    len, n);
 		return IEEE80211_REASON_IE_INVALID;
 	}
 	w = 0;
 	for (; n > 0; n--) {
 		w |= rsn_keymgmt(frm);
 		frm += 4, len -= 4;
 	}
 	if (w & RSN_ASE_8021X_UNSPEC)
 		rsn->rsn_keymgmt = RSN_ASE_8021X_UNSPEC;
 	else
 		rsn->rsn_keymgmt = RSN_ASE_8021X_PSK;
 
 	/* optional RSN capabilities */
 	if (len > 2)
 		rsn->rsn_caps = le16dec(frm);
 	/* XXXPMKID */
 
 	return 0;
 }
 
 /*
  * WPA/802.11i handling for an (re)association request.
  *
  * wpa and rsn point at the respective information elements, or are
  * NULL when the station did not send them.  Returns 1 when the
  * station may proceed; rsnparms is filled in by the element parser
  * except on the WPS and TSN paths where nothing is parsed.  On
  * failure ieee80211_node_deauth() is invoked with the reason code
  * and 0 is returned.
  */
 static int
 wpa_assocreq(struct ieee80211_node *ni, struct ieee80211_rsnparms *rsnparms,
 	const struct ieee80211_frame *wh, const uint8_t *wpa,
 	const uint8_t *rsn, uint16_t capinfo)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	uint8_t reason;
 	int ie_missing;
 
 	/* start from a clean slate; the flags are set again below */
 	ni->ni_flags &= ~(IEEE80211_NODE_WPS|IEEE80211_NODE_TSN);
 
 	if (wpa == NULL && rsn == NULL) {
 		/*
 		 * Wi-Fi Protected Setup (WPS) permits
 		 * clients to associate and pass EAPOL frames
 		 * to establish initial credentials.
 		 */
 		if (vap->iv_flags_ext & IEEE80211_FEXT_WPS) {
 			ni->ni_flags |= IEEE80211_NODE_WPS;
 			return 1;
 		}
 		/*
 		 * Transitional Security Network.  Permits clients
 		 * to associate and use WEP while WPA is configured.
 		 */
 		if ((capinfo & IEEE80211_CAPINFO_PRIVACY) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_TSN)) {
 			ni->ni_flags |= IEEE80211_NODE_TSN;
 			return 1;
 		}
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ASSOC | IEEE80211_MSG_WPA,
 		    wh, NULL, "%s", "no WPA/RSN IE in association request");
 		vap->iv_stats.is_rx_assoc_badwpaie++;
 		reason = IEEE80211_REASON_IE_INVALID;
 		goto deauth;
 	}
 
 	/* the station must supply an element the vap is configured for */
 	switch (vap->iv_flags & IEEE80211_F_WPA) {
 	case IEEE80211_F_WPA1:
 		ie_missing = (wpa == NULL);
 		break;
 	case IEEE80211_F_WPA2:
 		ie_missing = (rsn == NULL);
 		break;
 	case IEEE80211_F_WPA1|IEEE80211_F_WPA2:
 		ie_missing = (wpa == NULL && rsn == NULL);
 		break;
 	default:
 		ie_missing = 0;
 		break;
 	}
 	if (ie_missing) {
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ASSOC | IEEE80211_MSG_WPA,
 		    wh, NULL,
 		    "%s", "missing WPA/RSN IE in association request");
 		vap->iv_stats.is_rx_assoc_badwpaie++;
 		reason = IEEE80211_REASON_IE_INVALID;
 		goto deauth;
 	}
 
 	/* NB: a WPA element takes precedence when both are present */
 	reason = (wpa != NULL) ?
 	    ieee80211_parse_wpa(vap, wpa, rsnparms, wh) :
 	    ieee80211_parse_rsn(vap, rsn, rsnparms, wh);
 	if (reason != 0) {
 		/* XXX wpa->rsn fallback? */
 		/* XXX distinguish WPA/RSN? */
 		vap->iv_stats.is_rx_assoc_badwpaie++;
 		goto deauth;
 	}
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ASSOC | IEEE80211_MSG_WPA, ni,
 	    "%s ie: mc %u/%u uc %u/%u key %u caps 0x%x",
 	    wpa != NULL ? "WPA" : "RSN",
 	    rsnparms->rsn_mcastcipher, rsnparms->rsn_mcastkeylen,
 	    rsnparms->rsn_ucastcipher, rsnparms->rsn_ucastkeylen,
 	    rsnparms->rsn_keymgmt, rsnparms->rsn_caps);
 
 	return 1;
 
 deauth:
 	ieee80211_node_deauth(ni, reason);
 	return 0;
 }
 
 /* XXX find a better place for definition */
 /*
  * Layout of the L2 update frame built by ieee80211_deliver_l2uf():
  * an Ethernet header followed by six payload bytes.  The structure
  * is packed so that sizeof() matches the bytes placed in the mbuf.
  */
 struct l2_update_frame {
 	struct ether_header eh;	/* ether_type is set to the payload length */
 	uint8_t dsap;		/* sent as 0 */
 	uint8_t ssap;		/* sent as 0 */
 	uint8_t control;	/* sent as 0xf5 */
 	uint8_t xid[3];		/* sent as 0x81, 0x80, 0x00 */
 }  __packed;
 
 /*
  * Build a TGf L2UF frame that appears to originate from the given
  * station and hand it to hostap_deliver_data().  The frame primes
  * any bridge when the station roams between ap's on the same wired
  * network.  When no mbuf is available the frame is silently
  * skipped (a note is logged and a stat is bumped).
  */
 static void
 ieee80211_deliver_l2uf(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct l2_update_frame *l2uf;
 	struct mbuf *m;
 
 	m = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ASSOC, ni,
 		    "%s", "no mbuf for l2uf frame");
 		vap->iv_stats.is_rx_nobuf++;	/* XXX not right */
 		return;
 	}
 	l2uf = mtod(m, struct l2_update_frame *);
 
 	/* Ethernet header: broadcast dst, src is the associated STA */
 	IEEE80211_ADDR_COPY(l2uf->eh.ether_dhost, ifp->if_broadcastaddr);
 	IEEE80211_ADDR_COPY(l2uf->eh.ether_shost, ni->ni_macaddr);
 	/* NB: the type field carries the length of what follows */
 	l2uf->eh.ether_type = htons(sizeof(*l2uf) - sizeof(l2uf->eh));
 
 	/* fixed payload */
 	l2uf->dsap = 0;
 	l2uf->ssap = 0;
 	l2uf->control = 0xf5;
 	l2uf->xid[0] = 0x81;
 	l2uf->xid[1] = 0x80;
 	l2uf->xid[2] = 0x00;
 
 	m->m_len = sizeof(*l2uf);
 	m->m_pkthdr.len = m->m_len;
 	hostap_deliver_data(vap, ni, m);
 }
 
 /*
  * Refuse an (re)association request because of a rate set mismatch:
  * log the denial, send the response frame given by resp with status
  * IEEE80211_STATUS_BASIC_RATE and make the node leave.  tag names
  * the rate set in the log message.
  */
 static void
 ratesetmismatch(struct ieee80211_node *ni, const struct ieee80211_frame *wh,
 	int reassoc, int resp, const char *tag, int rate)
 {
 	IEEE80211_NOTE_MAC(ni->ni_vap, IEEE80211_MSG_ANY, wh->i_addr2,
 	    "deny %s request, %s rate set mismatch, rate/MCS %d",
 	    (reassoc != 0) ? "reassoc" : "assoc",
 	    tag,
 	    rate & IEEE80211_RATE_VAL);
 
 	IEEE80211_SEND_MGMT(ni, resp, IEEE80211_STATUS_BASIC_RATE);
 	ieee80211_node_leave(ni);
 }
 
 /*
  * Refuse an (re)association request because of a capability
  * mismatch: log the denial, send the response frame given by resp
  * with status IEEE80211_STATUS_CAPINFO, make the node leave and
  * count the event.  tag names the offending capability in the log
  * message.
  */
 static void
 capinfomismatch(struct ieee80211_node *ni, const struct ieee80211_frame *wh,
 	int reassoc, int resp, const char *tag, int capinfo)
 {
 	/* NB: grab the vap up front; it is used after the node leaves */
 	struct ieee80211vap *vap = ni->ni_vap;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ANY, wh->i_addr2,
 	    "deny %s request, %s mismatch 0x%x",
 	    (reassoc != 0) ? "reassoc" : "assoc",
 	    tag,
 	    capinfo);
 
 	IEEE80211_SEND_MGMT(ni, resp, IEEE80211_STATUS_CAPINFO);
 	ieee80211_node_leave(ni);
 
 	vap->iv_stats.is_rx_assoc_capmismatch++;
 }
 
 /*
  * Refuse an (re)association request because the HT capabilities
  * element is missing: log the denial, send the response frame given
  * by resp with status IEEE80211_STATUS_MISSING_HT_CAPS and make the
  * node leave.
  */
 static void
 htcapmismatch(struct ieee80211_node *ni, const struct ieee80211_frame *wh,
 	int reassoc, int resp)
 {
 	/*
 	 * NB: exactly one %s; the format used to carry a second
 	 * conversion with no matching argument.
 	 */
 	IEEE80211_NOTE_MAC(ni->ni_vap, IEEE80211_MSG_ANY, wh->i_addr2,
 	    "deny %s request, missing HT ie", reassoc ? "reassoc" : "assoc");
 	/* XXX no better code */
 	IEEE80211_SEND_MGMT(ni, resp, IEEE80211_STATUS_MISSING_HT_CAPS);
 	ieee80211_node_leave(ni);
 }
 
 /*
  * Reject an authentication frame that uses an unsupported
  * algorithm: log and count it, then send an auth error to the
  * sender.  seq and status are packed into the single argument
  * passed to ieee80211_send_error().
  */
 static void
 authalgreject(struct ieee80211_node *ni, const struct ieee80211_frame *wh,
 	int algo, int seq, int status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	/* NB: status rides in the upper 16 bits */
 	const int arg = seq | (status << 16);
 
 	IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 	    wh, NULL, "unsupported alg %d", algo);
 	vap->iv_stats.is_rx_auth_unsupported++;
 
 	ieee80211_send_error(ni, wh->i_addr2, IEEE80211_FC0_SUBTYPE_AUTH,
 	    arg);
 }
 
 /*
  * Return non-zero when the HT information element at ie has its
  * operating mode field set to IEEE80211_HTINFO_OPMODE_MIXED.
  */
 static __inline int
 ishtmixed(const uint8_t *ie)
 {
 	const struct ieee80211_ie_htinfo *htinfo = (const void *)ie;
 
 	if ((htinfo->hi_byte2 & IEEE80211_HTINFO_OPMODE) ==
 	    IEEE80211_HTINFO_OPMODE_MIXED)
 		return 1;
 	return 0;
 }
 
 /*
  * Decide whether a station looks like a legacy 11b client from its
  * rate elements.  rates and xrates point at the element id byte of
  * the respective element or are NULL.  Returns 1 only when there is
  * a rates element, no extended rates element, and every listed rate
  * (basic-rate flag masked off) is one of 2, 4, 11 or 22.
  */
 static int
 is11bclient(const uint8_t *rates, const uint8_t *xrates)
 {
 	/* bitmask indexed by rate value */
 	static const uint32_t brates = (1<<2*1)|(1<<2*2)|(1<<11)|(1<<2*11);
 	const uint8_t *rp, *end;
 
 	/* NB: the 11b clients we care about will not have xrates */
 	if (rates == NULL || xrates != NULL)
 		return 0;
 
 	end = rates + 2 + rates[1];
 	for (rp = rates + 2; rp < end; rp++) {
 		const int r = *rp & IEEE80211_RATE_VAL;
 
 		/* NB: range check first so the shift below stays sane */
 		if (r > 2*11)
 			return 0;
 		if ((brates & (1<<r)) == 0)
 			return 0;
 	}
 	return 1;
 }
 
 static void
 hostap_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0,
 	int subtype, const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_frame *wh;
 	uint8_t *frm, *efrm, *sfrm;
 	uint8_t *ssid, *rates, *xrates, *wpa, *rsn, *wme, *ath, *htcap;
 	uint8_t *vhtcap, *vhtinfo;
 	int reassoc, resp;
 	uint8_t rate;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (uint8_t *)&wh[1];
 	efrm = mtod(m0, uint8_t *) + m0->m_len;
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 		/*
 		 * We process beacon/probe response frames when scanning;
 		 * otherwise we check beacon frames for overlapping non-ERP
 		 * BSS in 11g and/or overlapping legacy BSS when in HT.
 		 */
 		if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		/* FALLTHROUGH */
 	case IEEE80211_FC0_SUBTYPE_BEACON: {
 		struct ieee80211_scanparams scan;
 
 		/* NB: accept off-channel frames */
 		/* XXX TODO: use rxstatus to determine off-channel details */
 		if (ieee80211_parse_beacon(ni, m0, ic->ic_curchan, &scan) &~ IEEE80211_BPARSE_OFFCHAN)
 			return;
 		/*
 		 * Count frame now that we know it's to be processed.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_BEACON) {
 			vap->iv_stats.is_rx_beacon++;		/* XXX remove */
 			IEEE80211_NODE_STAT(ni, rx_beacons);
 		} else
 			IEEE80211_NODE_STAT(ni, rx_proberesp);
 		/*
 		 * If scanning, just pass information to the scan module.
 		 */
 		if (ic->ic_flags & IEEE80211_F_SCAN) {
 			if (scan.status == 0 &&		/* NB: on channel */
 			    (ic->ic_flags_ext & IEEE80211_FEXT_PROBECHAN)) {
 				/*
 				 * Actively scanning a channel marked passive;
 				 * send a probe request now that we know there
 				 * is 802.11 traffic present.
 				 *
 				 * XXX check if the beacon we recv'd gives
 				 * us what we need and suppress the probe req
 				 */
 				ieee80211_probe_curchan(vap, 1);
 				ic->ic_flags_ext &= ~IEEE80211_FEXT_PROBECHAN;
 			}
 			ieee80211_add_scan(vap, ic->ic_curchan, &scan, wh,
 			    subtype, rssi, nf);
 			return;
 		}
 		/*
 		 * Check beacon for overlapping bss w/ non ERP stations.
 		 * If we detect one and protection is configured but not
 		 * enabled, enable it and start a timer that'll bring us
 		 * out if we stop seeing the bss.
 		 */
 		if (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) &&
 		    scan.status == 0 &&			/* NB: on-channel */
 		    ((scan.erp & 0x100) == 0 ||		/* NB: no ERP, 11b sta*/
 		     (scan.erp & IEEE80211_ERP_NON_ERP_PRESENT))) {
 			vap->iv_lastnonerp = ticks;
 			vap->iv_flags_ext |= IEEE80211_FEXT_NONERP_PR;
 			/*
 			 * XXX TODO: this may need to check all VAPs?
 			 */
 			if (vap->iv_protmode != IEEE80211_PROT_NONE &&
 			    (vap->iv_flags & IEEE80211_F_USEPROT) == 0) {
 				IEEE80211_NOTE_FRAME(vap,
 				    IEEE80211_MSG_ASSOC, wh,
 				    "non-ERP present on channel %d "
 				    "(saw erp 0x%x from channel %d), "
 				    "enable use of protection",
 				    ic->ic_curchan->ic_ieee,
 				    scan.erp, scan.chan);
 				vap->iv_flags |= IEEE80211_F_USEPROT;
 				ieee80211_vap_update_erp_protmode(vap);
 			}
 		}
 		/* 
 		 * Check beacon for non-HT station on HT channel
 		 * and update HT BSS occupancy as appropriate.
 		 */
 		if (IEEE80211_IS_CHAN_HT(ic->ic_curchan)) {
 			if (scan.status & IEEE80211_BPARSE_OFFCHAN) {
 				/*
 				 * Off control channel; only check frames
 				 * that come in the extension channel when
 				 * operating w/ HT40.
 				 */
 				if (!IEEE80211_IS_CHAN_HT40(ic->ic_curchan))
 					break;
 				if (scan.chan != ic->ic_curchan->ic_extieee)
 					break;
 			}
 			if (scan.htinfo == NULL) {
 				ieee80211_htprot_update(vap,
 				    IEEE80211_HTINFO_OPMODE_PROTOPT |
 				    IEEE80211_HTINFO_NONHT_PRESENT);
 			} else if (ishtmixed(scan.htinfo)) {
 				/* XXX? take NONHT_PRESENT from beacon? */
 				ieee80211_htprot_update(vap,
 				    IEEE80211_HTINFO_OPMODE_MIXED |
 				    IEEE80211_HTINFO_NONHT_PRESENT);
 			}
 		}
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 		if (vap->iv_state != IEEE80211_S_RUN) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		/*
 		 * Consult the ACL policy module if setup.
 		 */
 		if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL,
 			    wh, NULL, "%s", "disallowed by ACL");
 			vap->iv_stats.is_rx_acl++;
 			return;
 		}
 		/*
 		 * prreq frame format
 		 *	[tlv] ssid
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 */
 		ssid = rates = xrates = NULL;
 		while (efrm - frm > 1) {
 			IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return);
 			switch (*frm) {
 			case IEEE80211_ELEMID_SSID:
 				ssid = frm;
 				break;
 			case IEEE80211_ELEMID_RATES:
 				rates = frm;
 				break;
 			case IEEE80211_ELEMID_XRATES:
 				xrates = frm;
 				break;
 			}
 			frm += frm[1] + 2;
 		}
 		IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return);
 		if (xrates != NULL)
 			IEEE80211_VERIFY_ELEMENT(xrates,
 				IEEE80211_RATE_MAXSIZE - rates[1], return);
 		IEEE80211_VERIFY_ELEMENT(ssid, IEEE80211_NWID_LEN, return);
 		IEEE80211_VERIFY_SSID(vap->iv_bss, ssid, return);
 		if ((vap->iv_flags & IEEE80211_F_HIDESSID) && ssid[1] == 0) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL,
 			    "%s", "no ssid with ssid suppression enabled");
 			vap->iv_stats.is_rx_ssidmismatch++; /*XXX*/
 			return;
 		}
 
 		/* XXX find a better class or define it's own */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_INPUT, wh->i_addr2,
 		    "%s", "recv probe req");
 		/*
 		 * Some legacy 11b clients cannot hack a complete
 		 * probe response frame.  When the request includes
 		 * only a bare-bones rate set, communicate this to
 		 * the transmit side.
 		 */
 		ieee80211_send_proberesp(vap, wh->i_addr2,
 		    is11bclient(rates, xrates) ? IEEE80211_SEND_LEGACY_11B : 0);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_AUTH: {
 		uint16_t algo, seq, status;
 
 		if (vap->iv_state != IEEE80211_S_RUN) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (!IEEE80211_ADDR_EQ(wh->i_addr3, vap->iv_bss->ni_bssid)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    wh, NULL, "%s", "wrong bssid");
 			vap->iv_stats.is_rx_wrongbss++;	/*XXX unique stat?*/
 			return;
 		}
 		/*
 		 * auth frame format
 		 *	[2] algorithm
 		 *	[2] sequence
 		 *	[2] status
 		 *	[tlv*] challenge
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, 6, return);
 		algo   = le16toh(*(uint16_t *)frm);
 		seq    = le16toh(*(uint16_t *)(frm + 2));
 		status = le16toh(*(uint16_t *)(frm + 4));
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_AUTH, wh->i_addr2,
 		    "recv auth frame with algorithm %d seq %d", algo, seq);
 		/*
 		 * Consult the ACL policy module if setup.
 		 */
 		if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL,
 			    wh, NULL, "%s", "disallowed by ACL");
 			vap->iv_stats.is_rx_acl++;
 			ieee80211_send_error(ni, wh->i_addr2,
 			    IEEE80211_FC0_SUBTYPE_AUTH,
 			    (seq+1) | (IEEE80211_STATUS_UNSPECIFIED<<16));
 			return;
 		}
 		if (vap->iv_flags & IEEE80211_F_COUNTERM) {
 			IEEE80211_DISCARD(vap,
 			    IEEE80211_MSG_AUTH | IEEE80211_MSG_CRYPTO,
 			    wh, NULL, "%s", "TKIP countermeasures enabled");
 			vap->iv_stats.is_rx_auth_countermeasures++;
 			ieee80211_send_error(ni, wh->i_addr2,
 				IEEE80211_FC0_SUBTYPE_AUTH,
 				IEEE80211_REASON_MIC_FAILURE);
 			return;
 		}
 		if (algo == IEEE80211_AUTH_ALG_SHARED)
 			hostap_auth_shared(ni, wh, frm + 6, efrm, rssi, nf,
 			    seq, status);
 		else if (algo == IEEE80211_AUTH_ALG_OPEN)
 			hostap_auth_open(ni, wh, rssi, nf, seq, status);
 		else if (algo == IEEE80211_AUTH_ALG_LEAP) {
 			authalgreject(ni, wh, algo,
 			    seq+1, IEEE80211_STATUS_ALG);
 			return;
 		} else {
 			/*
 			 * We assume that an unknown algorithm is the result
 			 * of a decryption failure on a shared key auth frame;
 			 * return a status code appropriate for that instead
 			 * of IEEE80211_STATUS_ALG.
 			 *
 			 * NB: a seq# of 4 is intentional; the decrypted
 			 *     frame likely has a bogus seq value.
 			 */
 			authalgreject(ni, wh, algo,
 			    4, IEEE80211_STATUS_CHALLENGE);
 			return;
 		} 
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ: {
 		uint16_t capinfo, lintval;
 		struct ieee80211_rsnparms rsnparms;
 
 		if (vap->iv_state != IEEE80211_S_RUN) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (!IEEE80211_ADDR_EQ(wh->i_addr3, vap->iv_bss->ni_bssid)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    wh, NULL, "%s", "wrong bssid");
 			vap->iv_stats.is_rx_assoc_bss++;
 			return;
 		}
 		if (subtype == IEEE80211_FC0_SUBTYPE_REASSOC_REQ) {
 			reassoc = 1;
 			resp = IEEE80211_FC0_SUBTYPE_REASSOC_RESP;
 		} else {
 			reassoc = 0;
 			resp = IEEE80211_FC0_SUBTYPE_ASSOC_RESP;
 		}
 		if (ni == vap->iv_bss) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ANY, wh->i_addr2,
 			    "deny %s request, sta not authenticated",
 			    reassoc ? "reassoc" : "assoc");
 			ieee80211_send_error(ni, wh->i_addr2,
 			    IEEE80211_FC0_SUBTYPE_DEAUTH,
 			    IEEE80211_REASON_ASSOC_NOT_AUTHED);
 			vap->iv_stats.is_rx_assoc_notauth++;
 			return;
 		}
 
 		/*
 		 * asreq frame format
 		 *	[2] capability information
 		 *	[2] listen interval
 		 *	[6*] current AP address (reassoc only)
 		 *	[tlv] ssid
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 *	[tlv] WPA or RSN
 		 *	[tlv] HT capabilities
 		 *	[tlv] Atheros capabilities
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, (reassoc ? 10 : 4), return);
 		capinfo = le16toh(*(uint16_t *)frm);	frm += 2;
 		lintval = le16toh(*(uint16_t *)frm);	frm += 2;
 		if (reassoc)
 			frm += 6;	/* ignore current AP info */
 		ssid = rates = xrates = wpa = rsn = wme = ath = htcap = NULL;
 		vhtcap = vhtinfo = NULL;
 		sfrm = frm;
 		while (efrm - frm > 1) {
 			IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return);
 			switch (*frm) {
 			case IEEE80211_ELEMID_SSID:
 				ssid = frm;
 				break;
 			case IEEE80211_ELEMID_RATES:
 				rates = frm;
 				break;
 			case IEEE80211_ELEMID_XRATES:
 				xrates = frm;
 				break;
 			case IEEE80211_ELEMID_RSN:
 				rsn = frm;
 				break;
 			case IEEE80211_ELEMID_HTCAP:
 				htcap = frm;
 				break;
 			case IEEE80211_ELEMID_VHT_CAP:
 				vhtcap = frm;
 				break;
 			case IEEE80211_ELEMID_VHT_OPMODE:
 				vhtinfo = frm;
 				break;
 			case IEEE80211_ELEMID_VENDOR:
 				if (iswpaoui(frm))
 					wpa = frm;
 				else if (iswmeinfo(frm))
 					wme = frm;
 #ifdef IEEE80211_SUPPORT_SUPERG
 				else if (isatherosoui(frm))
 					ath = frm;
 #endif
 				else if (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) {
 					if (ishtcapoui(frm) && htcap == NULL)
 						htcap = frm;
 				}
 				break;
 			}
 			frm += frm[1] + 2;
 		}
 		IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return);
 		if (xrates != NULL)
 			IEEE80211_VERIFY_ELEMENT(xrates,
 				IEEE80211_RATE_MAXSIZE - rates[1], return);
 		IEEE80211_VERIFY_ELEMENT(ssid, IEEE80211_NWID_LEN, return);
 		IEEE80211_VERIFY_SSID(vap->iv_bss, ssid, return);
 		if (htcap != NULL) {
 			IEEE80211_VERIFY_LENGTH(htcap[1],
 			     htcap[0] == IEEE80211_ELEMID_VENDOR ?
 			         4 + sizeof(struct ieee80211_ie_htcap)-2 :
 			         sizeof(struct ieee80211_ie_htcap)-2,
 			     return);		/* XXX just NULL out? */
 		}
 
 		/* Validate VHT IEs */
 		if (vhtcap != NULL) {
 			IEEE80211_VERIFY_LENGTH(vhtcap[1],
 			    sizeof(struct ieee80211_ie_vhtcap) - 2,
 			    return);
 		}
 		if (vhtinfo != NULL) {
 			IEEE80211_VERIFY_LENGTH(vhtinfo[1],
 			    sizeof(struct ieee80211_ie_vht_operation) - 2,
 			    return);
 		}
 
 		if ((vap->iv_flags & IEEE80211_F_WPA) &&
 		    !wpa_assocreq(ni, &rsnparms, wh, wpa, rsn, capinfo))
 			return;
 		/* discard challenge after association */
 		if (ni->ni_challenge != NULL) {
 			IEEE80211_FREE(ni->ni_challenge, M_80211_NODE);
 			ni->ni_challenge = NULL;
 		}
 		/* NB: 802.11 spec says to ignore station's privacy bit */
 		if ((capinfo & IEEE80211_CAPINFO_ESS) == 0) {
 			capinfomismatch(ni, wh, reassoc, resp,
 			    "capability", capinfo);
 			return;
 		}
 		/*
 		 * Disallow re-associate w/ invalid slot time setting.
 		 */
 		if (ni->ni_associd != 0 &&
 		    IEEE80211_IS_CHAN_ANYG(ic->ic_bsschan) &&
 		    ((ni->ni_capinfo ^ capinfo) & IEEE80211_CAPINFO_SHORT_SLOTTIME)) {
 			capinfomismatch(ni, wh, reassoc, resp,
 			    "slot time", capinfo);
 			return;
 		}
 		rate = ieee80211_setup_rates(ni, rates, xrates,
 				IEEE80211_F_DOSORT | IEEE80211_F_DOFRATE |
 				IEEE80211_F_DONEGO | IEEE80211_F_DODEL);
 		if (rate & IEEE80211_RATE_BASIC) {
 			ratesetmismatch(ni, wh, reassoc, resp, "legacy", rate);
 			vap->iv_stats.is_rx_assoc_norate++;
 			return;
 		}
 		/*
 		 * If constrained to 11g-only stations reject an
 		 * 11b-only station.  We cheat a bit here by looking
 		 * at the max negotiated xmit rate and assuming anyone
 		 * with a best rate <24Mb/s is an 11b station.
 		 */
 		if ((vap->iv_flags & IEEE80211_F_PUREG) && rate < 48) {
 			ratesetmismatch(ni, wh, reassoc, resp, "11g", rate);
 			vap->iv_stats.is_rx_assoc_norate++;
 			return;
 		}
 
 		/*
 		 * Do HT rate set handling and setup HT node state.
 		 */
 		ni->ni_chan = vap->iv_bss->ni_chan;
 
 		/* VHT */
 		if (IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
 		    vhtcap != NULL &&
 		    vhtinfo != NULL) {
 			/* XXX TODO; see below */
 			printf("%s: VHT TODO!\n", __func__);
 			ieee80211_vht_node_init(ni);
 			ieee80211_vht_update_cap(ni, vhtcap, vhtinfo);
 		} else if (ni->ni_flags & IEEE80211_NODE_VHT)
 			ieee80211_vht_node_cleanup(ni);
 
 		/* HT */
 		if (IEEE80211_IS_CHAN_HT(ni->ni_chan) && htcap != NULL) {
 			rate = ieee80211_setup_htrates(ni, htcap,
 				IEEE80211_F_DOFMCS | IEEE80211_F_DONEGO |
 				IEEE80211_F_DOBRS);
 			if (rate & IEEE80211_RATE_BASIC) {
 				ratesetmismatch(ni, wh, reassoc, resp,
 				    "HT", rate);
 				vap->iv_stats.is_ht_assoc_norate++;
 				return;
 			}
 			ieee80211_ht_node_init(ni);
 			ieee80211_ht_updatehtcap(ni, htcap);
 		} else if (ni->ni_flags & IEEE80211_NODE_HT)
 			ieee80211_ht_node_cleanup(ni);
 
 		/* Finally - this will use HT/VHT info to change node channel */
 		if (IEEE80211_IS_CHAN_HT(ni->ni_chan) && htcap != NULL) {
 			ieee80211_ht_updatehtcap_final(ni);
 		}
 
 #ifdef IEEE80211_SUPPORT_SUPERG
 		/* Always do ff node cleanup; for A-MSDU */
 		ieee80211_ff_node_cleanup(ni);
 #endif
 		/*
 		 * Allow AMPDU operation only with unencrypted traffic
 		 * or AES-CCM; the 11n spec only specifies these ciphers
 		 * so permitting any others is undefined and can lead
 		 * to interoperability problems.
 		 */
 		if ((ni->ni_flags & IEEE80211_NODE_HT) &&
 		    (((vap->iv_flags & IEEE80211_F_WPA) &&
 		      rsnparms.rsn_ucastcipher != IEEE80211_CIPHER_AES_CCM) ||
 		     (vap->iv_flags & (IEEE80211_F_WPA|IEEE80211_F_PRIVACY)) == IEEE80211_F_PRIVACY)) {
 			IEEE80211_NOTE(vap,
 			    IEEE80211_MSG_ASSOC | IEEE80211_MSG_11N, ni,
 			    "disallow HT use because WEP or TKIP requested, "
 			    "capinfo 0x%x ucastcipher %d", capinfo,
 			    rsnparms.rsn_ucastcipher);
 			ieee80211_ht_node_cleanup(ni);
 #ifdef IEEE80211_SUPPORT_SUPERG
 			/* Always do ff node cleanup; for A-MSDU */
 			ieee80211_ff_node_cleanup(ni);
 #endif
 			vap->iv_stats.is_ht_assoc_downgrade++;
 		}
 		/*
 		 * If constrained to 11n-only stations reject legacy stations.
 		 */
 		if ((vap->iv_flags_ht & IEEE80211_FHT_PUREN) &&
 		    (ni->ni_flags & IEEE80211_NODE_HT) == 0) {
 			htcapmismatch(ni, wh, reassoc, resp);
 			vap->iv_stats.is_ht_assoc_nohtcap++;
 			return;
 		}
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		ni->ni_intval = lintval;
 		ni->ni_capinfo = capinfo;
 		ni->ni_fhdwell = vap->iv_bss->ni_fhdwell;
 		ni->ni_fhindex = vap->iv_bss->ni_fhindex;
 		/*
 		 * Store the IEs.
 		 * XXX maybe better to just expand
 		 */
 		if (ieee80211_ies_init(&ni->ni_ies, sfrm, efrm - sfrm)) {
 #define	setie(_ie, _off)	ieee80211_ies_setie(ni->ni_ies, _ie, _off)
 			if (wpa != NULL)
 				setie(wpa_ie, wpa - sfrm);
 			if (rsn != NULL)
 				setie(rsn_ie, rsn - sfrm);
 			if (htcap != NULL)
 				setie(htcap_ie, htcap - sfrm);
 			if (wme != NULL) {
 				setie(wme_ie, wme - sfrm);
 				/*
 				 * Mark node as capable of QoS.
 				 */
 				ni->ni_flags |= IEEE80211_NODE_QOS;
 				if (ieee80211_parse_wmeie(wme, wh, ni) > 0) {
 					if (ni->ni_uapsd != 0)
 						ni->ni_flags |=
 						    IEEE80211_NODE_UAPSD;
 					else
 						ni->ni_flags &=
 						    ~IEEE80211_NODE_UAPSD;
 				}
 			} else
 				ni->ni_flags &=
 				    ~(IEEE80211_NODE_QOS |
 				      IEEE80211_NODE_UAPSD);
 #ifdef IEEE80211_SUPPORT_SUPERG
 			if (ath != NULL) {
 				setie(ath_ie, ath - sfrm);
 				/* 
 				 * Parse ATH station parameters.
 				 */
 				ieee80211_parse_ath(ni, ni->ni_ies.ath_ie);
 			} else
 #endif
 				ni->ni_ath_flags = 0;
 #undef setie
 		} else {
 			ni->ni_flags &= ~IEEE80211_NODE_QOS;
 			ni->ni_flags &= ~IEEE80211_NODE_UAPSD;
 			ni->ni_ath_flags = 0;
 		}
 		ieee80211_node_join(ni, resp);
 		ieee80211_deliver_l2uf(ni);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC: {
 #ifdef IEEE80211_DEBUG
 		uint16_t reason;
 #endif
 
 		if (vap->iv_state != IEEE80211_S_RUN ||
 		    /* NB: can happen when in promiscuous mode */
 		    !IEEE80211_ADDR_EQ(wh->i_addr1, vap->iv_myaddr)) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			break;
 		}
 		/*
 		 * deauth/disassoc frame format
 		 *	[2] reason
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, 2, return);
 #ifdef IEEE80211_DEBUG
 		reason = le16toh(*(uint16_t *)frm);
 #endif
 		if (subtype == IEEE80211_FC0_SUBTYPE_DEAUTH) {
 			vap->iv_stats.is_rx_deauth++;
 			IEEE80211_NODE_STAT(ni, rx_deauth);
 		} else {
 			vap->iv_stats.is_rx_disassoc++;
 			IEEE80211_NODE_STAT(ni, rx_disassoc);
 		}
 		IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni,
 		    "recv %s (reason: %d (%s))",
 		    ieee80211_mgt_subtype_name(subtype),
 		    reason, ieee80211_reason_to_string(reason));
 		if (ni != vap->iv_bss)
 			ieee80211_node_leave(ni);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "unknown node");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1) &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
 
 /*
  * Dispatch a received control frame.  Only the PS-Poll and BAR
  * subtypes are acted upon; every other subtype is ignored.
  */
 static void
 hostap_recv_ctl(struct ieee80211_node *ni, struct mbuf *m, int subtype)
 {
 	if (subtype == IEEE80211_FC0_SUBTYPE_PS_POLL) {
 		/* Routed through the vap's ps-poll method. */
 		ni->ni_vap->iv_recv_pspoll(ni, m);
 	} else if (subtype == IEEE80211_FC0_SUBTYPE_BAR) {
 		ieee80211_recv_bar(ni, m);
 	}
 }
 
 /*
  * Handle a PS-Poll control frame from a station.  The sender must be
  * associated and must quote its own AID; when both hold, one frame is
  * taken from the station's power-save queue and handed to the
  * transmit path.
  */
 void
 ieee80211_recv_pspoll(struct ieee80211_node *ni, struct mbuf *m0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_frame_min *wh;
 	struct mbuf *pkt;
 	uint16_t poll_aid;
 	int remaining;
 
 	wh = mtod(m0, struct ieee80211_frame_min *);
 
 	/* A station without an association id gets a deauth in reply. */
 	if (ni->ni_associd == 0) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_POWER | IEEE80211_MSG_DEBUG,
 		    (struct ieee80211_frame *) wh, NULL,
 		    "%s", "unassociated station");
 		vap->iv_stats.is_ps_unassoc++;
 		IEEE80211_SEND_MGMT(ni, IEEE80211_FC0_SUBTYPE_DEAUTH,
 			IEEE80211_REASON_NOT_ASSOCED);
 		return;
 	}
 
 	/* The AID is read from the frame's duration field. */
 	poll_aid = le16toh(*(uint16_t *)wh->i_dur);
 	if (poll_aid != ni->ni_associd) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_POWER | IEEE80211_MSG_DEBUG,
 		    (struct ieee80211_frame *) wh, NULL,
 		    "aid mismatch: sta aid 0x%x poll aid 0x%x",
 		    ni->ni_associd, poll_aid);
 		vap->iv_stats.is_ps_badaid++;
 		/*
 		 * NB: The station used to be deauthenticated here, but
 		 * the Blackberry Curve 8230 (and perhaps other devices)
 		 * sometimes sends the wrong AID when WME is negotiated.
 		 * Simply dropping the poll is acceptable: association
 		 * has already been checked and only frames queued for
 		 * this station are ever returned (the AID is not used
 		 * for the lookup).
 		 */
 		return;
 	}
 
 	/* Pull the first queued frame for this station. */
 	pkt = ieee80211_node_psq_dequeue(ni, &remaining);
 	if (pkt == NULL) {
 		/* Nothing buffered: answer with a null data frame. */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_POWER, wh->i_addr2,
 		    "%s", "recv ps-poll, but queue empty");
 		ieee80211_send_nulldata(ieee80211_ref_node(ni));
 		vap->iv_stats.is_ps_qempty++;	/* XXX node stat */
 		if (vap->iv_set_tim != NULL)
 			vap->iv_set_tim(ni, 0);	/* just in case */
 		return;
 	}
 
 	/*
 	 * When the queue just drained clear the station's TIM bit;
 	 * otherwise flag the outgoing frame so the station knows more
 	 * data is waiting.
 	 */
 	if (remaining == 0) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_POWER, ni,
 		    "%s", "recv ps-poll, send packet, queue empty");
 		if (vap->iv_set_tim != NULL)
 			vap->iv_set_tim(ni, 0);
 	} else {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_POWER, ni,
 		    "recv ps-poll, send packet, %u still queued", remaining);
 		pkt->m_flags |= M_MORE_DATA;
 	}
 	pkt->m_flags |= M_PWR_SAV;		/* bypass PS handling */
 
 	/*
 	 * Frames that are not yet 802.11 encapsulated go out through
 	 * the vap; already encapsulated ones go straight to the parent.
 	 */
 	if ((pkt->m_flags & M_ENCAP) == 0)
 		(void) ieee80211_vap_xmitpkt(vap, pkt);
 	else
 		(void) ieee80211_parent_xmitpkt(ic, pkt);
 }
diff --git a/sys/net80211/ieee80211_input.c b/sys/net80211/ieee80211_input.c
index b7d9c6f9457e..78483a3ad2e1 100644
--- a/sys/net80211/ieee80211_input.c
+++ b/sys/net80211/ieee80211_input.c
@@ -1,1059 +1,1060 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Atsushi Onoe
  * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/endian.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llc.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_input.h>
 #ifdef IEEE80211_SUPPORT_MESH
 #include <net80211/ieee80211_mesh.h>
 #endif
 
 #include <net/bpf.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <net/ethernet.h>
 #endif
 
 static void
 ieee80211_process_mimo(struct ieee80211_node *ni, struct ieee80211_rx_stats *rx)
 {
 	int i;
 
 	/* Verify the required MIMO bits are set */
 	if ((rx->r_flags & (IEEE80211_R_C_CHAIN | IEEE80211_R_C_NF | IEEE80211_R_C_RSSI)) !=
 	    (IEEE80211_R_C_CHAIN | IEEE80211_R_C_NF | IEEE80211_R_C_RSSI))
 		return;
 
 	/* XXX This assumes the MIMO radios have both ctl and ext chains */
 	for (i = 0; i < MIN(rx->c_chain, IEEE80211_MAX_CHAINS); i++) {
 		IEEE80211_RSSI_LPF(ni->ni_mimo_rssi_ctl[i], rx->c_rssi_ctl[i]);
 		IEEE80211_RSSI_LPF(ni->ni_mimo_rssi_ext[i], rx->c_rssi_ext[i]);
 	}
 
 	/* XXX This also assumes the MIMO radios have both ctl and ext chains */
 	for(i = 0; i < MIN(rx->c_chain, IEEE80211_MAX_CHAINS); i++) {
 		ni->ni_mimo_noise_ctl[i] = rx->c_nf_ctl[i];
 		ni->ni_mimo_noise_ext[i] = rx->c_nf_ext[i];
 	}
 	ni->ni_mimo_chains = rx->c_chain;
 }
 
 int
 ieee80211_input_mimo(struct ieee80211_node *ni, struct mbuf *m)
 {
 	struct ieee80211_rx_stats rxs;
 
 	/* try to read stats from mbuf */
 	bzero(&rxs, sizeof(rxs));
 	if (ieee80211_get_rx_params(m, &rxs) != 0)
 		return (-1);
 
 	/* XXX should assert IEEE80211_R_NF and IEEE80211_R_RSSI are set */
 	ieee80211_process_mimo(ni, &rxs);
 
 	//return ieee80211_input(ni, m, rx->rssi, rx->nf);
 	return ni->ni_vap->iv_input(ni, m, &rxs, rxs.c_rssi, rxs.c_nf);
 }
 
 /*
  * Variant of ieee80211_input_mimo_all() for callers that only have a
  * single RSSI and noise floor value: the two values are attached to
  * the mbuf as receive status and the frame is then passed on.
  *
  * Returns -1 if the receive status cannot be attached, otherwise the
  * result of ieee80211_input_mimo_all().
  */
 int
 ieee80211_input_all(struct ieee80211com *ic, struct mbuf *m, int rssi, int nf)
 {
 	struct ieee80211_rx_stats rx;
 
 	/*
 	 * Only three fields are filled in below.  Clear the structure
 	 * first so that the remaining fields never carry stale stack
 	 * contents into whatever consumes the attached status.
 	 */
 	bzero(&rx, sizeof(rx));
 	rx.r_flags = IEEE80211_R_NF | IEEE80211_R_RSSI;
 	rx.c_nf = nf;
 	rx.c_rssi = rssi;
 
 	if (!ieee80211_add_rx_params(m, &rx))
 		return (-1);
 
 	return ieee80211_input_mimo_all(ic, m);
 }
 
 /*
  * Offer a received frame to the bss node of every vap on the device.
  *
  * Vaps in INIT state and WDS vaps are skipped.  Every vap except the
  * last one on the list is given a deep copy of the frame; the last
  * one is given the original mbuf.  If the original is still held when
  * the loop ends it is freed here.
  *
  * Returns the value of the most recent ieee80211_input_mimo() call,
  * or -1 if no vap was given the frame.
  */
 int
 ieee80211_input_mimo_all(struct ieee80211com *ic, struct mbuf *m)
 {
 	struct ieee80211vap *vap;
 	int type = -1;
 
 	m->m_flags |= M_BCAST;		/* NB: mark for bpf tap'ing */
 
 	/* XXX locking */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		struct ieee80211_node *ni;
 		struct mbuf *mcopy;
 
 		/* NB: could check for IFF_UP but this is cheaper */
 		if (vap->iv_state == IEEE80211_S_INIT)
 			continue;
 		/*
 		 * WDS vap's only receive directed traffic from the
 		 * station at the ``far end''.  That traffic should
 		 * be passed through the AP vap the station is associated
 		 * to--so don't spam them with mcast frames.
 		 */
 		if (vap->iv_opmode == IEEE80211_M_WDS)
 			continue;
 		if (TAILQ_NEXT(vap, iv_next) != NULL) {
 			/*
 			 * Packet contents are changed by ieee80211_decap
 			 * so do a deep copy of the packet.
 			 * NB: tags are copied too.
 			 */
 			mcopy = m_dup(m, IEEE80211_M_NOWAIT);
 			if (mcopy == NULL) {
 				/* XXX stat+msg */
 				continue;
 			}
 		} else {
 			/* Last vap on the list: hand over the original. */
 			mcopy = m;
 			m = NULL;
 		}
 		/* Hold a reference on the bss node across the input call. */
 		ni = ieee80211_ref_node(vap->iv_bss);
 		type = ieee80211_input_mimo(ni, mcopy);
 		ieee80211_free_node(ni);
 	}
 	if (m != NULL)			/* no vaps, reclaim mbuf */
 		m_freem(m);
 	return type;
 }
 
 /*
  * This function reassembles fragments.
  *
  * One in-progress reassembly is kept per node in ni_rxfrag[0].
  *
  *	ni		node the fragment was received from
  *	m		the received fragment
  *	hdrspace	number of bytes stripped from the front of a
  *			follow-on fragment before it is appended
  *	has_decrypted	non-zero if the fragment went through decryption;
  *			used to refuse mixing plaintext and encrypted
  *			fragments
  *
  * Returns the mbuf to keep processing: m itself when there is nothing
  * to reassemble, or the reassembled chain once the final fragment has
  * arrived.  Returns NULL when the fragment was saved to wait for more,
  * or when it was discarded (in which case it has been freed).
  *
  * XXX should handle 3 concurrent reassemblies per-spec.
  */
 struct mbuf *
 ieee80211_defrag(struct ieee80211_node *ni, struct mbuf *m, int hdrspace,
 	int has_decrypted)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
 	struct ieee80211_frame *lwh;
 	uint16_t rxseq;
 	uint8_t fragno;
 	uint8_t more_frag = wh->i_fc[1] & IEEE80211_FC1_MORE_FRAG;
 	struct mbuf *mfrag;
 
 	KASSERT(!IEEE80211_IS_MULTICAST(wh->i_addr1), ("multicast fragm?"));
 
 	/* rxseq holds the whole sequence control field, fragno its low bits. */
 	rxseq = le16toh(*(uint16_t *)wh->i_seq);
 	fragno = rxseq & IEEE80211_SEQ_FRAG_MASK;
 
 	/* Quick way out, if there's nothing to defragment */
 	if (!more_frag && fragno == 0 && ni->ni_rxfrag[0] == NULL)
 		return m;
 
 	/* Temporarily set flag to remember if fragment was encrypted. */
 	/* XXX use a non-packet altering storage for this in the future. */
 	if (has_decrypted)
 		wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 
 	/*
 	 * Remove frag to ensure it doesn't get reaped by timer.
 	 */
 	if (ni->ni_table == NULL) {
 		/*
 		 * Should never happen.  If the node is orphaned (not in
 		 * the table) then input packets should not reach here.
 		 * Otherwise, a concurrent request that yanks the table
 		 * should be blocked by other interlocking and/or by first
 		 * shutting the driver down.  Regardless, be defensive
 		 * here and just bail
 		 */
 		/* XXX need msg+stat */
 		m_freem(m);
 		return NULL;
 	}
 	/* Detach any pending reassembly from the node under the table lock. */
 	IEEE80211_NODE_LOCK(ni->ni_table);
 	mfrag = ni->ni_rxfrag[0];
 	ni->ni_rxfrag[0] = NULL;
 	IEEE80211_NODE_UNLOCK(ni->ni_table);
 
 	/*
 	 * Validate new fragment is in order and
 	 * related to the previous ones.
 	 */
 	if (mfrag != NULL) {
 		uint16_t last_rxseq;
 
 		lwh = mtod(mfrag, struct ieee80211_frame *);
 		last_rxseq = le16toh(*(uint16_t *)lwh->i_seq);
 		/*
 		 * NB: check seq # and frag together. Also check that both
 		 * fragments are plaintext or that both are encrypted.
 		 */
 		if (rxseq == last_rxseq+1 &&
 		    IEEE80211_ADDR_EQ(wh->i_addr1, lwh->i_addr1) &&
 		    IEEE80211_ADDR_EQ(wh->i_addr2, lwh->i_addr2) &&
 		    !((wh->i_fc[1] ^ lwh->i_fc[1]) & IEEE80211_FC1_PROTECTED)) {
 			/* XXX clear MORE_FRAG bit? */
 			/* track last seqnum and fragno */
 			*(uint16_t *) lwh->i_seq = *(uint16_t *) wh->i_seq;
 
 			m_adj(m, hdrspace);		/* strip header */
 			m_catpkt(mfrag, m);		/* concatenate */
 		} else {
 			/*
 			 * Unrelated fragment or no space for it,
 			 * clear current fragments.
 			 */
 			m_freem(mfrag);
 			mfrag = NULL;
 		}
 	}
 
  	if (mfrag == NULL) {
 		/* No reassembly in progress: only a first fragment may start one. */
 		if (fragno != 0) {		/* !first fragment, discard */
 			vap->iv_stats.is_rx_defrag++;
 			IEEE80211_NODE_STAT(ni, rx_defrag);
 			m_freem(m);
 			return NULL;
 		}
 		mfrag = m;
 	}
 	if (more_frag) {			/* more to come, save */
 		ni->ni_rxfragstamp = ticks;
 		ni->ni_rxfrag[0] = mfrag;
 		mfrag = NULL;
 	}
 	/* Remember to clear protected flag that was temporarily set. */
 	if (mfrag != NULL) {
 		wh = mtod(mfrag, struct ieee80211_frame *);
 		wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 	}
 	return mfrag;
 }
 
 /*
  * Pass a received data frame, already in Ethernet form, to the input
  * routine of the vap's interface.  Interface and per-node receive
  * counters are updated and the broadcast/multicast mbuf flags are
  * derived from the Ethernet destination address.  Not for hostap vaps.
  */
 void
 ieee80211_deliver_data(struct ieee80211vap *vap,
 	struct ieee80211_node *ni, struct mbuf *m)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ether_header *eh = mtod(m, struct ether_header *);
 
 	/* clear driver/net80211 flags before passing up */
 	m->m_flags &= ~(M_MCAST | M_BCAST);
 	m_clrprotoflags(m);
 
 	/* NB: see hostap_deliver_data, this path doesn't handle hostap */
 	KASSERT(vap->iv_opmode != IEEE80211_M_HOSTAP, ("gack, hostap"));
 
 	/* Account for the frame on the interface and on the node. */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	IEEE80211_NODE_STAT(ni, rx_data);
 	IEEE80211_NODE_STAT_ADD(ni, rx_bytes, m->m_pkthdr.len);
 
 	if (!ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		IEEE80211_NODE_STAT(ni, rx_ucast);
 	} else {
 		/* Broadcast is flagged separately from other group traffic. */
 		m->m_flags |= ETHER_IS_BROADCAST(eh->ether_dhost) ?
 		    M_BCAST : M_MCAST;
 		IEEE80211_NODE_STAT(ni, rx_mcast);
 	}
 	m->m_pkthdr.rcvif = ifp;
 
 	/* A non-zero per-node vlan id is attached as the vlan tag. */
 	if (ni->ni_vlan != 0) {
 		m->m_pkthdr.ether_vtag = ni->ni_vlan;
 		m->m_flags |= M_VLANTAG;
 	}
 	ifp->if_input(ifp, m);
 }
 
 /*
  * Replace the 802.11 header at the front of a data frame with an
  * Ethernet header, in place.
  *
  *	vap	vap the frame arrived on (statistics, realignment)
  *	m	frame starting with the 802.11 header
  *	hdrlen	length of that 802.11 header; at most
  *		sizeof(struct ieee80211_qosframe_addr4)
  *	qos	QoS control bits; only IEEE80211_QOS_AMSDU is examined
  *
  * Returns the mbuf, which may differ from the one passed in, or NULL
  * if the pullup or the realignment step failed.
  */
 struct mbuf *
 ieee80211_decap(struct ieee80211vap *vap, struct mbuf *m, int hdrlen,
 	uint8_t qos)
 {
 	struct ieee80211_qosframe_addr4 wh;
 	struct ether_header *eh;
 	struct llc *llc;
 
 	KASSERT(hdrlen <= sizeof(wh),
 	    ("hdrlen %d > max %zd", hdrlen, sizeof(wh)));
 
 	/* Need the 802.11 header plus the LLC header contiguous. */
 	if (m->m_len < hdrlen + sizeof(*llc) &&
 	    (m = m_pullup(m, hdrlen + sizeof(*llc))) == NULL) {
 		vap->iv_stats.is_rx_tooshort++;
 		/* XXX msg */
 		return NULL;
 	}
 	/* Keep a private copy of the header; the mbuf data is rewritten below. */
 	memcpy(&wh, mtod(m, caddr_t), hdrlen);
 	llc = (struct llc *)(mtod(m, caddr_t) + hdrlen);
 	if (llc->llc_dsap == LLC_SNAP_LSAP && llc->llc_ssap == LLC_SNAP_LSAP &&
 	    llc->llc_control == LLC_UI && llc->llc_snap.org_code[0] == 0 &&
 	    llc->llc_snap.org_code[1] == 0 && llc->llc_snap.org_code[2] == 0 &&
 	    /* NB: preserve AppleTalk frames that have a native SNAP hdr */
 	    !(llc->llc_snap.ether_type == htons(ETHERTYPE_AARP) ||
 	      llc->llc_snap.ether_type == htons(ETHERTYPE_IPX)) &&
 	    /* Do not want to touch A-MSDU frames. */
 	    !(qos & IEEE80211_QOS_AMSDU)) {
 		/*
 		 * Drop the 802.11 and LLC/SNAP headers but leave room for
 		 * an Ethernet header; the SNAP ether_type lands in the
 		 * Ethernet type field.  llc == NULL records this case.
 		 */
 		m_adj(m, hdrlen + sizeof(struct llc) - sizeof(*eh));
 		llc = NULL;
 	} else {
 		/* Keep the LLC header; only the 802.11 header is replaced. */
 		m_adj(m, hdrlen - sizeof(*eh));
 	}
 	eh = mtod(m, struct ether_header *);
 	/* Pick destination/source from the address fields per DS direction. */
 	switch (wh.i_fc[1] & IEEE80211_FC1_DIR_MASK) {
 	case IEEE80211_FC1_DIR_NODS:
 		IEEE80211_ADDR_COPY(eh->ether_dhost, wh.i_addr1);
 		IEEE80211_ADDR_COPY(eh->ether_shost, wh.i_addr2);
 		break;
 	case IEEE80211_FC1_DIR_TODS:
 		IEEE80211_ADDR_COPY(eh->ether_dhost, wh.i_addr3);
 		IEEE80211_ADDR_COPY(eh->ether_shost, wh.i_addr2);
 		break;
 	case IEEE80211_FC1_DIR_FROMDS:
 		IEEE80211_ADDR_COPY(eh->ether_dhost, wh.i_addr1);
 		IEEE80211_ADDR_COPY(eh->ether_shost, wh.i_addr3);
 		break;
 	case IEEE80211_FC1_DIR_DSTODS:
 		IEEE80211_ADDR_COPY(eh->ether_dhost, wh.i_addr3);
 		IEEE80211_ADDR_COPY(eh->ether_shost, wh.i_addr4);
 		break;
 	}
 #ifndef __NO_STRICT_ALIGNMENT
 	/* The payload after the Ethernet header must be 32-bit aligned. */
 	if (!ALIGNED_POINTER(mtod(m, caddr_t) + sizeof(*eh), uint32_t)) {
 		m = ieee80211_realign(vap, m, sizeof(*eh));
 		if (m == NULL)
 			return NULL;
 	}
 #endif /* !__NO_STRICT_ALIGNMENT */
 	if (llc != NULL) {
 		/* LLC header kept: the type field carries the payload length. */
 		eh = mtod(m, struct ether_header *);
 		eh->ether_type = htons(m->m_pkthdr.len - sizeof(*eh));
 	}
 	return m;
 }
 
 /*
  * Decap a frame encapsulated in a fast-frame/A-MSDU.
  *
  * On success the mbuf (possibly a different one after pullup) is
  * returned with the LLC header removed, and *framelen is set to the
  * length of the resulting frame including its Ethernet header.
  *
  * On failure NULL is returned and the mbuf has been released; the
  * caller must not touch it again.
  */
 struct mbuf *
 ieee80211_decap1(struct mbuf *m, int *framelen)
 {
 #define	FF_LLC_SIZE	(sizeof(struct ether_header) + sizeof(struct llc))
 	struct ether_header *eh;
 	struct llc *llc;
 	const uint8_t llc_hdr_mac[ETHER_ADDR_LEN] = {
 		/* MAC address matching the 802.2 LLC header */
 		LLC_SNAP_LSAP, LLC_SNAP_LSAP, LLC_UI, 0, 0, 0
 	};
 
 	/*
 	 * The frame has an 802.3 header followed by an 802.2
 	 * LLC header.  The encapsulated frame length is in the
 	 * first header type field; save that and overwrite it
 	 * with the true type field found in the second.  Then
 	 * copy the 802.3 header up to where it belongs and
 	 * adjust the mbuf contents to remove the void.
 	 */
 	if (m->m_len < FF_LLC_SIZE && (m = m_pullup(m, FF_LLC_SIZE)) == NULL)
 		return NULL;
 	eh = mtod(m, struct ether_header *);	/* 802.3 header is first */
 
 	/*
 	 * Detect possible attack where a single 802.11 frame is processed
 	 * as an A-MSDU frame due to an adversary setting the A-MSDU present
 	 * bit in the 802.11 QoS header. [FragAttacks]
 	 */
 	if (memcmp(eh->ether_dhost, llc_hdr_mac, ETHER_ADDR_LEN) == 0) {
 		/*
 		 * A failed m_pullup() above has already freed the chain,
 		 * so a NULL return always means "mbuf consumed".  Free it
 		 * here too, otherwise the rejected frame is leaked.
 		 */
 		m_freem(m);
 		return NULL;
 	}
 
 	llc = (struct llc *)&eh[1];		/* 802.2 header follows */
 	*framelen = ntohs(eh->ether_type)	/* encap'd frame size */
 		  + sizeof(struct ether_header) - sizeof(struct llc);
 	eh->ether_type = llc->llc_un.type_snap.ether_type;
 	/* Regions overlap: slide the 802.3 header over the LLC header. */
 	ovbcopy(eh, mtod(m, uint8_t *) + sizeof(struct llc),
 		sizeof(struct ether_header));
 	m_adj(m, sizeof(struct llc));
 	return m;
 #undef FF_LLC_SIZE
 }
 
 /*
  * Install received rate set information in the node's state block.
  *
  *	ni	node whose ni_rates is rebuilt
  *	rates	supported rates element (id, length, data); must not
  *		be NULL
  *	xrates	extended supported rates element, or NULL
  *	flags	passed through to ieee80211_fix_rate()
  *
  * The combined set is limited to IEEE80211_RATE_MAXSIZE entries;
  * anything beyond that is dropped and counted in is_rx_rstoobig.
  * Returns the result of ieee80211_fix_rate().
  */
 int
 ieee80211_setup_rates(struct ieee80211_node *ni,
 	const uint8_t *rates, const uint8_t *xrates, int flags)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_rateset *rs = &ni->ni_rates;
 	uint8_t nrates;
 
 	memset(rs, 0, sizeof(*rs));
 	/*
 	 * The element length comes straight from the received frame.
 	 * Bound it the same way the extended rates are bounded below
 	 * so an oversized element can never overrun rs_rates.
 	 */
 	nrates = rates[1];
 	if (nrates > IEEE80211_RATE_MAXSIZE) {
 		nrates = IEEE80211_RATE_MAXSIZE;
 		vap->iv_stats.is_rx_rstoobig++;
 	}
 	rs->rs_nrates = nrates;
 	memcpy(rs->rs_rates, rates + 2, rs->rs_nrates);
 	if (xrates != NULL) {
 		uint8_t nxrates;
 		/*
 		 * Tack on 11g extended supported rate element.
 		 */
 		nxrates = xrates[1];
 		if (rs->rs_nrates + nxrates > IEEE80211_RATE_MAXSIZE) {
 			nxrates = IEEE80211_RATE_MAXSIZE - rs->rs_nrates;
 			IEEE80211_NOTE(vap, IEEE80211_MSG_XRATE, ni,
 			    "extended rate set too large; only using "
 			    "%u of %u rates", nxrates, xrates[1]);
 			vap->iv_stats.is_rx_rstoobig++;
 		}
 		memcpy(rs->rs_rates + rs->rs_nrates, xrates+2, nxrates);
 		rs->rs_nrates += nxrates;
 	}
 	return ieee80211_fix_rate(ni, rs, flags);
 }
 
 /*
  * Send a management frame error response to the specified
  * station.  When ni is a node of its own (not the vap's bss node)
  * the frame is sent through it directly.  Otherwise a temporary
  * node for the given address is allocated, used for the
  * transmission, and released again so that it goes away once the
  * frame has been transmitted.
  */
 void
 ieee80211_send_error(struct ieee80211_node *ni,
 	const uint8_t mac[IEEE80211_ADDR_LEN], int subtype, int arg)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_node *tmp;
 
 	if (ni != vap->iv_bss) {
 		IEEE80211_SEND_MGMT(ni, subtype, arg);
 		return;
 	}
 
 	/*
 	 * XXX hack until we get rid of this routine.
 	 * We can be called prior to the vap reaching
 	 * run state under certain conditions in which
 	 * case iv_bss->ni_chan will not be setup.
 	 * Check for this explicitly and just ignore
 	 * the request.
 	 */
 	if (vap->iv_state != IEEE80211_S_RUN)
 		return;
 
 	tmp = ieee80211_tmp_node(vap, mac);
 	if (tmp == NULL) {
 		/* XXX msg */
 		return;
 	}
 	IEEE80211_SEND_MGMT(tmp, subtype, arg);
 	/* Drop our reference on the temporary node. */
 	ieee80211_free_node(tmp);
 }
 
 /*
  * Make sure the node has a buffer for the shared key challenge,
  * allocating one (without sleeping) if none is present yet.
  * Returns 1 when the buffer is available and 0 when the allocation
  * failed.
  */
 int
 ieee80211_alloc_challenge(struct ieee80211_node *ni)
 {
 	/* An existing buffer is reused as-is. */
 	if (ni->ni_challenge != NULL)
 		return (1);
 
 	ni->ni_challenge = (uint32_t *)
 	    IEEE80211_MALLOC(IEEE80211_CHALLENGE_LEN,
 	      M_80211_NODE, IEEE80211_M_NOWAIT);
 	if (ni->ni_challenge != NULL)
 		return (1);
 
 	IEEE80211_NOTE(ni->ni_vap,
 	    IEEE80211_MSG_DEBUG | IEEE80211_MSG_AUTH, ni,
 	    "%s", "shared key challenge alloc failed");
 	/* XXX statistic */
 	return (0);
 }
 
 /*
  * Parse a Beacon or ProbeResponse frame and return the
  * useful information in an ieee80211_scanparams structure.
  * Status is set to 0 if no problems were found; otherwise
  * a bitmask of IEEE80211_BPARSE_* items is returned that
  * describes the problems detected.
  */
 int
 ieee80211_parse_beacon(struct ieee80211_node *ni, struct mbuf *m,
 	struct ieee80211_channel *rxchan, struct ieee80211_scanparams *scan)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_frame *wh;
 	uint8_t *frm, *efrm;
 
 	wh = mtod(m, struct ieee80211_frame *);
 	frm = (uint8_t *)&wh[1];
 	efrm = mtod(m, uint8_t *) + m->m_len;
 	scan->status = 0;
 	/*
 	 * beacon/probe response frame format
 	 *
 	 * XXX Update from 802.11-2012 - eg where HT is
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] capability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[tlv] country information
 	 *	[tlv] channel switch announcement (CSA)
 	 *	[tlv] parameter set (FH/DS)
 	 *	[tlv] erp information
 	 *	[tlv] extended supported rates
 	 *	[tlv] WME
 	 *	[tlv] WPA or RSN
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	[tlv] VHT capabilities
 	 *	[tlv] VHT information
 	 *	[tlv] Atheros capabilities
 	 *	[tlv] Mesh ID
 	 *	[tlv] Mesh Configuration
 	 */
 	IEEE80211_VERIFY_LENGTH(efrm - frm, 12,
 	    return (scan->status = IEEE80211_BPARSE_BADIELEN));
 	memset(scan, 0, sizeof(*scan));
 	scan->tstamp  = frm;				frm += 8;
 	scan->bintval = le16toh(*(uint16_t *)frm);	frm += 2;
 	scan->capinfo = le16toh(*(uint16_t *)frm);	frm += 2;
 	scan->bchan = ieee80211_chan2ieee(ic, rxchan);
 	scan->chan = scan->bchan;
 	scan->ies = frm;
 	scan->ies_len = efrm - frm;
 
 	while (efrm - frm > 1) {
 		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2,
 		    return (scan->status = IEEE80211_BPARSE_BADIELEN));
 		switch (*frm) {
 		case IEEE80211_ELEMID_SSID:
 			scan->ssid = frm;
 			break;
 		case IEEE80211_ELEMID_RATES:
 			scan->rates = frm;
 			break;
 		case IEEE80211_ELEMID_COUNTRY:
 			scan->country = frm;
 			break;
 		case IEEE80211_ELEMID_CSA:
 			scan->csa = frm;
 			break;
 		case IEEE80211_ELEMID_QUIET:
 			scan->quiet = frm;
 			break;
 		case IEEE80211_ELEMID_FHPARMS:
 			if (ic->ic_phytype == IEEE80211_T_FH) {
 				scan->fhdwell = le16dec(&frm[2]);
 				scan->chan = IEEE80211_FH_CHAN(frm[4], frm[5]);
 				scan->fhindex = frm[6];
 			}
 			break;
 		case IEEE80211_ELEMID_DSPARMS:
 			/*
 			 * XXX hack this since depending on phytype
 			 * is problematic for multi-mode devices.
 			 */
 			if (ic->ic_phytype != IEEE80211_T_FH)
 				scan->chan = frm[2];
 			break;
 		case IEEE80211_ELEMID_TIM:
 			/* XXX ATIM? */
 			scan->tim = frm;
 			scan->timoff = frm - mtod(m, uint8_t *);
 			break;
 		case IEEE80211_ELEMID_IBSSPARMS:
 		case IEEE80211_ELEMID_CFPARMS:
 		case IEEE80211_ELEMID_PWRCNSTR:
 		case IEEE80211_ELEMID_BSSLOAD:
 		case IEEE80211_ELEMID_APCHANREP:
 			/* NB: avoid debugging complaints */
 			break;
 		case IEEE80211_ELEMID_XRATES:
 			scan->xrates = frm;
 			break;
 		case IEEE80211_ELEMID_ERP:
 			if (frm[1] != 1) {
 				IEEE80211_DISCARD_IE(vap,
 				    IEEE80211_MSG_ELEMID, wh, "ERP",
 				    "bad len %u", frm[1]);
 				vap->iv_stats.is_rx_elem_toobig++;
 				break;
 			}
 			scan->erp = frm[2] | 0x100;
 			break;
 		case IEEE80211_ELEMID_HTCAP:
 			scan->htcap = frm;
 			break;
 		case IEEE80211_ELEMID_VHT_CAP:
 			scan->vhtcap = frm;
 			break;
 		case IEEE80211_ELEMID_VHT_OPMODE:
 			scan->vhtopmode = frm;
 			break;
 		case IEEE80211_ELEMID_RSN:
 			scan->rsn = frm;
 			break;
 		case IEEE80211_ELEMID_HTINFO:
 			scan->htinfo = frm;
 			break;
 #ifdef IEEE80211_SUPPORT_MESH
 		case IEEE80211_ELEMID_MESHID:
 			scan->meshid = frm;
 			break;
 		case IEEE80211_ELEMID_MESHCONF:
 			scan->meshconf = frm;
 			break;
 #endif
 		/* Extended capabilities; nothing handles it for now */
 		case IEEE80211_ELEMID_EXTCAP:
 			break;
 		case IEEE80211_ELEMID_VENDOR:
 			if (iswpaoui(frm))
 				scan->wpa = frm;
 			else if (iswmeparam(frm) || iswmeinfo(frm))
 				scan->wme = frm;
 #ifdef IEEE80211_SUPPORT_SUPERG
 			else if (isatherosoui(frm))
 				scan->ath = frm;
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 			else if (istdmaoui(frm))
 				scan->tdma = frm;
 #endif
 			else if (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) {
 				/*
 				 * Accept pre-draft HT ie's if the
 				 * standard ones have not been seen.
 				 */
 				if (ishtcapoui(frm)) {
 					if (scan->htcap == NULL)
 						scan->htcap = frm;
 				} else if (ishtinfooui(frm)) {
 					if (scan->htinfo == NULL)
 						scan->htcap = frm;
 				}
 			}
 			break;
 		default:
 			IEEE80211_DISCARD_IE(vap, IEEE80211_MSG_ELEMID,
 			    wh, "unhandled",
 			    "id %u, len %u", *frm, frm[1]);
 			vap->iv_stats.is_rx_elem_unknown++;
 			break;
 		}
 		frm += frm[1] + 2;
 	}
 	IEEE80211_VERIFY_ELEMENT(scan->rates, IEEE80211_RATE_MAXSIZE,
 	    scan->status |= IEEE80211_BPARSE_RATES_INVALID);
 	if (scan->rates != NULL && scan->xrates != NULL) {
 		/*
 		 * NB: don't process XRATES if RATES is missing.  This
 		 * avoids a potential null ptr deref and should be ok
 		 * as the return code will already note RATES is missing
 		 * (so callers shouldn't otherwise process the frame).
 		 */
 		IEEE80211_VERIFY_ELEMENT(scan->xrates,
 		    IEEE80211_RATE_MAXSIZE - scan->rates[1],
 		    scan->status |= IEEE80211_BPARSE_XRATES_INVALID);
 	}
 	IEEE80211_VERIFY_ELEMENT(scan->ssid, IEEE80211_NWID_LEN,
 	    scan->status |= IEEE80211_BPARSE_SSID_INVALID);
 	if (scan->chan != scan->bchan && ic->ic_phytype != IEEE80211_T_FH) {
 		/*
 		 * Frame was received on a channel different from the
 		 * one indicated in the DS params element id;
 		 * silently discard it.
 		 *
 		 * NB: this can happen due to signal leakage.
 		 *     But we should take it for FH phy because
 		 *     the rssi value should be correct even for
 		 *     different hop pattern in FH.
 		 */
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_INPUT,
 		    wh, NULL, "for off-channel %u (bchan=%u)",
 		    scan->chan, scan->bchan);
 		vap->iv_stats.is_rx_chanmismatch++;
 		scan->status |= IEEE80211_BPARSE_OFFCHAN;
 	}
 	if (!(IEEE80211_BINTVAL_MIN <= scan->bintval &&
 	      scan->bintval <= IEEE80211_BINTVAL_MAX)) {
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_INPUT,
 		    wh, NULL, "bogus beacon interval (%d TU)",
 		    (int) scan->bintval);
 		vap->iv_stats.is_rx_badbintval++;
 		scan->status |= IEEE80211_BPARSE_BINTVAL_INVALID;
 	}
 	if (scan->country != NULL) {
 		/*
 		 * Validate we have at least enough data to extract
 		 * the country code.  Not sure if we should return an
 		 * error instead of discarding the IE; consider this
 		 * being lenient as we don't depend on the data for
 		 * correct operation.
 		 */
 		IEEE80211_VERIFY_LENGTH(scan->country[1], 3 * sizeof(uint8_t),
 		    scan->country = NULL);
 	}
 	if (scan->csa != NULL) {
 		/*
 		 * Validate Channel Switch Announcement; this must
 		 * be the correct length or we toss the frame.
 		 */
 		IEEE80211_VERIFY_LENGTH(scan->csa[1], 3 * sizeof(uint8_t),
 		    scan->status |= IEEE80211_BPARSE_CSA_INVALID);
 	}
 #ifdef IEEE80211_SUPPORT_MESH
 	if (scan->meshid != NULL) {
 		IEEE80211_VERIFY_ELEMENT(scan->meshid, IEEE80211_MESHID_LEN,
 		    scan->status |= IEEE80211_BPARSE_MESHID_INVALID);
 	}
 #endif
 	/*
 	 * Process HT ie's.  This is complicated by our
 	 * accepting both the standard ie's and the pre-draft
 	 * vendor OUI ie's that some vendors still use/require.
 	 */
 	if (scan->htcap != NULL) {
 		IEEE80211_VERIFY_LENGTH(scan->htcap[1],
 		     scan->htcap[0] == IEEE80211_ELEMID_VENDOR ?
 			 4 + sizeof(struct ieee80211_ie_htcap)-2 :
 			 sizeof(struct ieee80211_ie_htcap)-2,
 		     scan->htcap = NULL);
 	}
 	if (scan->htinfo != NULL) {
 		IEEE80211_VERIFY_LENGTH(scan->htinfo[1],
 		     scan->htinfo[0] == IEEE80211_ELEMID_VENDOR ?
 			 4 + sizeof(struct ieee80211_ie_htinfo)-2 :
 			 sizeof(struct ieee80211_ie_htinfo)-2,
 		     scan->htinfo = NULL);
 	}
 
 	/* Process VHT IEs */
 	if (scan->vhtcap != NULL) {
 		IEEE80211_VERIFY_LENGTH(scan->vhtcap[1],
 		    sizeof(struct ieee80211_ie_vhtcap) - 2,
 		    scan->vhtcap = NULL);
 	}
 	if (scan->vhtopmode != NULL) {
 		IEEE80211_VERIFY_LENGTH(scan->vhtopmode[1],
 		    sizeof(struct ieee80211_ie_vht_operation) - 2,
 		    scan->vhtopmode = NULL);
 	}
 
 	return scan->status;
 }
 
 /*
  * Parse an Action frame.  Return 0 on success, non-zero on failure.
  *
  * Only validation happens here; the payload is not acted upon.  The
  * frame must hold at least a struct ieee80211_action and, for the
  * category/action pairs handled below, the matching fixed-size
  * structure.  Every failure path in this function yields EINVAL.
  * Category/action pairs with no case below are accepted unchecked.
  *
  * NOTE(review): IEEE80211_VERIFY_LENGTH and IEEE80211_DISCARD are
  * defined elsewhere; they presumably reference the locals "vap" and
  * "wh" when logging -- confirm before renaming either.
  */
 int
 ieee80211_parse_action(struct ieee80211_node *ni, struct mbuf *m)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	const struct ieee80211_action *ia;
 	struct ieee80211_frame *wh;
 	uint8_t *frm, *efrm;
 
 	/*
 	 * action frame format:
 	 *	[1] category
 	 *	[1] action
 	 *	[tlv] parameters
 	 */
 	wh = mtod(m, struct ieee80211_frame *);
 	/* frm: first byte after the 802.11 header; efrm: end of mbuf data */
 	frm = (u_int8_t *)&wh[1];
 	efrm = mtod(m, u_int8_t *) + m->m_len;
 	IEEE80211_VERIFY_LENGTH(efrm - frm,
 		sizeof(struct ieee80211_action), return EINVAL);
 	ia = (const struct ieee80211_action *) frm;
 
 	/* counted once the generic header is known to be present */
 	vap->iv_stats.is_rx_action++;
 	IEEE80211_NODE_STAT(ni, rx_action);
 
 	/* verify frame payloads but defer processing */
 	switch (ia->ia_category) {
 	case IEEE80211_ACTION_CAT_BA:
 		switch (ia->ia_action) {
 		case IEEE80211_ACTION_BA_ADDBA_REQUEST:
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_action_ba_addbarequest),
 			    return EINVAL);
 			break;
 		case IEEE80211_ACTION_BA_ADDBA_RESPONSE:
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_action_ba_addbaresponse),
 			    return EINVAL);
 			break;
 		case IEEE80211_ACTION_BA_DELBA:
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_action_ba_delba),
 			    return EINVAL);
 			break;
 		}
 		break;
 	case IEEE80211_ACTION_CAT_HT:
 		switch (ia->ia_action) {
 		case IEEE80211_ACTION_HT_TXCHWIDTH:
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_action_ht_txchwidth),
 			    return EINVAL);
 			break;
 		case IEEE80211_ACTION_HT_MIMOPWRSAVE:
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_action_ht_mimopowersave),
 			    return EINVAL);
 			break;
 		}
 		break;
 #ifdef IEEE80211_SUPPORT_MESH
 	case IEEE80211_ACTION_CAT_MESH:
 		switch (ia->ia_action) {
 		case IEEE80211_ACTION_MESH_LMETRIC:
 			/*
 			 * XXX: verification is true only if we are using
 			 * Airtime link metric (default)
 			 */
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_meshlmetric_ie),
 			    return EINVAL);
 			break;
 		case IEEE80211_ACTION_MESH_HWMP:
 			/* XXX: no length verification is done for HWMP yet */
 			break;
 		case IEEE80211_ACTION_MESH_GANN:
 			IEEE80211_VERIFY_LENGTH(efrm - frm,
 			    sizeof(struct ieee80211_meshgann_ie),
 			    return EINVAL);
 			break;
 		case IEEE80211_ACTION_MESH_CC:
 		case IEEE80211_ACTION_MESH_MCCA_SREQ:
 		case IEEE80211_ACTION_MESH_MCCA_SREP:
 		case IEEE80211_ACTION_MESH_MCCA_AREQ:
 		case IEEE80211_ACTION_MESH_MCCA_ADVER:
 		case IEEE80211_ACTION_MESH_MCCA_TRDOWN:
 		case IEEE80211_ACTION_MESH_TBTT_REQ:
 		case IEEE80211_ACTION_MESH_TBTT_RES:
 			/* reject these early on, not implemented */
 			IEEE80211_DISCARD(vap,
 			    IEEE80211_MSG_ELEMID | IEEE80211_MSG_INPUT,
 			    wh, NULL, "not implemented yet, act=0x%02X",
 			    ia->ia_action);
 			return EINVAL;
 		}
 		break;
 	case IEEE80211_ACTION_CAT_SELF_PROT:
 		/* If TA or RA group address discard silently */
 		if (IEEE80211_IS_MULTICAST(wh->i_addr1) ||
 			IEEE80211_IS_MULTICAST(wh->i_addr2))
 			return EINVAL;
 		/*
 		 * XXX: Should we verify the complete length now, or do
 		 * these frames vary too much in size?
 		 */
 		switch (ia->ia_action) {
 		case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 		case IEEE80211_ACTION_MESHPEERING_CLOSE:
 			/* is not a peering candidate (yet) */
 			if (ni == vap->iv_bss)
 				return EINVAL;
 			break;
 		}
 		break;
 #endif
 	case IEEE80211_ACTION_CAT_VHT:
 		/* VHT action frames are accepted but only logged for now */
 		printf("%s: TODO: VHT handling!\n", __func__);
 		break;
 	}
 	return 0;
 }
 
 #ifdef IEEE80211_DEBUG
 /*
  * Debugging support.
  */
 /*
  * Log that a frame of kind "tag" from "mac" was dropped because its SSID
  * did not match.  "ssid" is used as a length-prefixed element: ssid[1]
  * is the name length and the name itself starts at ssid[2].
  */
 void
 ieee80211_ssid_mismatch(struct ieee80211vap *vap, const char *tag,
 	uint8_t mac[IEEE80211_ADDR_LEN], uint8_t *ssid)
 {
 	const uint8_t *essid = &ssid[2];
 	const int essid_len = ssid[1];
 
 	printf("[%s] discard %s frame, ssid mismatch: ",
 		ether_sprintf(mac), tag);
 	ieee80211_print_essid(essid, essid_len);
 	printf("\n");
 }
 
 /*
  * Pick the address field of a frame that is reported as its bssid:
  * addr2 when the vap runs in station mode; otherwise addr1 for frames
  * whose direction bits are not NODS and for PS-Poll subtype frames;
  * addr3 in every remaining case.
  */
 static const uint8_t *
 ieee80211_getbssid(const struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh)
 {
 	const uint8_t dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	const uint8_t subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 
 	if (vap->iv_opmode == IEEE80211_M_STA)
 		return wh->i_addr2;
 	if (dir != IEEE80211_FC1_DIR_NODS ||
 	    subtype == IEEE80211_FC0_SUBTYPE_PS_POLL)
 		return wh->i_addr1;
 	return wh->i_addr3;
 }
 
 #include <machine/stdarg.h>
 
 /*
  * Format a message and emit it through if_printf() on the vap's ifnet.
  * No newline is appended.  A second line is printed if the formatted
  * text did not fit in the local buffer.
  */
 void
 ieee80211_note(const struct ieee80211vap *vap, const char *fmt, ...)
 {
 	char msg[256];		/* XXX */
 	va_list args;
 	int n;
 
 	va_start(args, fmt);
 	n = vsnprintf(msg, sizeof(msg), fmt, args);
 	va_end(args);
 
 	if_printf(vap->iv_ifp, "%s", msg);	/* NB: no \n */
 
 	/* n is the length the full message needed; anything else fit */
 	if (n < sizeof(msg))
 		return;
 	printf("%s: XXX buffer too small: len = %d\n", __func__, n);
 }
 
 /*
  * Format a message and print it on the vap's ifnet, prefixed with the
  * address ieee80211_getbssid() selects for frame "wh".  A second line is
  * printed if the formatted text did not fit in the local buffer.
  */
 void
 ieee80211_note_frame(const struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh,
 	const char *fmt, ...)
 {
 	char msg[256];		/* XXX */
 	va_list args;
 	int n;
 
 	va_start(args, fmt);
 	n = vsnprintf(msg, sizeof(msg), fmt, args);
 	va_end(args);
 	if_printf(vap->iv_ifp, "[%s] %s\n",
 		ether_sprintf(ieee80211_getbssid(vap, wh)), msg);
 
 	/* n is the length the full message needed; anything else fit */
 	if (n < sizeof(msg))
 		return;
 	printf("%s: XXX buffer too small: len = %d\n", __func__, n);
 }
 
 /*
  * Format a message and print it on the vap's ifnet, prefixed with the
  * given MAC address.  A second line is printed if the formatted text
  * did not fit in the local buffer.
  */
 void
 ieee80211_note_mac(const struct ieee80211vap *vap,
 	const uint8_t mac[IEEE80211_ADDR_LEN],
 	const char *fmt, ...)
 {
 	char msg[256];		/* XXX */
 	va_list args;
 	int n;
 
 	va_start(args, fmt);
 	n = vsnprintf(msg, sizeof(msg), fmt, args);
 	va_end(args);
 	if_printf(vap->iv_ifp, "[%s] %s\n", ether_sprintf(mac), msg);
 
 	/* n is the length the full message needed; anything else fit */
 	if (n < sizeof(msg))
 		return;
 	printf("%s: XXX buffer too small: len = %d\n", __func__, n);
 }
 
 /*
  * Report a discarded frame on the vap's ifnet.  The frame is described
  * by "type" when given, otherwise by the name
  * ieee80211_mgt_subtype_name() returns for the first frame-control
  * byte.  A second line is printed if the formatted reason did not fit
  * in the local buffer.
  */
 void
 ieee80211_discard_frame(const struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh,
 	const char *type, const char *fmt, ...)
 {
 	char reason[256];		/* XXX */
 	const char *what;
 	va_list args;
 	int n;
 
 	va_start(args, fmt);
 	n = vsnprintf(reason, sizeof(reason), fmt, args);
 	va_end(args);
 
 	if (type == NULL)
 		what = ieee80211_mgt_subtype_name(wh->i_fc[0]);
 	else
 		what = type;
 	if_printf(vap->iv_ifp, "[%s] discard %s frame, %s\n",
 	    ether_sprintf(ieee80211_getbssid(vap, wh)), what, reason);
 
 	/* n is the length the full reason needed; anything else fit */
 	if (n < sizeof(reason))
 		return;
 	printf("%s: XXX buffer too small: len = %d\n", __func__, n);
 }
 
 /*
  * Report a discarded information element on the vap's ifnet.  When
  * "type" is given it is inserted, preceded by a blank, after the word
  * "discard"; when it is NULL nothing is inserted.  A second line is
  * printed if the formatted reason did not fit in the local buffer.
  */
 void
 ieee80211_discard_ie(const struct ieee80211vap *vap,
 	const struct ieee80211_frame *wh,
 	const char *type, const char *fmt, ...)
 {
 	char reason[256];		/* XXX */
 	const char *sep, *what;
 	va_list args;
 	int n;
 
 	va_start(args, fmt);
 	n = vsnprintf(reason, sizeof(reason), fmt, args);
 	va_end(args);
 
 	if (type != NULL) {
 		sep = " ";
 		what = type;
 	} else {
 		sep = "";
 		what = "";
 	}
 	if_printf(vap->iv_ifp, "[%s] discard%s%s information element, %s\n",
 	    ether_sprintf(ieee80211_getbssid(vap, wh)), sep, what, reason);
 
 	/* n is the length the full reason needed; anything else fit */
 	if (n < sizeof(reason))
 		return;
 	printf("%s: XXX buffer too small: len = %d\n", __func__, n);
 }
 
 /*
  * Report a discarded frame on the vap's ifnet, identified by an
  * explicit MAC address.  When "type" is given it is inserted, preceded
  * by a blank, after the word "discard".  A second line is printed if
  * the formatted reason did not fit in the local buffer.
  */
 void
 ieee80211_discard_mac(const struct ieee80211vap *vap,
 	const uint8_t mac[IEEE80211_ADDR_LEN],
 	const char *type, const char *fmt, ...)
 {
 	char reason[256];		/* XXX */
 	const char *sep, *what;
 	va_list args;
 	int n;
 
 	va_start(args, fmt);
 	n = vsnprintf(reason, sizeof(reason), fmt, args);
 	va_end(args);
 
 	if (type != NULL) {
 		sep = " ";
 		what = type;
 	} else {
 		sep = "";
 		what = "";
 	}
 	if_printf(vap->iv_ifp, "[%s] discard%s%s frame, %s\n",
 	    ether_sprintf(mac), sep, what, reason);
 
 	/* n is the length the full reason needed; anything else fit */
 	if (n < sizeof(reason))
 		return;
 	printf("%s: XXX buffer too small: len = %d\n", __func__, n);
 }
 #endif /* IEEE80211_DEBUG */
diff --git a/sys/net80211/ieee80211_ioctl.c b/sys/net80211/ieee80211_ioctl.c
index 839f965f542f..0fbbb74e1e65 100644
--- a/sys/net80211/ieee80211_ioctl.c
+++ b/sys/net80211/ieee80211_ioctl.c
@@ -1,3697 +1,3698 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Atsushi Onoe
  * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 ioctl support (FreeBSD-specific)
  */
 
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/endian.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #ifdef INET
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #endif
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_ioctl.h>
 #include <net80211/ieee80211_regdomain.h>
 #include <net80211/ieee80211_input.h>
 
 /*
  * True when the vap's ifnet satisfies IFNET_IS_UP_RUNNING() and the
  * vap's roaming mode is IEEE80211_ROAMING_AUTO.
  */
 #define	IS_UP_AUTO(_vap) \
 	(IFNET_IS_UP_RUNNING((_vap)->iv_ifp) && \
 	 (_vap)->iv_roaming == IEEE80211_ROAMING_AUTO)
 
 /* All-zero address; has static storage and no initializer, so it is zeroed. */
 static const uint8_t zerobssid[IEEE80211_ADDR_LEN];
 /* Forward declarations of helpers defined elsewhere in this file. */
 static struct ieee80211_channel *findchannel(struct ieee80211com *,
 		int ieee, int mode);
 static int ieee80211_scanreq(struct ieee80211vap *,
 		struct ieee80211_scan_req *);
 
 /*
  * Return information about one key to user space.
  *
  * The request (struct ieee80211req_key) is copied in; ik_keyix selects
  * either one of the vap's iv_nw_keys[] slots or, when it is
  * IEEE80211_KEYIX_NONE, the unicast key of the station named by
  * ik_macaddr.  Key material and the rsc/tsc counters are filled in only
  * when ieee80211_priv_check_vap_getkey() returns 0; otherwise they are
  * zeroed.
  *
  * Returns EINVAL for a wrong request size or key index, ENOENT when the
  * station is unknown, or the copyin/copyout result.
  */
 static int
 ieee80211_ioctl_getkey(u_long cmd, struct ieee80211vap *vap,
     struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node *ni = NULL;
 	struct ieee80211req_key ik;
 	struct ieee80211_key *wk;
 	const struct ieee80211_cipher *cip;
 	u_int kid;
 	int error;
 
 	if (ireq->i_len != sizeof(ik))
 		return EINVAL;
 	error = copyin(ireq->i_data, &ik, sizeof(ik));
 	if (error)
 		return error;
 
 	kid = ik.ik_keyix;
 	if (kid != IEEE80211_KEYIX_NONE) {
 		/* one of the vap's key slots; report the bss address */
 		if (kid >= IEEE80211_WEP_NKID)
 			return EINVAL;
 		wk = &vap->iv_nw_keys[kid];
 		IEEE80211_ADDR_COPY(&ik.ik_macaddr, vap->iv_bss->ni_macaddr);
 	} else {
 		/* per-station unicast key; the node ref is dropped below */
 		ni = ieee80211_find_vap_node(&ic->ic_sta, vap, ik.ik_macaddr);
 		if (ni == NULL)
 			return ENOENT;
 		wk = &ni->ni_ucastkey;
 	}
 
 	cip = wk->wk_cipher;
 	ik.ik_type = cip->ic_cipher;
 	ik.ik_keylen = wk->wk_keylen;
 	ik.ik_flags = wk->wk_flags & (IEEE80211_KEY_XMIT | IEEE80211_KEY_RECV);
 	if (wk->wk_keyix == vap->iv_def_txkey)
 		ik.ik_flags |= IEEE80211_KEY_DEFAULT;
 
 	if (ieee80211_priv_check_vap_getkey(cmd, vap, NULL) != 0) {
 		/* unprivileged caller: hand back no key material */
 		ik.ik_keyrsc = 0;
 		ik.ik_keytsc = 0;
 		memset(ik.ik_keydata, 0, sizeof(ik.ik_keydata));
 	} else {
 		/* NB: only root can read key data */
 		ik.ik_keyrsc = wk->wk_keyrsc[IEEE80211_NONQOS_TID];
 		ik.ik_keytsc = wk->wk_keytsc;
 		memcpy(ik.ik_keydata, wk->wk_key, wk->wk_keylen);
 		if (cip->ic_cipher == IEEE80211_CIPHER_TKIP) {
 			/* TKIP: append the MIC bytes kept after the key buffer */
 			memcpy(ik.ik_keydata+wk->wk_keylen,
 				wk->wk_key + IEEE80211_KEYBUF_SIZE,
 				IEEE80211_MICBUF_SIZE);
 			ik.ik_keylen += IEEE80211_MICBUF_SIZE;
 		}
 	}
 	if (ni != NULL)
 		ieee80211_free_node(ni);
 	return copyout(&ik, ireq->i_data, sizeof(ik));
 }
 
 /*
  * Copy ic_chan_active out to the caller.  A request longer than the
  * object is clamped to its size (and i_len updated); a shorter request
  * receives only the leading i_len bytes.
  */
 static int
 ieee80211_ioctl_getchanlist(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	if (ireq->i_len > sizeof(ic->ic_chan_active))
 		ireq->i_len = sizeof(ic->ic_chan_active);
 	return copyout(&ic->ic_chan_active, ireq->i_data, ireq->i_len);
 }
 
 /*
  * Copy the channel table out to the caller, starting at ic_nchans and
  * covering as many bytes as a struct ieee80211req_chaninfo holding
  * ic_nchans entries needs, limited to the caller's i_len.
  */
 static int
 ieee80211_ioctl_getchaninfo(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	uint32_t nbytes;
 
 	nbytes = __offsetof(struct ieee80211req_chaninfo,
 			ic_chans[ic->ic_nchans]);
 	if (ireq->i_len < nbytes)
 		nbytes = ireq->i_len;
 	/* XXX assumes compatible layout */
 	return copyout(&ic->ic_nchans, ireq->i_data, nbytes);
 }
 
 static int
 ieee80211_ioctl_getwpaie(struct ieee80211vap *vap,
 	struct ieee80211req *ireq, int req)
 {
 	struct ieee80211_node *ni;
 	struct ieee80211req_wpaie2 *wpaie;
 	int error;
 
 	if (ireq->i_len < IEEE80211_ADDR_LEN)
 		return EINVAL;
 	wpaie = IEEE80211_MALLOC(sizeof(*wpaie), M_TEMP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (wpaie == NULL)
 		return ENOMEM;
 	error = copyin(ireq->i_data, wpaie->wpa_macaddr, IEEE80211_ADDR_LEN);
 	if (error != 0)
 		goto bad;
 	ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap, wpaie->wpa_macaddr);
 	if (ni == NULL) {
 		error = ENOENT;
 		goto bad;
 	}
 	if (ni->ni_ies.wpa_ie != NULL) {
 		int ielen = ni->ni_ies.wpa_ie[1] + 2;
 		if (ielen > sizeof(wpaie->wpa_ie))
 			ielen = sizeof(wpaie->wpa_ie);
 		memcpy(wpaie->wpa_ie, ni->ni_ies.wpa_ie, ielen);
 	}
 	if (req == IEEE80211_IOC_WPAIE2) {
 		if (ni->ni_ies.rsn_ie != NULL) {
 			int ielen = ni->ni_ies.rsn_ie[1] + 2;
 			if (ielen > sizeof(wpaie->rsn_ie))
 				ielen = sizeof(wpaie->rsn_ie);
 			memcpy(wpaie->rsn_ie, ni->ni_ies.rsn_ie, ielen);
 		}
 		if (ireq->i_len > sizeof(struct ieee80211req_wpaie2))
 			ireq->i_len = sizeof(struct ieee80211req_wpaie2);
 	} else {
 		/* compatibility op, may overwrite wpa ie */
 		/* XXX check ic_flags? */
 		if (ni->ni_ies.rsn_ie != NULL) {
 			int ielen = ni->ni_ies.rsn_ie[1] + 2;
 			if (ielen > sizeof(wpaie->wpa_ie))
 				ielen = sizeof(wpaie->wpa_ie);
 			memcpy(wpaie->wpa_ie, ni->ni_ies.rsn_ie, ielen);
 		}
 		if (ireq->i_len > sizeof(struct ieee80211req_wpaie))
 			ireq->i_len = sizeof(struct ieee80211req_wpaie);
 	}
 	ieee80211_free_node(ni);
 	error = copyout(wpaie, ireq->i_data, ireq->i_len);
 bad:
 	IEEE80211_FREE(wpaie, M_TEMP);
 	return error;
 }
 
 static int
 ieee80211_ioctl_getstastats(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_node *ni;
 	uint8_t macaddr[IEEE80211_ADDR_LEN];
 	const size_t off = __offsetof(struct ieee80211req_sta_stats, is_stats);
 	int error;
 
 	if (ireq->i_len < off)
 		return EINVAL;
 	error = copyin(ireq->i_data, macaddr, IEEE80211_ADDR_LEN);
 	if (error != 0)
 		return error;
 	ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap, macaddr);
 	if (ni == NULL)
 		return ENOENT;
 	if (ireq->i_len > sizeof(struct ieee80211req_sta_stats))
 		ireq->i_len = sizeof(struct ieee80211req_sta_stats);
 	/* NB: copy out only the statistics */
 	error = copyout(&ni->ni_stats, (uint8_t *) ireq->i_data + off,
 			ireq->i_len - off);
 	ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * State shared by the scan-table iterator callbacks below:
  * get_scan_space() adds each entry's size to "space", and
  * get_scan_result() writes records at "sr", advancing "sr" and
  * reducing "space" as it goes.
  */
 struct scanreq {
 	struct ieee80211req_scan_result *sr;	/* where the next record goes */
 	size_t space;				/* bytes needed / bytes left */
 };
 
 /*
  * Compute how many bytes one scan entry occupies in the result buffer:
  * the fixed header plus ssid, mesh id and ie data, rounded up to a
  * multiple of sizeof(uint32_t).  The ie data length is also returned
  * through *ielen.
  */
 static size_t
 scan_space(const struct ieee80211_scan_entry *se, int *ielen)
 {
 	size_t total;
 
 	*ielen = se->se_ies.len;
 	/*
 	 * NB: ie's can be no more than 255 bytes and the max 802.11
 	 * packet is <3Kbytes so we are sure this doesn't overflow
 	 * 16-bits; if this is a concern we can drop the ie's.
 	 */
 	total = sizeof(struct ieee80211req_scan_result);
 	total += se->se_ssid[1];
 	total += se->se_meshid[1];
 	total += *ielen;
 	return roundup(total, sizeof(uint32_t));
 }
 
 /*
  * Scan iterator callback (sizing pass): add the space this entry needs
  * to the running total in the struct scanreq passed as "arg".
  */
 static void
 get_scan_space(void *arg, const struct ieee80211_scan_entry *se)
 {
 	struct scanreq *sreq = arg;
 	int ielen_unused;
 
 	sreq->space += scan_space(se, &ielen_unused);
 }
 
 /*
  * Scan iterator callback (fill pass): serialize one scan entry at
  * req->sr and advance the cursor.  Entries that no longer fit in the
  * remaining space are silently skipped.
  *
  * Record layout: struct ieee80211req_scan_result, then the ssid bytes,
  * the mesh id bytes and finally the ie data.
  */
 static void
 get_scan_result(void *arg, const struct ieee80211_scan_entry *se)
 {
 	struct scanreq *req = arg;
 	struct ieee80211req_scan_result *sr = req->sr;
 	int ielen, len, nr, nxr;
 	uint8_t *dst;
 
 	len = scan_space(se, &ielen);
 	if (len > req->space)
 		return;
 
 	KASSERT(len <= 65535 && ielen <= 65535,
 	    ("len %u ssid %u ie %u", len, se->se_ssid[1], ielen));
 
 	/* fixed-size part of the record */
 	sr->isr_len = len;
 	sr->isr_ie_off = sizeof(struct ieee80211req_scan_result);
 	sr->isr_ie_len = ielen;
 	sr->isr_freq = se->se_chan->ic_freq;
 	sr->isr_flags = se->se_chan->ic_flags;
 	sr->isr_rssi = se->se_rssi;
 	sr->isr_noise = se->se_noise;
 	sr->isr_intval = se->se_intval;
 	sr->isr_capinfo = se->se_capinfo;
 	sr->isr_erp = se->se_erp;
 	IEEE80211_ADDR_COPY(sr->isr_bssid, se->se_bssid);
 
 	/* basic rates first, then extended rates in whatever room is left */
 	nr = min(se->se_rates[1], IEEE80211_RATE_MAXSIZE);
 	memcpy(sr->isr_rates, se->se_rates+2, nr);
 	nxr = min(se->se_xrates[1], IEEE80211_RATE_MAXSIZE - nr);
 	memcpy(sr->isr_rates+nr, se->se_xrates+2, nxr);
 	sr->isr_nrates = nr + nxr;
 
 	/* variable-size part: ssid, mesh id, ie data, back to back */
 	dst = ((uint8_t *)sr) + sr->isr_ie_off;
 	sr->isr_ssid_len = se->se_ssid[1];
 	memcpy(dst, se->se_ssid+2, sr->isr_ssid_len);
 	dst += sr->isr_ssid_len;
 
 	sr->isr_meshid_len = se->se_meshid[1];
 	memcpy(dst, se->se_meshid+2, sr->isr_meshid_len);
 	dst += sr->isr_meshid_len;
 
 	if (ielen != 0)
 		memcpy(dst, se->se_ies.data, ielen);
 
 	/* step past this record */
 	req->sr = (struct ieee80211req_scan_result *)(((uint8_t *)sr) + len);
 	req->space -= len;
 }
 
 static int
 ieee80211_ioctl_getscanresults(struct ieee80211vap *vap,
 	struct ieee80211req *ireq)
 {
 	struct scanreq req;
 	int error;
 
 	if (ireq->i_len < sizeof(struct scanreq))
 		return EFAULT;
 
 	error = 0;
 	req.space = 0;
 	ieee80211_scan_iterate(vap, get_scan_space, &req);
 	if (req.space > ireq->i_len)
 		req.space = ireq->i_len;
 	if (req.space > 0) {
 		uint32_t space;
 		void *p;
 
 		space = req.space;
 		/* XXX IEEE80211_M_WAITOK after driver lock released */
 		p = IEEE80211_MALLOC(space, M_TEMP,
 		    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 		if (p == NULL)
 			return ENOMEM;
 		req.sr = p;
 		ieee80211_scan_iterate(vap, get_scan_result, &req);
 		ireq->i_len = space - req.space;
 		error = copyout(p, ireq->i_data, ireq->i_len);
 		IEEE80211_FREE(p, M_TEMP);
 	} else
 		ireq->i_len = 0;
 
 	return error;
 }
 
 /*
  * State shared by the station iterator callbacks below:
  * get_sta_space() adds each node's size to "space", and get_sta_info()
  * writes records at "si", advancing "si" and reducing "space".
  */
 struct stainforeq {
 	struct ieee80211req_sta_info *si;	/* where the next record goes */
 	size_t	space;				/* bytes needed / bytes left */
 };
 
 /*
  * Compute how many bytes one station record occupies: the fixed
  * struct ieee80211req_sta_info plus the node's ie data, rounded up to a
  * multiple of sizeof(uint32_t).  The ie length is returned via *ielen.
  */
 static size_t
 sta_space(const struct ieee80211_node *ni, size_t *ielen)
 {
 	size_t total;
 
 	*ielen = ni->ni_ies.len;
 	total = sizeof(struct ieee80211req_sta_info) + *ielen;
 	return roundup(total, sizeof(uint32_t));
 }
 
 /*
  * Node iterator callback (sizing pass): add the space this node's
  * record needs to the running total.  On a hostap vap, nodes with a
  * zero association id are left out.
  */
 static void
 get_sta_space(void *arg, struct ieee80211_node *ni)
 {
 	struct stainforeq *sreq = arg;
 	size_t ielen_unused;
 	const int is_hostap = (ni->ni_vap->iv_opmode == IEEE80211_M_HOSTAP);
 
 	/* only associated stations */
 	if (is_hostap && ni->ni_associd == 0)
 		return;
 	sreq->space += sta_space(ni, &ielen_unused);
 }
 
 static void
 get_sta_info(void *arg, struct ieee80211_node *ni)
 {
 	struct stainforeq *req = arg;
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211req_sta_info *si;
 	size_t ielen, len;
 	uint8_t *cp;
 
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP &&
 	    ni->ni_associd == 0)	/* only associated stations */
 		return;
 	if (ni->ni_chan == IEEE80211_CHAN_ANYC)	/* XXX bogus entry */
 		return;
 	len = sta_space(ni, &ielen);
 	if (len > req->space)
 		return;
 	si = req->si;
 	si->isi_len = len;
 	si->isi_ie_off = sizeof(struct ieee80211req_sta_info);
 	si->isi_ie_len = ielen;
 	si->isi_freq = ni->ni_chan->ic_freq;
 	si->isi_flags = ni->ni_chan->ic_flags;
 	si->isi_state = ni->ni_flags;
 	si->isi_authmode = ni->ni_authmode;
 	vap->iv_ic->ic_node_getsignal(ni, &si->isi_rssi, &si->isi_noise);
 	vap->iv_ic->ic_node_getmimoinfo(ni, &si->isi_mimo);
 	si->isi_capinfo = ni->ni_capinfo;
 	si->isi_erp = ni->ni_erp;
 	IEEE80211_ADDR_COPY(si->isi_macaddr, ni->ni_macaddr);
 	si->isi_nrates = ni->ni_rates.rs_nrates;
 	if (si->isi_nrates > 15)
 		si->isi_nrates = 15;
 	memcpy(si->isi_rates, ni->ni_rates.rs_rates, si->isi_nrates);
 	si->isi_txrate = ni->ni_txrate;
 	if (si->isi_txrate & IEEE80211_RATE_MCS) {
 		const struct ieee80211_mcs_rates *mcs =
 		    &ieee80211_htrates[ni->ni_txrate &~ IEEE80211_RATE_MCS];
 		if (IEEE80211_IS_CHAN_HT40(ni->ni_chan)) {
 			if (ni->ni_flags & IEEE80211_NODE_SGI40)
 				si->isi_txmbps = mcs->ht40_rate_800ns;
 			else
 				si->isi_txmbps = mcs->ht40_rate_400ns;
 		} else {
 			if (ni->ni_flags & IEEE80211_NODE_SGI20)
 				si->isi_txmbps = mcs->ht20_rate_800ns;
 			else
 				si->isi_txmbps = mcs->ht20_rate_400ns;
 		}
 	} else
 		si->isi_txmbps = si->isi_txrate;
 	si->isi_associd = ni->ni_associd;
 	si->isi_txpower = ni->ni_txpower;
 	si->isi_vlan = ni->ni_vlan;
 	if (ni->ni_flags & IEEE80211_NODE_QOS) {
 		memcpy(si->isi_txseqs, ni->ni_txseqs, sizeof(ni->ni_txseqs));
 		memcpy(si->isi_rxseqs, ni->ni_rxseqs, sizeof(ni->ni_rxseqs));
 	} else {
 		si->isi_txseqs[0] = ni->ni_txseqs[IEEE80211_NONQOS_TID];
 		si->isi_rxseqs[0] = ni->ni_rxseqs[IEEE80211_NONQOS_TID];
 	}
 	/* NB: leave all cases in case we relax ni_associd == 0 check */
 	if (ieee80211_node_is_authorized(ni))
 		si->isi_inact = vap->iv_inact_run;
 	else if (ni->ni_associd != 0 ||
 	    (vap->iv_opmode == IEEE80211_M_WDS &&
 	     (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY)))
 		si->isi_inact = vap->iv_inact_auth;
 	else
 		si->isi_inact = vap->iv_inact_init;
 	si->isi_inact = (si->isi_inact - ni->ni_inact) * IEEE80211_INACT_WAIT;
 	si->isi_localid = ni->ni_mllid;
 	si->isi_peerid = ni->ni_mlpid;
 	si->isi_peerstate = ni->ni_mlstate;
 
 	if (ielen) {
 		cp = ((uint8_t *)si) + si->isi_ie_off;
 		memcpy(cp, ni->ni_ies.data, ielen);
 	}
 
 	req->si = (struct ieee80211req_sta_info *)(((uint8_t *)si) + len);
 	req->space -= len;
 }
 
 static int
 getstainfo_common(struct ieee80211vap *vap, struct ieee80211req *ireq,
 	struct ieee80211_node *ni, size_t off)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct stainforeq req;
 	size_t space;
 	void *p;
 	int error;
 
 	error = 0;
 	req.space = 0;
 	if (ni == NULL) {
 		ieee80211_iterate_nodes_vap(&ic->ic_sta, vap, get_sta_space,
 		    &req);
 	} else
 		get_sta_space(&req, ni);
 	if (req.space > ireq->i_len)
 		req.space = ireq->i_len;
 	if (req.space > 0) {
 		space = req.space;
 		/* XXX IEEE80211_M_WAITOK after driver lock released */
 		p = IEEE80211_MALLOC(space, M_TEMP,
 		    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 		if (p == NULL) {
 			error = ENOMEM;
 			goto bad;
 		}
 		req.si = p;
 		if (ni == NULL) {
 			ieee80211_iterate_nodes_vap(&ic->ic_sta, vap,
 			    get_sta_info, &req);
 		} else
 			get_sta_info(&req, ni);
 		ireq->i_len = space - req.space;
 		error = copyout(p, (uint8_t *) ireq->i_data+off, ireq->i_len);
 		IEEE80211_FREE(p, M_TEMP);
 	} else
 		ireq->i_len = 0;
 bad:
 	if (ni != NULL)
 		ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * Return station information for one station or for all of them.
  *
  * The MAC address at the start of the user buffer selects the station;
  * the interface broadcast address requests every station.  Results are
  * written at the offset of the "info" member of
  * struct ieee80211req_sta_req.
  *
  * Returns EFAULT for a too-short request, ENOENT when the station is
  * unknown, or whatever copyin / getstainfo_common() returns.
  */
 static int
 ieee80211_ioctl_getstainfo(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	const size_t info_off = __offsetof(struct ieee80211req_sta_req, info);
 	uint8_t macaddr[IEEE80211_ADDR_LEN];
 	struct ieee80211_node *ni = NULL;
 	int error;
 
 	if (ireq->i_len < sizeof(struct ieee80211req_sta_req))
 		return EFAULT;
 	error = copyin(ireq->i_data, macaddr, IEEE80211_ADDR_LEN);
 	if (error != 0)
 		return error;
 	if (!IEEE80211_ADDR_EQ(macaddr, vap->iv_ifp->if_broadcastaddr)) {
 		/* a specific station; the reference is released by the callee */
 		ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap, macaddr);
 		if (ni == NULL)
 			return ENOENT;
 	}
 	return getstainfo_common(vap, ireq, ni, info_off);
 }
 
 static int
 ieee80211_ioctl_getstatxpow(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_node *ni;
 	struct ieee80211req_sta_txpow txpow;
 	int error;
 
 	if (ireq->i_len != sizeof(txpow))
 		return EINVAL;
 	error = copyin(ireq->i_data, &txpow, sizeof(txpow));
 	if (error != 0)
 		return error;
 	ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap, txpow.it_macaddr);
 	if (ni == NULL)
 		return ENOENT;
 	txpow.it_txpow = ni->ni_txpower;
 	error = copyout(&txpow, ireq->i_data, sizeof(txpow));
 	ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * Read one WME parameter into ireq->i_val.
  *
  * i_len is not a length here: its IEEE80211_WMEPARAM_VAL bits carry
  * the access category (out-of-range values fall back to WME_AC_BE) and
  * its IEEE80211_WMEPARAM_BSS bit selects the bss parameter set instead
  * of the channel one.  ACM is always read from the bss set and the ACK
  * policy always from the channel set, whatever that bit says.
  *
  * Returns EINVAL when the device lacks IEEE80211_C_WME, else 0 (also
  * for request types not handled here, which leave i_val untouched).
  */
 static int
 ieee80211_ioctl_getwmeparam(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_wme_state *wme = &ic->ic_wme;
 	struct wmeParams *bss_parms, *chan_parms, *wmep;
 	int ac;
 
 	if (!(ic->ic_caps & IEEE80211_C_WME))
 		return EINVAL;
 
 	ac = (ireq->i_len & IEEE80211_WMEPARAM_VAL);
 	if (ac >= WME_NUM_AC)
 		ac = WME_AC_BE;
 	bss_parms = &wme->wme_wmeBssChanParams.cap_wmeParams[ac];
 	chan_parms = &wme->wme_wmeChanParams.cap_wmeParams[ac];
 	wmep = (ireq->i_len & IEEE80211_WMEPARAM_BSS) ? bss_parms : chan_parms;
 
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_WME_CWMIN:		/* WME: CWmin */
 		ireq->i_val = wmep->wmep_logcwmin;
 		break;
 	case IEEE80211_IOC_WME_CWMAX:		/* WME: CWmax */
 		ireq->i_val = wmep->wmep_logcwmax;
 		break;
 	case IEEE80211_IOC_WME_AIFS:		/* WME: AIFS */
 		ireq->i_val = wmep->wmep_aifsn;
 		break;
 	case IEEE80211_IOC_WME_TXOPLIMIT:	/* WME: txops limit */
 		ireq->i_val = wmep->wmep_txopLimit;
 		break;
 	case IEEE80211_IOC_WME_ACM:		/* WME: ACM (bss only) */
 		ireq->i_val = bss_parms->wmep_acm;
 		break;
 	case IEEE80211_IOC_WME_ACKPOLICY:	/* WME: ACK policy (!bss only)*/
 		ireq->i_val = !chan_parms->wmep_noackPolicy;
 		break;
 	}
 	return 0;
 }
 
 /*
  * Hand the request to the vap's ACL module, if one is attached.
  * Returns EINVAL when the vap has no ACL module, otherwise whatever the
  * module's iac_getioctl handler returns.
  */
 static int
 ieee80211_ioctl_getmaccmd(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	const struct ieee80211_aclator *acl = vap->iv_acl;
 
 	if (acl == NULL)
 		return (EINVAL);
 	return (acl->iac_getioctl(vap, ireq));
 }
 
 /*
  * Copy the current channel description out to the caller.  The request
  * length must be exactly sizeof(struct ieee80211_channel), otherwise
  * EINVAL is returned.
  */
 static int
 ieee80211_ioctl_getcurchan(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_channel *chan;
 	int use_bss_chan;
 
 	if (ireq->i_len != sizeof(struct ieee80211_channel))
 		return EINVAL;
 	/*
 	 * vap's may have different operating channels when HT is
 	 * in use.  When in RUN state report the vap-specific channel.
 	 * Otherwise return curchan.
 	 */
 	use_bss_chan = (vap->iv_state == IEEE80211_S_RUN ||
 	    vap->iv_state == IEEE80211_S_SLEEP);
 	chan = use_bss_chan ? vap->iv_bss->ni_chan : ic->ic_curchan;
 	return copyout(chan, ireq->i_data, sizeof(*chan));
 }
 
 /*
  * Copy application-supplied ie data out to the caller.  Returns EINVAL
  * when no such data is set.  A request longer than the stored data is
  * shortened to match, and i_len reports the number of bytes copied.
  */
 static int
 getappie(const struct ieee80211_appie *aie, struct ieee80211req *ireq)
 {
 	if (aie == NULL)
 		return EINVAL;
 	/* NB: truncate, caller can check length */
 	if (aie->ie_len < ireq->i_len)
 		ireq->i_len = aie->ie_len;
 	return copyout(aie->ie_data, ireq->i_data, ireq->i_len);
 }
 
 /*
  * Return the application ie data registered for one kind of management
  * frame.  The low byte of i_val is treated as a frame-control byte: its
  * type must be management and its subtype selects which of the vap's
  * iv_appie_* slots is returned.  Anything else yields EINVAL.
  */
 static int
 ieee80211_ioctl_getappie(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	const struct ieee80211_appie *aie;
 	const uint8_t fc0 = ireq->i_val & 0xff;
 
 	if ((fc0 & IEEE80211_FC0_TYPE_MASK) != IEEE80211_FC0_TYPE_MGT)
 		return EINVAL;
 	/* NB: could check iv_opmode and reject but hardly worth the effort */
 	switch (fc0 & IEEE80211_FC0_SUBTYPE_MASK) {
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 		aie = vap->iv_appie_beacon;
 		break;
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 		aie = vap->iv_appie_proberesp;
 		break;
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 		aie = vap->iv_appie_assocresp;
 		break;
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 		aie = vap->iv_appie_probereq;
 		break;
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 		aie = vap->iv_appie_assocreq;
 		break;
 	case IEEE80211_FC0_SUBTYPE_BEACON|IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 		/* the combined beacon+probe-response value maps to the wpa slot */
 		aie = vap->iv_appie_wpa;
 		break;
 	default:
 		return EINVAL;
 	}
 	return getappie(aie, ireq);
 }
 
 /*
  * Copy ic_regdomain out to the caller.  The request length must match
  * the object's size exactly, otherwise EINVAL is returned.
  */
 static int
 ieee80211_ioctl_getregdomain(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	const size_t want = sizeof(ic->ic_regdomain);
 
 	if (ireq->i_len != want)
 		return EINVAL;
 	return copyout(&ic->ic_regdomain, ireq->i_data, want);
 }
 
 /*
  * Copy the vap's roaming parameters out to the caller, limited to the
  * smaller of the request length and the size of iv_roamparms.
  */
 static int
 ieee80211_ioctl_getroam(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	size_t nbytes;
 
 	/* NB: accept short requests for backwards compat */
 	if (ireq->i_len > sizeof(vap->iv_roamparms))
 		nbytes = sizeof(vap->iv_roamparms);
 	else
 		nbytes = ireq->i_len;
 	return copyout(vap->iv_roamparms, ireq->i_data, nbytes);
 }
 
 /*
  * Copy the vap's transmit parameters out to the caller, limited to the
  * smaller of the request length and the size of iv_txparms.
  */
 static int
 ieee80211_ioctl_gettxparams(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	size_t nbytes;
 
 	/* NB: accept short requests for backwards compat */
 	if (ireq->i_len > sizeof(vap->iv_txparms))
 		nbytes = sizeof(vap->iv_txparms);
 	else
 		nbytes = ireq->i_len;
 	return copyout(vap->iv_txparms, ireq->i_data, nbytes);
 }
 
 /*
  * Return the device capabilities and the channel list obtained from
  * the driver's ic_getradiocaps method.
  *
  * The user buffer is a struct ieee80211_devcaps_req followed by room
  * for additional channels; the number of channels requested from the
  * driver is derived from i_len and capped at 2048.
  *
  * Returns EINVAL when the buffer cannot hold even the base structure,
  * ENOMEM, or the copyout result.
  */
 static int
 ieee80211_ioctl_getdevcaps(struct ieee80211com *ic,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211_devcaps_req *dc;
 	struct ieee80211req_chaninfo *ci;
 	int maxchans, error;
 
 	/*
 	 * Reject short requests before doing any arithmetic on the
 	 * length: the subtraction below is unsigned and would wrap for
 	 * i_len < sizeof(*dc), yielding a bogus channel count and a
 	 * copyout larger than the caller's buffer.
 	 * NB: this also guarantees ic_nchans is accessible.
 	 */
 	if (ireq->i_len < sizeof(struct ieee80211_devcaps_req))
 		return EINVAL;
 	/* the base structure already has room for one channel */
 	maxchans = 1 + ((ireq->i_len - sizeof(struct ieee80211_devcaps_req)) /
 	    sizeof(struct ieee80211_channel));
 	/* constrain max request size, 2K channels is ~24Kbytes */
 	if (maxchans > 2048)
 		maxchans = 2048;
 	dc = (struct ieee80211_devcaps_req *)
 	    IEEE80211_MALLOC(IEEE80211_DEVCAPS_SIZE(maxchans), M_TEMP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (dc == NULL)
 		return ENOMEM;
 	dc->dc_drivercaps = ic->ic_caps;
 	dc->dc_cryptocaps = ic->ic_cryptocaps;
 	dc->dc_htcaps = ic->ic_htcaps;
 	dc->dc_vhtcaps = ic->ic_vhtcaps;
 	ci = &dc->dc_chaninfo;
 	ic->ic_getradiocaps(ic, maxchans, &ci->ic_nchans, ci->ic_chans);
 	KASSERT(ci->ic_nchans <= maxchans,
 	    ("nchans %d maxchans %d", ci->ic_nchans, maxchans));
 	ieee80211_sort_channels(ci->ic_chans, ci->ic_nchans);
 	error = copyout(dc, ireq->i_data, IEEE80211_DEVCAPS_SPACE(dc));
 	IEEE80211_FREE(dc, M_TEMP);
 	return error;
 }
 
 static int
 ieee80211_ioctl_getstavlan(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_node *ni;
 	struct ieee80211req_sta_vlan vlan;
 	int error;
 
 	if (ireq->i_len != sizeof(vlan))
 		return EINVAL;
 	error = copyin(ireq->i_data, &vlan, sizeof(vlan));
 	if (error != 0)
 		return error;
 	if (!IEEE80211_ADDR_EQ(vlan.sv_macaddr, zerobssid)) {
 		ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap,
 		    vlan.sv_macaddr);
 		if (ni == NULL)
 			return ENOENT;
 	} else
 		ni = ieee80211_ref_node(vap->iv_bss);
 	vlan.sv_vlan = ni->ni_vlan;
 	error = copyout(&vlan, ireq->i_data, sizeof(vlan));
 	ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * Placeholder get handler.  Registering it guarantees the get-handler
  * linker set always has at least one member; it never claims a request
  * and always answers ENOSYS.
  */
 static int
 dummy_ioctl_get(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	return (ENOSYS);
 }
 IEEE80211_IOCTL_GET(dummy, dummy_ioctl_get);
 
 /*
  * Fallback for request types with no explicit case: offer the request
  * to each handler in the ieee80211_ioctl_getset linker set and return
  * the first result that is not ENOSYS.  Returns EINVAL when every
  * handler answers ENOSYS.
  */
 static int
 ieee80211_ioctl_getdefault(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	ieee80211_ioctl_getfunc * const *handler;
 
 	SET_FOREACH(handler, ieee80211_ioctl_getset) {
 		int rc = (*handler)(vap, ireq);
 
 		/* ENOSYS means the handler did not recognise the request */
 		if (rc == ENOSYS)
 			continue;
 		return rc;
 	}
 	return EINVAL;
 }
 
 /*
  * Service a "get" request by dispatching on ireq->i_type.
  *
  * Scalar results are returned in ireq->i_val.  Variable-sized results
  * are copied out to ireq->i_data, either directly here or by one of the
  * ieee80211_ioctl_get*() helpers; some cases also update ireq->i_len
  * with the number of bytes produced.  Request types without an explicit
  * case are passed to ieee80211_ioctl_getdefault().
  *
  * cmd is only forwarded to the key-related privilege check and to
  * ieee80211_ioctl_getkey().
  *
  * Returns 0 on success or an errno value.
  */
 static int
 ieee80211_ioctl_get80211(struct ieee80211vap *vap, u_long cmd,
     struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	u_int kid, len;
 	uint8_t tmpkey[IEEE80211_KEYBUF_SIZE];
 	char tmpssid[IEEE80211_NWID_LEN];
 	int error = 0;
 
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_IC_NAME:
 		/* length includes the terminating NUL */
 		len = strlen(ic->ic_name) + 1;
 		if (len > ireq->i_len)
 			return (EINVAL);
 		ireq->i_len = len;
 		error = copyout(ic->ic_name, ireq->i_data, ireq->i_len);
 		break;
 	case IEEE80211_IOC_SSID:
 		/*
 		 * In INIT and SCAN state report the first desired ssid;
 		 * in every other state report the bss node's essid.
 		 */
 		switch (vap->iv_state) {
 		case IEEE80211_S_INIT:
 		case IEEE80211_S_SCAN:
 			ireq->i_len = vap->iv_des_ssid[0].len;
 			memcpy(tmpssid, vap->iv_des_ssid[0].ssid, ireq->i_len);
 			break;
 		default:
 			ireq->i_len = vap->iv_bss->ni_esslen;
 			memcpy(tmpssid, vap->iv_bss->ni_essid, ireq->i_len);
 			break;
 		}
 		error = copyout(tmpssid, ireq->i_data, ireq->i_len);
 		break;
 	case IEEE80211_IOC_NUMSSIDS:
 		ireq->i_val = 1;
 		break;
 	case IEEE80211_IOC_WEP:
 		if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0)
 			ireq->i_val = IEEE80211_WEP_OFF;
 		else if (vap->iv_flags & IEEE80211_F_DROPUNENC)
 			ireq->i_val = IEEE80211_WEP_ON;
 		else
 			ireq->i_val = IEEE80211_WEP_MIXED;
 		break;
 	case IEEE80211_IOC_WEPKEY:
 		/* i_val selects the key slot on input */
 		kid = (u_int) ireq->i_val;
 		if (kid >= IEEE80211_WEP_NKID)
 			return EINVAL;
 		len = (u_int) vap->iv_nw_keys[kid].wk_keylen;
 		/* NB: only root can read WEP keys */
 		if (ieee80211_priv_check_vap_getkey(cmd, vap, NULL) == 0) {
 			bcopy(vap->iv_nw_keys[kid].wk_key, tmpkey, len);
 		} else {
 			/* callers failing the check get zeros of the same length */
 			bzero(tmpkey, len);
 		}
 		ireq->i_len = len;
 		error = copyout(tmpkey, ireq->i_data, len);
 		break;
 	case IEEE80211_IOC_NUMWEPKEYS:
 		ireq->i_val = IEEE80211_WEP_NKID;
 		break;
 	case IEEE80211_IOC_WEPTXKEY:
 		ireq->i_val = vap->iv_def_txkey;
 		break;
 	case IEEE80211_IOC_AUTHMODE:
 		if (vap->iv_flags & IEEE80211_F_WPA)
 			ireq->i_val = IEEE80211_AUTH_WPA;
 		else
 			ireq->i_val = vap->iv_bss->ni_authmode;
 		break;
 	case IEEE80211_IOC_CHANNEL:
 		ireq->i_val = ieee80211_chan2ieee(ic, ic->ic_curchan);
 		break;
 	case IEEE80211_IOC_POWERSAVE:
 		if (vap->iv_flags & IEEE80211_F_PMGTON)
 			ireq->i_val = IEEE80211_POWERSAVE_ON;
 		else
 			ireq->i_val = IEEE80211_POWERSAVE_OFF;
 		break;
 	case IEEE80211_IOC_POWERSAVESLEEP:
 		ireq->i_val = ic->ic_lintval;
 		break;
 	case IEEE80211_IOC_RTSTHRESHOLD:
 		ireq->i_val = vap->iv_rtsthreshold;
 		break;
 	case IEEE80211_IOC_PROTMODE:
 		ireq->i_val = vap->iv_protmode;
 		break;
 	case IEEE80211_IOC_TXPOWER:
 		/*
 		 * Tx power limit is the min of max regulatory
 		 * power, any user-set limit, and the max the
 		 * radio can do.
 		 *
 		 * TODO: methodize this
 		 */
 		ireq->i_val = 2*ic->ic_curchan->ic_maxregpower;
 		if (ireq->i_val > ic->ic_txpowlimit)
 			ireq->i_val = ic->ic_txpowlimit;
 		if (ireq->i_val > ic->ic_curchan->ic_maxpower)
 			ireq->i_val = ic->ic_curchan->ic_maxpower;
 		break;
 	case IEEE80211_IOC_WPA:
 		/* encode as a bitmask: bit 0 = WPA1, bit 1 = WPA2 */
 		switch (vap->iv_flags & IEEE80211_F_WPA) {
 		case IEEE80211_F_WPA1:
 			ireq->i_val = 1;
 			break;
 		case IEEE80211_F_WPA2:
 			ireq->i_val = 2;
 			break;
 		case IEEE80211_F_WPA1 | IEEE80211_F_WPA2:
 			ireq->i_val = 3;
 			break;
 		default:
 			ireq->i_val = 0;
 			break;
 		}
 		break;
 	case IEEE80211_IOC_CHANLIST:
 		error = ieee80211_ioctl_getchanlist(vap, ireq);
 		break;
 	case IEEE80211_IOC_ROAMING:
 		ireq->i_val = vap->iv_roaming;
 		break;
 	case IEEE80211_IOC_PRIVACY:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_PRIVACY) != 0;
 		break;
 	case IEEE80211_IOC_DROPUNENCRYPTED:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_DROPUNENC) != 0;
 		break;
 	case IEEE80211_IOC_COUNTERMEASURES:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_COUNTERM) != 0;
 		break;
 	case IEEE80211_IOC_WME:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_WME) != 0;
 		break;
 	case IEEE80211_IOC_HIDESSID:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_HIDESSID) != 0;
 		break;
 	case IEEE80211_IOC_APBRIDGE:
 		/* NB: inverted sense, the flag records bridging being off */
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_NOBRIDGE) == 0;
 		break;
 	case IEEE80211_IOC_WPAKEY:
 		error = ieee80211_ioctl_getkey(cmd, vap, ireq);
 		break;
 	case IEEE80211_IOC_CHANINFO:
 		error = ieee80211_ioctl_getchaninfo(vap, ireq);
 		break;
 	case IEEE80211_IOC_BSSID:
 		if (ireq->i_len != IEEE80211_ADDR_LEN)
 			return EINVAL;
 		/*
 		 * In RUN or SLEEP state report the bss node's address
 		 * (ni_macaddr for WDS vaps, ni_bssid otherwise); in any
 		 * other state report the desired bssid.
 		 */
 		if (vap->iv_state == IEEE80211_S_RUN || vap->iv_state == IEEE80211_S_SLEEP) {
 			error = copyout(vap->iv_opmode == IEEE80211_M_WDS ?
 			    vap->iv_bss->ni_macaddr : vap->iv_bss->ni_bssid,
 			    ireq->i_data, ireq->i_len);
 		} else
 			error = copyout(vap->iv_des_bssid, ireq->i_data,
 			    ireq->i_len);
 		break;
 	case IEEE80211_IOC_WPAIE:
 	case IEEE80211_IOC_WPAIE2:
 		error = ieee80211_ioctl_getwpaie(vap, ireq, ireq->i_type);
 		break;
 	case IEEE80211_IOC_SCAN_RESULTS:
 		error = ieee80211_ioctl_getscanresults(vap, ireq);
 		break;
 	case IEEE80211_IOC_STA_STATS:
 		error = ieee80211_ioctl_getstastats(vap, ireq);
 		break;
 	case IEEE80211_IOC_TXPOWMAX:
 		ireq->i_val = vap->iv_bss->ni_txpower;
 		break;
 	case IEEE80211_IOC_STA_TXPOW:
 		error = ieee80211_ioctl_getstatxpow(vap, ireq);
 		break;
 	case IEEE80211_IOC_STA_INFO:
 		error = ieee80211_ioctl_getstainfo(vap, ireq);
 		break;
 	case IEEE80211_IOC_WME_CWMIN:		/* WME: CWmin */
 	case IEEE80211_IOC_WME_CWMAX:		/* WME: CWmax */
 	case IEEE80211_IOC_WME_AIFS:		/* WME: AIFS */
 	case IEEE80211_IOC_WME_TXOPLIMIT:	/* WME: txops limit */
 	case IEEE80211_IOC_WME_ACM:		/* WME: ACM (bss only) */
 	case IEEE80211_IOC_WME_ACKPOLICY:	/* WME: ACK policy (!bss only) */
 		error = ieee80211_ioctl_getwmeparam(vap, ireq);
 		break;
 	case IEEE80211_IOC_DTIM_PERIOD:
 		ireq->i_val = vap->iv_dtim_period;
 		break;
 	case IEEE80211_IOC_BEACON_INTERVAL:
 		/* NB: get from ic_bss for station mode */
 		ireq->i_val = vap->iv_bss->ni_intval;
 		break;
 	case IEEE80211_IOC_PUREG:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_PUREG) != 0;
 		break;
 	case IEEE80211_IOC_QUIET:
 		ireq->i_val = vap->iv_quiet;
 		break;
 	case IEEE80211_IOC_QUIET_COUNT:
 		ireq->i_val = vap->iv_quiet_count;
 		break;
 	case IEEE80211_IOC_QUIET_PERIOD:
 		ireq->i_val = vap->iv_quiet_period;
 		break;
 	case IEEE80211_IOC_QUIET_DUR:
 		ireq->i_val = vap->iv_quiet_duration;
 		break;
 	case IEEE80211_IOC_QUIET_OFFSET:
 		ireq->i_val = vap->iv_quiet_offset;
 		break;
 	case IEEE80211_IOC_BGSCAN:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_BGSCAN) != 0;
 		break;
 	case IEEE80211_IOC_BGSCAN_IDLE:
 		/*
 		 * NOTE(review): the neighbouring cases convert to seconds
 		 * by dividing by hz, which suggests these fields are kept
 		 * in ticks.  If iv_bgscanidle is in ticks too, a conversion
 		 * to ms would be *1000/hz rather than *hz/1000; the two
 		 * agree only when hz is 1000.  Confirm against the set path.
 		 */
 		ireq->i_val = vap->iv_bgscanidle*hz/1000;	/* ms */
 		break;
 	case IEEE80211_IOC_BGSCAN_INTERVAL:
 		ireq->i_val = vap->iv_bgscanintvl/hz;		/* seconds */
 		break;
 	case IEEE80211_IOC_SCANVALID:
 		ireq->i_val = vap->iv_scanvalid/hz;		/* seconds */
 		break;
 	case IEEE80211_IOC_FRAGTHRESHOLD:
 		ireq->i_val = vap->iv_fragthreshold;
 		break;
 	case IEEE80211_IOC_MACCMD:
 		error = ieee80211_ioctl_getmaccmd(vap, ireq);
 		break;
 	case IEEE80211_IOC_BURST:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_BURST) != 0;
 		break;
 	case IEEE80211_IOC_BMISSTHRESHOLD:
 		ireq->i_val = vap->iv_bmissthreshold;
 		break;
 	case IEEE80211_IOC_CURCHAN:
 		error = ieee80211_ioctl_getcurchan(vap, ireq);
 		break;
 	case IEEE80211_IOC_SHORTGI:
 		ireq->i_val = 0;
 		if (vap->iv_flags_ht & IEEE80211_FHT_SHORTGI20)
 			ireq->i_val |= IEEE80211_HTCAP_SHORTGI20;
 		if (vap->iv_flags_ht & IEEE80211_FHT_SHORTGI40)
 			ireq->i_val |= IEEE80211_HTCAP_SHORTGI40;
 		break;
 	case IEEE80211_IOC_AMPDU:
 		/* bit 0 = tx enabled, bit 1 = rx enabled */
 		ireq->i_val = 0;
 		if (vap->iv_flags_ht & IEEE80211_FHT_AMPDU_TX)
 			ireq->i_val |= 1;
 		if (vap->iv_flags_ht & IEEE80211_FHT_AMPDU_RX)
 			ireq->i_val |= 2;
 		break;
 	case IEEE80211_IOC_AMPDU_LIMIT:
 		/* XXX TODO: make this a per-node thing; and leave this as global */
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP)
 			ireq->i_val = vap->iv_ampdu_rxmax;
 		else if (vap->iv_state == IEEE80211_S_RUN || vap->iv_state == IEEE80211_S_SLEEP)
 			/*
 			 * XXX TODO: this isn't completely correct, as we've
 			 * negotiated the higher of the two.
 			 */
 			ireq->i_val = _IEEE80211_MASKSHIFT( vap->iv_bss->ni_htparam,
 			    IEEE80211_HTCAP_MAXRXAMPDU);
 		else
 			ireq->i_val = vap->iv_ampdu_limit;
 		break;
 	case IEEE80211_IOC_AMPDU_DENSITY:
 		/* XXX TODO: make this a per-node thing; and leave this as global */
 		if (vap->iv_opmode == IEEE80211_M_STA &&
 		    (vap->iv_state == IEEE80211_S_RUN || vap->iv_state == IEEE80211_S_SLEEP))
 			/*
 			 * XXX TODO: this isn't completely correct, as we've
 			 * negotiated the higher of the two.
 			 */
 			ireq->i_val = _IEEE80211_MASKSHIFT(vap->iv_bss->ni_htparam,
 			    IEEE80211_HTCAP_MPDUDENSITY);
 		else
 			ireq->i_val = vap->iv_ampdu_density;
 		break;
 	case IEEE80211_IOC_AMSDU:
 		/* bit 0 = tx enabled, bit 1 = rx enabled */
 		ireq->i_val = 0;
 		if (vap->iv_flags_ht & IEEE80211_FHT_AMSDU_TX)
 			ireq->i_val |= 1;
 		if (vap->iv_flags_ht & IEEE80211_FHT_AMSDU_RX)
 			ireq->i_val |= 2;
 		break;
 	case IEEE80211_IOC_AMSDU_LIMIT:
 		ireq->i_val = vap->iv_amsdu_limit;	/* XXX truncation? */
 		break;
 	case IEEE80211_IOC_PUREN:
 		ireq->i_val = (vap->iv_flags_ht & IEEE80211_FHT_PUREN) != 0;
 		break;
 	case IEEE80211_IOC_DOTH:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_DOTH) != 0;
 		break;
 	case IEEE80211_IOC_REGDOMAIN:
 		error = ieee80211_ioctl_getregdomain(vap, ireq);
 		break;
 	case IEEE80211_IOC_ROAM:
 		error = ieee80211_ioctl_getroam(vap, ireq);
 		break;
 	case IEEE80211_IOC_TXPARAMS:
 		error = ieee80211_ioctl_gettxparams(vap, ireq);
 		break;
 	case IEEE80211_IOC_HTCOMPAT:
 		ireq->i_val = (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) != 0;
 		break;
 	case IEEE80211_IOC_DWDS:
 		ireq->i_val = (vap->iv_flags & IEEE80211_F_DWDS) != 0;
 		break;
 	case IEEE80211_IOC_INACTIVITY:
 		ireq->i_val = (vap->iv_flags_ext & IEEE80211_FEXT_INACT) != 0;
 		break;
 	case IEEE80211_IOC_APPIE:
 		error = ieee80211_ioctl_getappie(vap, ireq);
 		break;
 	case IEEE80211_IOC_WPS:
 		ireq->i_val = (vap->iv_flags_ext & IEEE80211_FEXT_WPS) != 0;
 		break;
 	case IEEE80211_IOC_TSN:
 		ireq->i_val = (vap->iv_flags_ext & IEEE80211_FEXT_TSN) != 0;
 		break;
 	case IEEE80211_IOC_DFS:
 		ireq->i_val = (vap->iv_flags_ext & IEEE80211_FEXT_DFS) != 0;
 		break;
 	case IEEE80211_IOC_DOTD:
 		ireq->i_val = (vap->iv_flags_ext & IEEE80211_FEXT_DOTD) != 0;
 		break;
 	case IEEE80211_IOC_DEVCAPS:
 		error = ieee80211_ioctl_getdevcaps(ic, ireq);
 		break;
 	case IEEE80211_IOC_HTPROTMODE:
 		ireq->i_val = vap->iv_htprotmode;
 		break;
 	case IEEE80211_IOC_HTCONF:
 		/* bit 0 = HT enabled, bit 1 = HT40 in use (only with bit 0) */
 		if (vap->iv_flags_ht & IEEE80211_FHT_HT) {
 			ireq->i_val = 1;
 			if (vap->iv_flags_ht & IEEE80211_FHT_USEHT40)
 				ireq->i_val |= 2;
 		} else
 			ireq->i_val = 0;
 		break;
 	case IEEE80211_IOC_STA_VLAN:
 		error = ieee80211_ioctl_getstavlan(vap, ireq);
 		break;
 	case IEEE80211_IOC_SMPS:
 		/*
 		 * A station vap in RUN or SLEEP state reports what is
 		 * recorded on the bss node; otherwise report the vap's
 		 * own HT capability bits.
 		 */
 		if (vap->iv_opmode == IEEE80211_M_STA &&
 		    (vap->iv_state == IEEE80211_S_RUN || vap->iv_state == IEEE80211_S_SLEEP)) {
 			if (vap->iv_bss->ni_flags & IEEE80211_NODE_MIMO_RTS)
 				ireq->i_val = IEEE80211_HTCAP_SMPS_DYNAMIC;
 			else if (vap->iv_bss->ni_flags & IEEE80211_NODE_MIMO_PS)
 				ireq->i_val = IEEE80211_HTCAP_SMPS_ENA;
 			else
 				ireq->i_val = IEEE80211_HTCAP_SMPS_OFF;
 		} else
 			ireq->i_val = vap->iv_htcaps & IEEE80211_HTCAP_SMPS;
 		break;
 	case IEEE80211_IOC_RIFS:
 		if (vap->iv_opmode == IEEE80211_M_STA &&
 		    (vap->iv_state == IEEE80211_S_RUN || vap->iv_state == IEEE80211_S_SLEEP))
 			ireq->i_val =
 			    (vap->iv_bss->ni_flags & IEEE80211_NODE_RIFS) != 0;
 		else
 			ireq->i_val =
 			    (vap->iv_flags_ht & IEEE80211_FHT_RIFS) != 0;
 		break;
 	case IEEE80211_IOC_STBC:
 		/* bit 0 = tx enabled, bit 1 = rx enabled */
 		ireq->i_val = 0;
 		if (vap->iv_flags_ht & IEEE80211_FHT_STBC_TX)
 			ireq->i_val |= 1;
 		if (vap->iv_flags_ht & IEEE80211_FHT_STBC_RX)
 			ireq->i_val |= 2;
 		break;
 	case IEEE80211_IOC_LDPC:
 		/* bit 0 = tx enabled, bit 1 = rx enabled */
 		ireq->i_val = 0;
 		if (vap->iv_flags_ht & IEEE80211_FHT_LDPC_TX)
 			ireq->i_val |= 1;
 		if (vap->iv_flags_ht & IEEE80211_FHT_LDPC_RX)
 			ireq->i_val |= 2;
 		break;
 	case IEEE80211_IOC_UAPSD:
 		ireq->i_val = 0;
 		if (vap->iv_flags_ext & IEEE80211_FEXT_UAPSD)
 			ireq->i_val = 1;
 		break;
 	case IEEE80211_IOC_VHTCONF:
 		ireq->i_val = vap->iv_flags_vht & IEEE80211_FVHT_MASK;
 		break;
 	default:
 		/* not handled here; try the handlers in the linker set */
 		error = ieee80211_ioctl_getdefault(vap, ireq);
 		break;
 	}
 	return error;
 }
 
 /*
  * Install a key described by a caller-supplied struct ieee80211req_key.
  *
  * A key index of IEEE80211_KEYIX_NONE selects a station's unicast key:
  * on a STA vap the given address must equal the bss node's bssid, on any
  * other vap the station is looked up in the node table.  Any other index
  * must be below IEEE80211_WEP_NKID and selects one of the vap's global
  * key slots.
  *
  * Returns EINVAL, E2BIG, EADDRNOTAVAIL or ENOENT for bad requests, ENXIO
  * when ieee80211_crypto_newkey() fails, EIO when
  * ieee80211_crypto_setkey() fails, or the copyin() error.
  */
 static int
 ieee80211_ioctl_setkey(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211req_key ik;
 	struct ieee80211_node *ni = NULL;
 	struct ieee80211_key *wk;
 	uint16_t kid;
 	int error, tid;
 
 	if (ireq->i_len != sizeof(ik))
 		return EINVAL;
 	error = copyin(ireq->i_data, &ik, sizeof(ik));
 	if (error)
 		return error;
 	/* NB: cipher support is verified by ieee80211_crypt_newkey */
 	/* NB: this also checks ik->ik_keylen > sizeof(wk->wk_key) */
 	if (ik.ik_keylen > sizeof(ik.ik_keydata))
 		return E2BIG;
 	kid = ik.ik_keyix;
 	if (kid != IEEE80211_KEYIX_NONE) {
 		/* one of the vap's global key slots; no node involved */
 		if (kid >= IEEE80211_WEP_NKID)
 			return EINVAL;
 		wk = &vap->iv_nw_keys[kid];
 		/*
 		 * Global slots start off w/o any assigned key index.
 		 * Force one here for consistency with IEEE80211_IOC_WEPKEY.
 		 */
 		if (wk->wk_keyix == IEEE80211_KEYIX_NONE)
 			wk->wk_keyix = kid;
 	} else {
 		/* XXX unicast keys currently must be tx/rx */
 		if (ik.ik_flags != (IEEE80211_KEY_XMIT | IEEE80211_KEY_RECV))
 			return EINVAL;
 		if (vap->iv_opmode != IEEE80211_M_STA) {
 			ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap,
 				ik.ik_macaddr);
 			if (ni == NULL)
 				return ENOENT;
 		} else {
 			ni = ieee80211_ref_node(vap->iv_bss);
 			if (!IEEE80211_ADDR_EQ(ik.ik_macaddr, ni->ni_bssid)) {
 				ieee80211_free_node(ni);
 				return EADDRNOTAVAIL;
 			}
 		}
 		wk = &ni->ni_ucastkey;
 	}
 
 	ieee80211_key_update_begin(vap);
 	if (!ieee80211_crypto_newkey(vap, ik.ik_type, ik.ik_flags, wk)) {
 		error = ENXIO;
 	} else {
 		error = 0;
 		/* NB: MIC presence is implied by cipher type */
 		wk->wk_keylen = (ik.ik_keylen > IEEE80211_KEYBUF_SIZE) ?
 		    IEEE80211_KEYBUF_SIZE : ik.ik_keylen;
 		for (tid = 0; tid < IEEE80211_TID_SIZE; tid++)
 			wk->wk_keyrsc[tid] = ik.ik_keyrsc;
 		wk->wk_keytsc = 0;			/* new key, reset */
 		memset(wk->wk_key, 0, sizeof(wk->wk_key));
 		memcpy(wk->wk_key, ik.ik_keydata, ik.ik_keylen);
 		if (ni != NULL)
 			IEEE80211_ADDR_COPY(wk->wk_macaddr, ni->ni_macaddr);
 		else
 			IEEE80211_ADDR_COPY(wk->wk_macaddr, ik.ik_macaddr);
 		if (!ieee80211_crypto_setkey(vap, wk)) {
 			error = EIO;
 		} else if (ik.ik_flags & IEEE80211_KEY_DEFAULT) {
 			/*
 			 * Tell the driver this is the default transmit
 			 * key.  Ideally the key update itself would carry
 			 * a "default key" flag, but the ioctl -> key
 			 * interface does not currently work that way.
 			 */
 			ieee80211_crypto_set_deftxkey(vap, kid);
 		}
 	}
 	ieee80211_key_update_end(vap);
 	if (ni != NULL)
 		ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * Delete a key named by a caller-supplied struct ieee80211req_del_key.
  *
  * A key index equal to IEEE80211_KEYIX_NONE (truncated to uint8_t)
  * removes a station's unicast key; any other index must be below
  * IEEE80211_WEP_NKID and removes that global key slot.  The results of
  * the underlying delete calls are not propagated.
  */
 static int
 ieee80211_ioctl_delkey(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211req_del_key dk;
 	struct ieee80211_node *ni;
 	int error, kid;
 
 	if (ireq->i_len != sizeof(dk))
 		return EINVAL;
 	error = copyin(ireq->i_data, &dk, sizeof(dk));
 	if (error)
 		return error;
 	kid = dk.idk_keyix;
 	/* XXX uint8_t -> uint16_t */
 	if (dk.idk_keyix != (uint8_t) IEEE80211_KEYIX_NONE) {
 		/* global key slot */
 		if (kid >= IEEE80211_WEP_NKID)
 			return EINVAL;
 		/* XXX error return */
 		ieee80211_crypto_delkey(vap, &vap->iv_nw_keys[kid]);
 		return 0;
 	}
 
 	/* unicast key: locate the station it belongs to */
 	if (vap->iv_opmode != IEEE80211_M_STA) {
 		ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap,
 			dk.idk_macaddr);
 		if (ni == NULL)
 			return ENOENT;
 	} else {
 		ni = ieee80211_ref_node(vap->iv_bss);
 		if (!IEEE80211_ADDR_EQ(dk.idk_macaddr, ni->ni_bssid)) {
 			ieee80211_free_node(ni);
 			return EADDRNOTAVAIL;
 		}
 	}
 	/* XXX error return */
 	ieee80211_node_delucastkey(ni);
 	ieee80211_free_node(ni);
 	return 0;
 }
 
 /*
  * Argument bundle passed to domlme(), either directly or through
  * ieee80211_iterate_nodes().
  */
 struct mlmeop {
 	struct ieee80211vap *vap;	/* nodes on other vaps are skipped */
 	int	op;			/* IEEE80211_MLME_* operation */
 	int	reason;			/* passed along with the mgmt frame */
 };
 
 /*
  * Emit a debug note describing an MLME request for the given station.
  * Compiles to an empty function unless IEEE80211_DEBUG is defined.
  *
  * IEEE80211_MLME_AUTH has its own message; operations outside the
  * IEEE80211_MLME_ASSOC..IEEE80211_MLME_AUTH range are reported as
  * unknown; the rest are described via the table below, which is indexed
  * directly by the operation code.
  */
 static void
 mlmedebug(struct ieee80211vap *vap, const uint8_t mac[IEEE80211_ADDR_LEN],
 	int op, int reason)
 {
 #ifdef IEEE80211_DEBUG
 	static const struct {
 		int dbgmask;
 		const char *label;
 	} optab[] = {
 		{ 0, "op#0" },
 		{ IEEE80211_MSG_IOCTL | IEEE80211_MSG_STATE |
 		  IEEE80211_MSG_ASSOC, "assoc" },
 		{ IEEE80211_MSG_IOCTL | IEEE80211_MSG_STATE |
 		  IEEE80211_MSG_ASSOC, "disassoc" },
 		{ IEEE80211_MSG_IOCTL | IEEE80211_MSG_STATE |
 		  IEEE80211_MSG_AUTH, "deauth" },
 		{ IEEE80211_MSG_IOCTL | IEEE80211_MSG_STATE |
 		  IEEE80211_MSG_AUTH, "authorize" },
 		{ IEEE80211_MSG_IOCTL | IEEE80211_MSG_STATE |
 		  IEEE80211_MSG_AUTH, "unauthorize" },
 	};
 
 	if (op == IEEE80211_MLME_AUTH) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_IOCTL |
 		    IEEE80211_MSG_STATE | IEEE80211_MSG_AUTH, mac,
 		    "station authenticate %s via MLME (reason: %d (%s))",
 		    reason == IEEE80211_STATUS_SUCCESS ? "ACCEPT" : "REJECT",
 		    reason, ieee80211_reason_to_string(reason));
 		return;
 	}
 	if (op < IEEE80211_MLME_ASSOC || op > IEEE80211_MLME_AUTH) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ANY, mac,
 		    "unknown MLME request %d (reason: %d (%s))", op, reason,
 		    ieee80211_reason_to_string(reason));
 		return;
 	}
 	/* op is a valid table index from here on */
 	if (reason != IEEE80211_STATUS_SUCCESS) {
 		IEEE80211_NOTE_MAC(vap, optab[op].dbgmask, mac,
 		    "station %s via MLME (reason: %d (%s))", optab[op].label,
 		    reason, ieee80211_reason_to_string(reason));
 	} else {
 		IEEE80211_NOTE_MAC(vap, optab[op].dbgmask, mac,
 		    "station %s via MLME", optab[op].label);
 	}
 #endif /* IEEE80211_DEBUG */
 }
 
 /*
  * Apply a deauth/disassoc MLME operation to one station node.  arg is a
  * struct mlmeop.  Nodes belonging to a different vap, and nodes with no
  * association id, are left alone.  Otherwise a DEAUTH frame (for
  * IEEE80211_MLME_DEAUTH) or a DISASSOC frame (for anything else) is sent
  * with the requested reason and the node is made to leave.
  */
 static void
 domlme(void *arg, struct ieee80211_node *ni)
 {
 	struct mlmeop *mop = arg;
 	struct ieee80211vap *vap = ni->ni_vap;
 	int subtype;
 
 	if (vap != mop->vap)
 		return;
 	/*
 	 * NB: a zero ni_associd means the node has already been cleaned
 	 * up, so there is nothing to do (we safely hold a reference but
 	 * should otherwise not modify its state).
 	 */
 	if (ni->ni_associd == 0)
 		return;
 	mlmedebug(vap, ni->ni_macaddr, mop->op, mop->reason);
 	subtype = (mop->op == IEEE80211_MLME_DEAUTH) ?
 	    IEEE80211_FC0_SUBTYPE_DEAUTH : IEEE80211_FC0_SUBTYPE_DISASSOC;
 	IEEE80211_SEND_MGMT(ni, subtype, mop->reason);
 	ieee80211_node_leave(ni);
 }
 
 /*
  * Run domlme() on the station with the given address, or on every node
  * in the station table when the address is the interface's broadcast
  * address.  Returns ENOENT when a specific station is not found and 0
  * otherwise.
  */
 static int
 setmlme_dropsta(struct ieee80211vap *vap,
 	const uint8_t mac[IEEE80211_ADDR_LEN], struct mlmeop *mlmeop)
 {
 	struct ieee80211_node_table *nt = &vap->iv_ic->ic_sta;
 	struct ieee80211_node *ni;
 
 	/* NB: the broadcast address means do 'em all */
 	if (IEEE80211_ADDR_EQ(mac, vap->iv_ifp->if_broadcastaddr)) {
 		ieee80211_iterate_nodes(nt, domlme, mlmeop);
 		return 0;
 	}
 
 	IEEE80211_NODE_LOCK(nt);
 	ni = ieee80211_find_node_locked(nt, mac);
 	IEEE80211_NODE_UNLOCK(nt);
 	if (ni == NULL)
 		return ENOENT;
 	/*
 	 * The node update is deliberately done after dropping the node
 	 * table lock; doing it under the lock causes LORs with drivers
 	 * and their TX paths.
 	 */
 	domlme(mlmeop, ni);
 	ieee80211_free_node(ni);
 	return 0;
 }
 
 /*
  * Carry out an MLME operation on behalf of the ioctl path.
  *
  * DISASSOC/DEAUTH behave differently per operating mode: a STA vap is
  * moved to INIT state, a HOSTAP vap drops the named station (or all of
  * them, see setmlme_dropsta()), a WDS vap sends a deauth frame to its
  * bss node (DEAUTH only), and an MBSS vap makes the named node leave.
  * AUTHORIZE/UNAUTHORIZE (HOSTAP and WDS only) and AUTH (HOSTAP only)
  * act on the station found for mac on this vap.
  *
  * Returns 0 on success, EINVAL for an operation that is unknown or not
  * valid in the vap's operating mode, ENOENT when the station is not
  * found, or the result of setmlme_dropsta().
  */
 static int
 setmlme_common(struct ieee80211vap *vap, int op,
 	const uint8_t mac[IEEE80211_ADDR_LEN], int reason)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node_table *nt = &ic->ic_sta;
 	struct ieee80211_node *ni;
 	struct mlmeop mlmeop;
 	int error;
 
 	error = 0;
 	switch (op) {
 	case IEEE80211_MLME_DISASSOC:
 	case IEEE80211_MLME_DEAUTH:
 		switch (vap->iv_opmode) {
 		case IEEE80211_M_STA:
 			mlmedebug(vap, vap->iv_bss->ni_macaddr, op, reason);
 			/* XXX not quite right */
 			ieee80211_new_state(vap, IEEE80211_S_INIT, reason);
 			break;
 		case IEEE80211_M_HOSTAP:
 			mlmeop.vap = vap;
 			mlmeop.op = op;
 			mlmeop.reason = reason;
 			error = setmlme_dropsta(vap, mac, &mlmeop);
 			break;
 		case IEEE80211_M_WDS:
 			/* XXX user app should send raw frame? */
 			if (op != IEEE80211_MLME_DEAUTH) {
 				error = EINVAL;
 				break;
 			}
 #if 0
 			/* XXX accept any address, simplifies user code */
 			if (!IEEE80211_ADDR_EQ(mac, vap->iv_bss->ni_macaddr)) {
 				error = EINVAL;
 				break;
 			}
 #endif
 			mlmedebug(vap, vap->iv_bss->ni_macaddr, op, reason);
 			ni = ieee80211_ref_node(vap->iv_bss);
 			IEEE80211_SEND_MGMT(ni,
 			    IEEE80211_FC0_SUBTYPE_DEAUTH, reason);
 			ieee80211_free_node(ni);
 			break;
 		case IEEE80211_M_MBSS:
 			/* NB: this lookup is by address only, not by vap */
 			IEEE80211_NODE_LOCK(nt);
 			ni = ieee80211_find_node_locked(nt, mac);
 			/*
 			 * Don't do the node update inside the node
 			 * table lock.  This unfortunately causes LORs
 			 * with drivers and their TX paths.
 			 */
 			IEEE80211_NODE_UNLOCK(nt);
 			if (ni != NULL) {
 				ieee80211_node_leave(ni);
 				ieee80211_free_node(ni);
 			} else {
 				error = ENOENT;
 			}
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	case IEEE80211_MLME_AUTHORIZE:
 	case IEEE80211_MLME_UNAUTHORIZE:
 		if (vap->iv_opmode != IEEE80211_M_HOSTAP &&
 		    vap->iv_opmode != IEEE80211_M_WDS) {
 			error = EINVAL;
 			break;
 		}
 		IEEE80211_NODE_LOCK(nt);
 		ni = ieee80211_find_vap_node_locked(nt, vap, mac);
 		/*
 		 * Don't do the node update inside the node
 		 * table lock.  This unfortunately causes LORs
 		 * with drivers and their TX paths.
 		 */
 		IEEE80211_NODE_UNLOCK(nt);
 		if (ni != NULL) {
 			mlmedebug(vap, mac, op, reason);
 			if (op == IEEE80211_MLME_AUTHORIZE)
 				ieee80211_node_authorize(ni);
 			else
 				ieee80211_node_unauthorize(ni);
 			ieee80211_free_node(ni);
 		} else
 			error = ENOENT;
 		break;
 	case IEEE80211_MLME_AUTH:
 		if (vap->iv_opmode != IEEE80211_M_HOSTAP) {
 			error = EINVAL;
 			break;
 		}
 		IEEE80211_NODE_LOCK(nt);
 		ni = ieee80211_find_vap_node_locked(nt, vap, mac);
 		/*
 		 * Don't do the node update inside the node
 		 * table lock.  This unfortunately causes LORs
 		 * with drivers and their TX paths.
 		 */
 		IEEE80211_NODE_UNLOCK(nt);
 		if (ni != NULL) {
 			mlmedebug(vap, mac, op, reason);
 			/* here reason carries a status: success accepts the station */
 			if (reason == IEEE80211_STATUS_SUCCESS) {
 				IEEE80211_SEND_MGMT(ni,
 				    IEEE80211_FC0_SUBTYPE_AUTH, 2);
 				/*
 				 * For shared key auth, just continue the
 				 * exchange.  Otherwise when 802.1x is not in
 				 * use mark the port authorized at this point
 				 * so traffic can flow.
 				 */
 				if (ni->ni_authmode != IEEE80211_AUTH_8021X &&
 				    ni->ni_challenge == NULL)
 				      ieee80211_node_authorize(ni);
 			} else {
 				/* rejected: count it, send the error and drop the node */
 				vap->iv_stats.is_rx_acl++;
 				ieee80211_send_error(ni, ni->ni_macaddr,
 				    IEEE80211_FC0_SUBTYPE_AUTH, 2|(reason<<16));
 				ieee80211_node_leave(ni);
 			}
 			ieee80211_free_node(ni);
 		} else
 			error = ENOENT;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return error;
 }
 
 /*
  * Search key and result slot used by mlmelookup() while iterating over
  * scan entries.
  */
 struct scanlookup {
 	const uint8_t *mac;	/* address compared against se_macaddr */
 	int esslen;		/* ssid length to match; 0 skips the ssid check */
 	const uint8_t *essid;	/* ssid bytes, compared when esslen != 0 */
 	const struct ieee80211_scan_entry *se;	/* matching entry, if any */
 };
 
 /*
  * Scan iteration callback: match on mac address and, when an ssid
  * length was supplied, on the ssid as well.  arg is a struct scanlookup;
  * a matching entry is stored in its se member (a later match replaces
  * an earlier one).
  */
 static void
 mlmelookup(void *arg, const struct ieee80211_scan_entry *se)
 {
 	struct scanlookup *look = arg;
 	const int want = look->esslen;
 
 	if (!IEEE80211_ADDR_EQ(look->mac, se->se_macaddr))
 		return;
 	/* se_ssid[1] is compared as the length, se_ssid+2 as the ssid bytes */
 	if (want != 0 && (se->se_ssid[1] != want ||
 	    memcmp(look->essid, se->se_ssid+2, want) != 0))
 		return;
 	look->se = se;
 }
 
 static int
 setmlme_assoc_sta(struct ieee80211vap *vap,
 	const uint8_t mac[IEEE80211_ADDR_LEN], int ssid_len,
 	const uint8_t ssid[IEEE80211_NWID_LEN])
 {
 	struct scanlookup lookup;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_STA,
 	    ("expected opmode STA not %s",
 	    ieee80211_opmode_name[vap->iv_opmode]));
 
 	/* NB: this is racey if roaming is !manual */
 	lookup.se = NULL;
 	lookup.mac = mac;
 	lookup.esslen = ssid_len;
 	lookup.essid = ssid;
 	ieee80211_scan_iterate(vap, mlmelookup, &lookup);
 	if (lookup.se == NULL)
 		return ENOENT;
 	mlmedebug(vap, mac, IEEE80211_MLME_ASSOC, 0);
 	if (!ieee80211_sta_join(vap, lookup.se->se_chan, lookup.se))
 		return EIO;		/* XXX unique but could be better */
 	return 0;
 }
 
 /*
  * IBSS/AHDEMO associate request: record the given ssid as the vap's
  * single desired ssid and submit a scan request for it.  The mac
  * argument is not used.
  *
  * Returns EINVAL for an empty or over-long ssid, ENOMEM when the scan
  * request cannot be allocated, otherwise the ieee80211_scanreq() result.
  */
 static int
 setmlme_assoc_adhoc(struct ieee80211vap *vap,
 	const uint8_t mac[IEEE80211_ADDR_LEN], int ssid_len,
 	const uint8_t ssid[IEEE80211_NWID_LEN])
 {
 	struct ieee80211_scan_req *req;
 	int rc;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_IBSS ||
 	    vap->iv_opmode == IEEE80211_M_AHDEMO,
 	    ("expected opmode IBSS or AHDEMO not %s",
 	    ieee80211_opmode_name[vap->iv_opmode]));
 
 	if (ssid_len > IEEE80211_NWID_LEN || ssid_len == 0)
 		return EINVAL;
 
 	req = IEEE80211_MALLOC(sizeof(*req), M_TEMP,
 	     IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (req == NULL)
 		return ENOMEM;
 
 	/* NB: IEEE80211_IOC_SSID call missing for ap_scan=2. */
 	vap->iv_des_nssid = 1;
 	vap->iv_des_ssid[0].len = ssid_len;
 	memset(vap->iv_des_ssid[0].ssid, 0, IEEE80211_NWID_LEN);
 	memcpy(vap->iv_des_ssid[0].ssid, ssid, ssid_len);
 
 	/* scan request for exactly this ssid */
 	req->sr_nssid = 1;
 	req->sr_ssid[0].len = ssid_len;
 	memcpy(req->sr_ssid[0].ssid, ssid, ssid_len);
 	req->sr_duration = IEEE80211_IOC_SCAN_FOREVER;
 	req->sr_flags = IEEE80211_IOC_SCAN_ACTIVE | IEEE80211_IOC_SCAN_ONCE;
 
 	rc = ieee80211_scanreq(vap, req);
 
 	IEEE80211_FREE(req, M_TEMP);
 	return rc;
 }
 
 /*
  * Entry point for MLME requests: copy in the caller's
  * struct ieee80211req_mlme and route it.  ASSOC on a STA vap goes to
  * setmlme_assoc_sta() using the vap's first desired ssid; ASSOC on an
  * IBSS or AHDEMO vap goes to setmlme_assoc_adhoc() using the ssid from
  * the request; everything else goes to setmlme_common().
  */
 static int
 ieee80211_ioctl_setmlme(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211req_mlme mlme;
 	int error;
 
 	if (ireq->i_len != sizeof(mlme))
 		return EINVAL;
 	error = copyin(ireq->i_data, &mlme, sizeof(mlme));
 	if (error)
 		return error;
 
 	if (mlme.im_op == IEEE80211_MLME_ASSOC) {
 		switch (vap->iv_opmode) {
 		case IEEE80211_M_STA:
 			return setmlme_assoc_sta(vap, mlme.im_macaddr,
 			    vap->iv_des_ssid[0].len, vap->iv_des_ssid[0].ssid);
 		case IEEE80211_M_IBSS:
 		case IEEE80211_M_AHDEMO:
 			return setmlme_assoc_adhoc(vap, mlme.im_macaddr,
 			    mlme.im_ssid_len, mlme.im_ssid);
 		default:
 			/* other modes take the common path below */
 			break;
 		}
 	}
 	return setmlme_common(vap, mlme.im_op,
 	    mlme.im_macaddr, mlme.im_reason);
 }
 
 /*
  * Add a MAC address to, or remove one from, the vap's ACL.  The request
  * type IEEE80211_IOC_ADDMAC adds; any other type reaching here removes.
  * If the vap has no ACL module yet, the "mac" aclator is fetched and
  * attached first.
  *
  * Returns EINVAL on a length mismatch or when the ACL module cannot be
  * obtained/attached, or the copyin() error.  The results of the add and
  * remove methods are not propagated.
  */
 static int
 ieee80211_ioctl_macmac(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	const struct ieee80211_aclator *acl = vap->iv_acl;
 	uint8_t addr[IEEE80211_ADDR_LEN];
 	int rc;
 
 	if (ireq->i_len != sizeof(addr))
 		return EINVAL;
 	rc = copyin(ireq->i_data, addr, ireq->i_len);
 	if (rc)
 		return rc;
 	if (acl == NULL) {
 		/* first use: bring up the "mac" ACL module */
 		acl = ieee80211_aclator_get("mac");
 		if (acl == NULL || !acl->iac_attach(vap))
 			return EINVAL;
 		vap->iv_acl = acl;
 	}
 	if (ireq->i_type != IEEE80211_IOC_ADDMAC)
 		acl->iac_remove(vap, addr);
 	else
 		acl->iac_add(vap, addr);
 	return 0;
 }
 
 /*
  * Handle an ACL command carried in ireq->i_val.
  *
  * Policy commands attach the "mac" aclator on first use and then set
  * the policy.  FLUSH and DETACH are no-ops when no ACL module is
  * attached.  Any other value is handed to the module's iac_setioctl
  * method, or rejected with EINVAL when no module is attached.
  */
 static int
 ieee80211_ioctl_setmaccmd(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	const struct ieee80211_aclator *acl = vap->iv_acl;
 
 	switch (ireq->i_val) {
 	case IEEE80211_MACCMD_FLUSH:
 		/* NB: silently ignore when not in use */
 		if (acl != NULL)
 			acl->iac_flush(vap);
 		return 0;
 	case IEEE80211_MACCMD_DETACH:
 		if (acl == NULL)
 			return 0;
 		/* unhook from the vap before the module detaches */
 		vap->iv_acl = NULL;
 		acl->iac_detach(vap);
 		return 0;
 	case IEEE80211_MACCMD_POLICY_OPEN:
 	case IEEE80211_MACCMD_POLICY_ALLOW:
 	case IEEE80211_MACCMD_POLICY_DENY:
 	case IEEE80211_MACCMD_POLICY_RADIUS:
 		if (acl == NULL) {
 			acl = ieee80211_aclator_get("mac");
 			if (acl == NULL || !acl->iac_attach(vap))
 				return EINVAL;
 			vap->iv_acl = acl;
 		}
 		acl->iac_setpolicy(vap, ireq->i_val);
 		return 0;
 	default:
 		break;
 	}
 	/* anything else is module specific */
 	return (acl != NULL) ? acl->iac_setioctl(vap, ireq) : EINVAL;
 }
 
 static int
 ieee80211_ioctl_setchanlist(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	uint8_t *chanlist, *list;
 	int i, nchan, maxchan, error;
 
 	if (ireq->i_len > sizeof(ic->ic_chan_active))
 		ireq->i_len = sizeof(ic->ic_chan_active);
 	list = IEEE80211_MALLOC(ireq->i_len + IEEE80211_CHAN_BYTES, M_TEMP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (list == NULL)
 		return ENOMEM;
 	error = copyin(ireq->i_data, list, ireq->i_len);
 	if (error) {
 		IEEE80211_FREE(list, M_TEMP);
 		return error;
 	}
 	nchan = 0;
 	chanlist = list + ireq->i_len;		/* NB: zero'd already */
 	maxchan = ireq->i_len * NBBY;
 	for (i = 0; i < ic->ic_nchans; i++) {
 		const struct ieee80211_channel *c = &ic->ic_channels[i];
 		/*
 		 * Calculate the intersection of the user list and the
 		 * available channels so users can do things like specify
 		 * 1-255 to get all available channels.
 		 */
 		if (c->ic_ieee < maxchan && isset(list, c->ic_ieee)) {
 			setbit(chanlist, c->ic_ieee);
 			nchan++;
 		}
 	}
 	if (nchan == 0) {
 		IEEE80211_FREE(list, M_TEMP);
 		return EINVAL;
 	}
 	if (ic->ic_bsschan != IEEE80211_CHAN_ANYC &&	/* XXX */
 	    isclr(chanlist, ic->ic_bsschan->ic_ieee))
 		ic->ic_bsschan = IEEE80211_CHAN_ANYC;
 	memcpy(ic->ic_chan_active, chanlist, IEEE80211_CHAN_BYTES);
 	ieee80211_scan_flush(vap);
 	IEEE80211_FREE(list, M_TEMP);
 	return ENETRESET;
 }
 
 /*
  * Clear the statistics of the station whose MAC address begins the
  * caller's buffer.  Returns EINVAL when the buffer is shorter than an
  * address, ENOENT when the station is not found on this vap, or the
  * copyin() error.
  */
 static int
 ieee80211_ioctl_setstastats(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	uint8_t addr[IEEE80211_ADDR_LEN];
 	struct ieee80211_node *sta;
 	int rc;
 
 	/*
 	 * NB: copying in a full ieee80211req_sta_stats would allow
 	 *     selective changes, but that's overkill; only the address
 	 *     is read and all stats are cleared for now.
 	 */
 	if (ireq->i_len < IEEE80211_ADDR_LEN)
 		return EINVAL;
 	rc = copyin(ireq->i_data, addr, IEEE80211_ADDR_LEN);
 	if (rc != 0)
 		return rc;
 	sta = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap, addr);
 	if (sta == NULL)
 		return ENOENT;
 	/* XXX require ni_vap == vap? */
 	memset(&sta->ni_stats, 0, sizeof(sta->ni_stats));
 	ieee80211_free_node(sta);
 	return 0;
 }
 
 static int
 ieee80211_ioctl_setstatxpow(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_node *ni;
 	struct ieee80211req_sta_txpow txpow;
 	int error;
 
 	if (ireq->i_len != sizeof(txpow))
 		return EINVAL;
 	error = copyin(ireq->i_data, &txpow, sizeof(txpow));
 	if (error != 0)
 		return error;
 	ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap, txpow.it_macaddr);
 	if (ni == NULL)
 		return ENOENT;
 	ni->ni_txpower = txpow.it_txpow;
 	ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * Set one WME parameter for one access category.
  *
  * ireq->i_len is not a length here: it is masked with
  * IEEE80211_WMEPARAM_BSS to select the bss or the local parameter set,
  * and with IEEE80211_WMEPARAM_VAL to obtain the access category (values
  * at or above WME_NUM_AC fall back to WME_AC_BE).  ireq->i_type selects
  * the parameter and ireq->i_val is the new value.
  *
  * Each value is stored in the wme*ChanParams copy and, subject to the
  * per-parameter conditions below, in the *chanParams copy as well.
  * Returns EOPNOTSUPP when the device lacks IEEE80211_C_WME, else 0.
  */
 static int
 ieee80211_ioctl_setwmeparam(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_wme_state *wme = &ic->ic_wme;
 	struct wmeParams *wmep, *chanp;
 	int isbss, ac, aggrmode, mirror;
 
 	if (!(ic->ic_caps & IEEE80211_C_WME))
 		return EOPNOTSUPP;
 
 	isbss = (ireq->i_len & IEEE80211_WMEPARAM_BSS);
 	ac = (ireq->i_len & IEEE80211_WMEPARAM_VAL);
 	aggrmode = (wme->wme_flags & WME_F_AGGRMODE);
 	if (ac >= WME_NUM_AC)
 		ac = WME_AC_BE;
 	wmep = isbss ? &wme->wme_wmeBssChanParams.cap_wmeParams[ac] :
 	    &wme->wme_wmeChanParams.cap_wmeParams[ac];
 	chanp = isbss ? &wme->wme_bssChanParams.cap_wmeParams[ac] :
 	    &wme->wme_chanParams.cap_wmeParams[ac];
 	/* bss parameters are not mirrored while WME_F_AGGRMODE is set */
 	mirror = (!isbss || !aggrmode);
 
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_WME_CWMIN:		/* WME: CWmin */
 		wmep->wmep_logcwmin = ireq->i_val;
 		if (mirror)
 			chanp->wmep_logcwmin = ireq->i_val;
 		break;
 	case IEEE80211_IOC_WME_CWMAX:		/* WME: CWmax */
 		wmep->wmep_logcwmax = ireq->i_val;
 		if (mirror)
 			chanp->wmep_logcwmax = ireq->i_val;
 		break;
 	case IEEE80211_IOC_WME_AIFS:		/* WME: AIFS */
 		wmep->wmep_aifsn = ireq->i_val;
 		if (mirror)
 			chanp->wmep_aifsn = ireq->i_val;
 		break;
 	case IEEE80211_IOC_WME_TXOPLIMIT:	/* WME: txops limit */
 		wmep->wmep_txopLimit = ireq->i_val;
 		if (mirror)
 			chanp->wmep_txopLimit = ireq->i_val;
 		break;
 	case IEEE80211_IOC_WME_ACM:		/* WME: ACM (bss only) */
 		wmep->wmep_acm = ireq->i_val;
 		/* NB: depends on aggrmode alone, unlike the cases above */
 		if (!aggrmode)
 			chanp->wmep_acm = ireq->i_val;
 		break;
 	case IEEE80211_IOC_WME_ACKPOLICY:	/* WME: ACK policy (!bss only)*/
 		/* NB: stored inverted, a zero i_val sets noackPolicy */
 		chanp->wmep_noackPolicy = (ireq->i_val) == 0;
 		wmep->wmep_noackPolicy = chanp->wmep_noackPolicy;
 		break;
 	}
 	ieee80211_wme_updateparams(vap);
 	return 0;
 }
 
 /*
  * Report whether the channel table holds an "any g" channel on
  * frequency freq at some index other than start.  Returns 1 when
  * one is found and 0 otherwise.
  */
 static int
 find11gchannel(struct ieee80211com *ic, int start, int freq)
 {
 	const struct ieee80211_channel *chan;
 	int idx;
 
 	/* entries after start are examined first */
 	idx = start + 1;
 	while (idx < ic->ic_nchans) {
 		chan = &ic->ic_channels[idx++];
 		if (IEEE80211_IS_CHAN_ANYG(chan) && chan->ic_freq == freq)
 			return (1);
 	}
 	/* NB: should not be needed but in case things are mis-sorted */
 	idx = 0;
 	while (idx < start) {
 		chan = &ic->ic_channels[idx++];
 		if (IEEE80211_IS_CHAN_ANYG(chan) && chan->ic_freq == freq)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Look up a channel by IEEE channel number for the given phy mode.
  *
  * For IEEE80211_MODE_AUTO turbo channels are skipped and an 11b
  * channel is passed over when an "any g" channel exists on the same
  * frequency.  For any other mode the channel must carry all flags
  * listed for that mode in the table below; the VHT modes additionally
  * require a VHT channel and the 11n modes require an HT channel that
  * is not also VHT.
  *
  * Returns the first matching channel or NULL.
  */
 static struct ieee80211_channel *
 findchannel(struct ieee80211com *ic, int ieee, int mode)
 {
 	static const u_int chanflags[IEEE80211_MODE_MAX] = {
 	    [IEEE80211_MODE_AUTO]	= 0,
 	    [IEEE80211_MODE_11A]	= IEEE80211_CHAN_A,
 	    [IEEE80211_MODE_11B]	= IEEE80211_CHAN_B,
 	    [IEEE80211_MODE_11G]	= IEEE80211_CHAN_G,
 	    [IEEE80211_MODE_FH]		= IEEE80211_CHAN_FHSS,
 	    [IEEE80211_MODE_TURBO_A]	= IEEE80211_CHAN_108A,
 	    [IEEE80211_MODE_TURBO_G]	= IEEE80211_CHAN_108G,
 	    [IEEE80211_MODE_STURBO_A]	= IEEE80211_CHAN_STURBO,
 	    [IEEE80211_MODE_HALF]	= IEEE80211_CHAN_HALF,
 	    [IEEE80211_MODE_QUARTER]	= IEEE80211_CHAN_QUARTER,
 	    /* NB: the HT/VHT modes get extra checks in the loop */
 	    [IEEE80211_MODE_11NA]	= IEEE80211_CHAN_A,
 	    [IEEE80211_MODE_11NG]	= IEEE80211_CHAN_G,
 	    [IEEE80211_MODE_VHT_5GHZ]	= IEEE80211_CHAN_A,
 	    [IEEE80211_MODE_VHT_2GHZ]	= IEEE80211_CHAN_G,
 	};
 	u_int wanted;
 	int need_vht, need_ht, i;
 
 	wanted = chanflags[mode];
 	need_vht = (mode == IEEE80211_MODE_VHT_5GHZ ||
 	    mode == IEEE80211_MODE_VHT_2GHZ);
 	need_ht = (mode == IEEE80211_MODE_11NA ||
 	    mode == IEEE80211_MODE_11NG);
 
 	for (i = 0; i < ic->ic_nchans; i++) {
 		struct ieee80211_channel *c = &ic->ic_channels[i];
 
 		if (c->ic_ieee != ieee)
 			continue;
 
 		if (mode == IEEE80211_MODE_AUTO) {
 			/* ignore turbo channels for autoselect */
 			if (IEEE80211_IS_CHAN_TURBO(c))
 				continue;
 			/*
 			 * XXX special-case 11b/g channels so the g
 			 *     channel is always selected when both
 			 *     are present.
 			 * XXX prefer HT to non-HT?
 			 */
 			if (IEEE80211_IS_CHAN_B(c) &&
 			    find11gchannel(ic, i, c->ic_freq))
 				continue;
 			return (c);
 		}
 
 		/* VHT modes only match VHT channels */
 		if (need_vht && !IEEE80211_IS_CHAN_VHT(c))
 			continue;
 		/* 11n modes only match HT channels that are not HT+VHT */
 		if (need_ht &&
 		    (!IEEE80211_IS_CHAN_HT(c) || IEEE80211_IS_CHAN_VHT(c)))
 			continue;
 		/* finally every flag listed for the mode must be set */
 		if ((c->ic_flags & wanted) != wanted)
 			continue;
 		return (c);
 	}
 	return (NULL);
 }
 
 /*
  * Check the specified channel against any desired mode (aka netband).
  * This is only used (presently) when operating in hostap mode
  * to enforce consistency.
  *
  * Returns non-zero when the channel is acceptable for the mode;
  * modes without an explicit rule below always pass.
  */
 static int
 check_mode_consistency(const struct ieee80211_channel *c, int mode)
 {
 	int ok = 1;
 
 	KASSERT(c != IEEE80211_CHAN_ANYC, ("oops, no channel"));
 
 	switch (mode) {
 	case IEEE80211_MODE_11B:
 		ok = IEEE80211_IS_CHAN_B(c);
 		break;
 	case IEEE80211_MODE_11G:
 		ok = IEEE80211_IS_CHAN_ANYG(c) && !IEEE80211_IS_CHAN_HT(c);
 		break;
 	case IEEE80211_MODE_11A:
 		ok = IEEE80211_IS_CHAN_A(c) && !IEEE80211_IS_CHAN_HT(c);
 		break;
 	case IEEE80211_MODE_STURBO_A:
 		ok = IEEE80211_IS_CHAN_STURBO(c);
 		break;
 	case IEEE80211_MODE_11NA:
 		ok = IEEE80211_IS_CHAN_HTA(c);
 		break;
 	case IEEE80211_MODE_11NG:
 		ok = IEEE80211_IS_CHAN_HTG(c);
 		break;
 	default:
 		break;
 	}
 	return (ok);
 }
 
 /*
  * Common code to set the current channel.  If the device
  * is up and running this may result in an immediate channel
  * change or a kick of the state machine.
  */
 static int
 setcurchan(struct ieee80211vap *vap, struct ieee80211_channel *c)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int error;
 
 	if (c != IEEE80211_CHAN_ANYC) {
 		if (IEEE80211_IS_CHAN_RADAR(c))
 			return EBUSY;	/* XXX better code? */
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP) {
 			if (IEEE80211_IS_CHAN_NOHOSTAP(c))
 				return EINVAL;
 			if (!check_mode_consistency(c, vap->iv_des_mode))
 				return EINVAL;
 		} else if (vap->iv_opmode == IEEE80211_M_IBSS) {
 			if (IEEE80211_IS_CHAN_NOADHOC(c))
 				return EINVAL;
 		}
 		if ((vap->iv_state == IEEE80211_S_RUN || vap->iv_state == IEEE80211_S_SLEEP) &&
 		    vap->iv_bss->ni_chan == c)
 			return 0;	/* NB: nothing to do */
 	}
 	vap->iv_des_chan = c;
 
 	error = 0;
 	if (vap->iv_opmode == IEEE80211_M_MONITOR &&
 	    vap->iv_des_chan != IEEE80211_CHAN_ANYC) {
 		/*
 		 * Monitor mode can switch directly.
 		 */
 		if (IFNET_IS_UP_RUNNING(vap->iv_ifp)) {
 			/* XXX need state machine for other vap's to follow */
 			ieee80211_setcurchan(ic, vap->iv_des_chan);
 			vap->iv_bss->ni_chan = ic->ic_curchan;
 		} else {
 			ic->ic_curchan = vap->iv_des_chan;
 			ic->ic_rt = ieee80211_get_ratetable(ic->ic_curchan);
 		}
 	} else {
 		/*
 		 * Need to go through the state machine in case we
 		 * need to reassociate or the like.  The state machine
 		 * will pickup the desired channel and avoid scanning.
 		 */
 		if (IS_UP_AUTO(vap))
 			ieee80211_new_state(vap, IEEE80211_S_SCAN, 0);
 		else if (vap->iv_des_chan != IEEE80211_CHAN_ANYC) {
 			/*
 			 * When not up+running and a real channel has
 			 * been specified fix the current channel so
 			 * there is immediate feedback; e.g. via ifconfig.
 			 */
 			ic->ic_curchan = vap->iv_des_chan;
 			ic->ic_rt = ieee80211_get_ratetable(ic->ic_curchan);
 		}
 	}
 	return error;
 }
 
 /*
  * Old api for setting the current channel; this is
  * deprecated because channel numbers are ambiguous.
  *
  * ireq->i_val holds the IEEE channel number; 0 or the 16-bit
  * form of IEEE80211_CHAN_ANY selects "any channel".  Returns
  * EINVAL when no channel with that number exists, otherwise
  * whatever setcurchan() returns.
  */
 static int
 ieee80211_ioctl_setchannel(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_channel *c, *alt;
 	int refine;
 
 	/* XXX 0xffff overflows 16-bit signed */
 	if (ireq->i_val == 0 ||
 	    ireq->i_val == (int16_t) IEEE80211_CHAN_ANY)
 		return (setcurchan(vap, IEEE80211_CHAN_ANYC));
 
 	/* try the desired mode first, then fall back to autoselect */
 	c = findchannel(ic, ireq->i_val, vap->iv_des_mode);
 	if (c == NULL)
 		c = findchannel(ic, ireq->i_val, IEEE80211_MODE_AUTO);
 	if (c == NULL)
 		return (EINVAL);
 
 	/*
 	 * Fine tune channel selection based on desired mode:
 	 *   if 11b is requested, find the 11b version of any
 	 *      11g channel returned,
 	 *   if static turbo, find the turbo version of any
 	 *	11a channel returned,
 	 *   if 11na is requested, find the ht version of any
 	 *      11a channel returned,
 	 *   if 11ng is requested, find the ht version of any
 	 *      11g channel returned,
 	 *   if 11ac is requested, find the 11ac version
 	 *      of any 11a/11na channel returned,
 	 *   (TBD) 11acg (2GHz VHT)
 	 *   otherwise we should be ok with what we've got.
 	 *
 	 * In every case the refined lookup uses the desired mode
 	 * itself, so the switch only decides whether to retry.
 	 */
 	refine = 0;
 	switch (vap->iv_des_mode) {
 	case IEEE80211_MODE_11B:
 		/* NB: a miss should not happen, =>'s 11g w/o 11b */
 	case IEEE80211_MODE_11NG:
 		refine = IEEE80211_IS_CHAN_ANYG(c);
 		break;
 	case IEEE80211_MODE_TURBO_A:
 	case IEEE80211_MODE_11NA:
 	case IEEE80211_MODE_VHT_5GHZ:
 		refine = IEEE80211_IS_CHAN_A(c);
 		break;
 	case IEEE80211_MODE_VHT_2GHZ:
 		printf("%s: TBD\n", __func__);
 		break;
 	default:		/* NB: no static turboG */
 		break;
 	}
 	if (refine) {
 		alt = findchannel(ic, ireq->i_val, vap->iv_des_mode);
 		if (alt != NULL)
 			c = alt;
 	}
 	return (setcurchan(vap, c));
 }
 
 /*
  * New/current api for setting the current channel; a complete
  * channel description is provide so there is no ambiguity in
  * identifying the channel.
  */
 static int
 ieee80211_ioctl_setcurchan(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_channel chan, *c;
 	int error;
 
 	if (ireq->i_len != sizeof(chan))
 		return EINVAL;
 	error = copyin(ireq->i_data, &chan, sizeof(chan));
 	if (error != 0)
 		return error;
 
 	/* XXX 0xffff overflows 16-bit signed */
 	if (chan.ic_freq == 0 || chan.ic_freq == IEEE80211_CHAN_ANY) {
 		c = IEEE80211_CHAN_ANYC;
 	} else {
 		c = ieee80211_find_channel(ic, chan.ic_freq, chan.ic_flags);
 		if (c == NULL)
 			return EINVAL;
 	}
 	return setcurchan(vap, c);
 }
 
 static int
 ieee80211_ioctl_setregdomain(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211_regdomain_req *reg;
 	int nchans, error;
 
 	nchans = 1 + ((ireq->i_len - sizeof(struct ieee80211_regdomain_req)) /
 	    sizeof(struct ieee80211_channel));
 	if (!(1 <= nchans && nchans <= IEEE80211_CHAN_MAX)) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_IOCTL,
 		    "%s: bad # chans, i_len %d nchans %d\n", __func__,
 		    ireq->i_len, nchans);
 		return EINVAL;
 	}
 	reg = (struct ieee80211_regdomain_req *)
 	    IEEE80211_MALLOC(IEEE80211_REGDOMAIN_SIZE(nchans), M_TEMP,
 	      IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (reg == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_IOCTL,
 		    "%s: no memory, nchans %d\n", __func__, nchans);
 		return ENOMEM;
 	}
 	error = copyin(ireq->i_data, reg, IEEE80211_REGDOMAIN_SIZE(nchans));
 	if (error == 0) {
 		/* NB: validate inline channel count against storage size */
 		if (reg->chaninfo.ic_nchans != nchans) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_IOCTL,
 			    "%s: chan cnt mismatch, %d != %d\n", __func__,
 				reg->chaninfo.ic_nchans, nchans);
 			error = EINVAL;
 		} else
 			error = ieee80211_setregdomain(vap, reg);
 	}
 	IEEE80211_FREE(reg, M_TEMP);
 
 	return (error == 0 ? ENETRESET : error);
 }
 
 /*
  * Return 1 when rate is IEEE80211_FIXED_RATE_NONE or matches an
  * entry of the rate set (compared after masking the entry with
  * IEEE80211_RATE_VAL); return 0 otherwise.
  */
 static int
 checkrate(const struct ieee80211_rateset *rs, int rate)
 {
 	int idx = 0;
 
 	if (rate == IEEE80211_FIXED_RATE_NONE)
 		return (1);
 	while (idx < rs->rs_nrates) {
 		if ((rs->rs_rates[idx] & IEEE80211_RATE_VAL) == rate)
 			return (1);
 		idx++;
 	}
 	return (0);
 }
 
 /*
  * Return 1 when mcs is IEEE80211_FIXED_RATE_NONE or names an MCS
  * present in the HT rate set; return 0 otherwise, including when
  * the IEEE80211_RATE_MCS bit is not set in mcs.
  */
 static int
 checkmcs(const struct ieee80211_htrateset *rs, int mcs)
 {
 	int wanted, idx;
 
 	if (mcs == IEEE80211_FIXED_RATE_NONE)
 		return (1);
 	/* MCS always have 0x80 set */
 	if (!(mcs & IEEE80211_RATE_MCS))
 		return (0);
 
 	wanted = IEEE80211_RV(mcs);
 	for (idx = 0; idx < rs->rs_nrates; idx++) {
 		if (IEEE80211_RV(rs->rs_rates[idx]) == wanted)
 			return (1);
 	}
 	return (0);
 }
 
 static int
 ieee80211_ioctl_setroam(struct ieee80211vap *vap,
         const struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_roamparams_req *parms;
 	struct ieee80211_roamparam *src, *dst;
 	const struct ieee80211_htrateset *rs_ht;
 	const struct ieee80211_rateset *rs;
 	int changed, error, mode, is11n, nmodes;
 
 	if (ireq->i_len != sizeof(vap->iv_roamparms))
 		return EINVAL;
 
 	parms = IEEE80211_MALLOC(sizeof(*parms), M_TEMP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (parms == NULL)
 		return ENOMEM;
 
 	error = copyin(ireq->i_data, parms, ireq->i_len);
 	if (error != 0)
 		goto fail;
 
 	changed = 0;
 	nmodes = IEEE80211_MODE_MAX;
 
 	/* validate parameters and check if anything changed */
 	for (mode = IEEE80211_MODE_11A; mode < nmodes; mode++) {
 		if (isclr(ic->ic_modecaps, mode))
 			continue;
 		src = &parms->params[mode];
 		dst = &vap->iv_roamparms[mode];
 		rs = &ic->ic_sup_rates[mode];	/* NB: 11n maps to legacy */
 		rs_ht = &ic->ic_sup_htrates;
 		is11n = (mode == IEEE80211_MODE_11NA ||
 			 mode == IEEE80211_MODE_11NG);
 		/* XXX TODO: 11ac */
 		if (src->rate != dst->rate) {
 			if (!checkrate(rs, src->rate) &&
 			    (!is11n || !checkmcs(rs_ht, src->rate))) {
 				error = EINVAL;
 				goto fail;
 			}
 			changed++;
 		}
 		if (src->rssi != dst->rssi)
 			changed++;
 	}
 	if (changed) {
 		/*
 		 * Copy new parameters in place and notify the
 		 * driver so it can push state to the device.
 		 */
 		/* XXX locking? */
 		for (mode = IEEE80211_MODE_11A; mode < nmodes; mode++) {
 			if (isset(ic->ic_modecaps, mode))
 				vap->iv_roamparms[mode] = parms->params[mode];
 		}
 
 		if (vap->iv_roaming == IEEE80211_ROAMING_DEVICE)
 			error = ERESTART;
 	}
 
 fail:	IEEE80211_FREE(parms, M_TEMP);
 	return error;
 }
 
 static int
 ieee80211_ioctl_settxparams(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_txparams_req parms;	/* XXX stack use? */
 	struct ieee80211_txparam *src, *dst;
 	const struct ieee80211_htrateset *rs_ht;
 	const struct ieee80211_rateset *rs;
 	int error, mode, changed, is11n, nmodes;
 
 	/* NB: accept short requests for backwards compat */
 	if (ireq->i_len > sizeof(parms))
 		return EINVAL;
 	error = copyin(ireq->i_data, &parms, ireq->i_len);
 	if (error != 0)
 		return error;
 	nmodes = ireq->i_len / sizeof(struct ieee80211_txparam);
 	changed = 0;
 	/* validate parameters and check if anything changed */
 	for (mode = IEEE80211_MODE_11A; mode < nmodes; mode++) {
 		if (isclr(ic->ic_modecaps, mode))
 			continue;
 		src = &parms.params[mode];
 		dst = &vap->iv_txparms[mode];
 		rs = &ic->ic_sup_rates[mode];	/* NB: 11n maps to legacy */
 		rs_ht = &ic->ic_sup_htrates;
 		is11n = (mode == IEEE80211_MODE_11NA ||
 			 mode == IEEE80211_MODE_11NG);
 		if (src->ucastrate != dst->ucastrate) {
 			if (!checkrate(rs, src->ucastrate) &&
 			    (!is11n || !checkmcs(rs_ht, src->ucastrate)))
 				return EINVAL;
 			changed++;
 		}
 		if (src->mcastrate != dst->mcastrate) {
 			if (!checkrate(rs, src->mcastrate) &&
 			    (!is11n || !checkmcs(rs_ht, src->mcastrate)))
 				return EINVAL;
 			changed++;
 		}
 		if (src->mgmtrate != dst->mgmtrate) {
 			if (!checkrate(rs, src->mgmtrate) &&
 			    (!is11n || !checkmcs(rs_ht, src->mgmtrate)))
 				return EINVAL;
 			changed++;
 		}
 		if (src->maxretry != dst->maxretry)	/* NB: no bounds */
 			changed++;
 	}
 	if (changed) {
 		/*
 		 * Copy new parameters in place and notify the
 		 * driver so it can push state to the device.
 		 */
 		for (mode = IEEE80211_MODE_11A; mode < nmodes; mode++) {
 			if (isset(ic->ic_modecaps, mode))
 				vap->iv_txparms[mode] = parms.params[mode];
 		}
 		/* XXX could be more intelligent,
 		   e.g. don't reset if setting not being used */
 		return ENETRESET;
 	}
 	return 0;
 }
 
 /*
  * Application Information Element support.
  *
  * Replace (or, for a zero-length request, delete) the appie hanging
  * off *aie with the data supplied in ireq.  Returns EINVAL when the
  * length is not in 2..IEEE80211_MAX_APPIE, ENOMEM on allocation
  * failure, the copyin() error on a failed copy, and 0 on success.
  */
 static int
 setappie(struct ieee80211_appie **aie, const struct ieee80211req *ireq)
 {
 	struct ieee80211_appie *old = *aie;
 	struct ieee80211_appie *fresh;
 	int rc;
 
 	/* a zero length =>'s delete any existing ie */
 	if (ireq->i_len == 0) {
 		if (old == NULL)
 			return (0);
 		*aie = NULL;	/* XXX racey */
 		IEEE80211_FREE(old, M_80211_NODE_IE);
 		return (0);
 	}
 	if (ireq->i_len < 2 || ireq->i_len > IEEE80211_MAX_APPIE)
 		return (EINVAL);
 	/*
 	 * Build the replacement in a freshly allocated structure and
 	 * swap it in once the user data has arrived.  Nothing here
 	 * protects users still holding a reference to the old
 	 * structure; that must be handled outside this code.
 	 *
 	 * XXX bad bad bad
 	 */
 	fresh = IEEE80211_MALLOC(sizeof(*fresh) + ireq->i_len,
 	    M_80211_NODE_IE, IEEE80211_M_NOWAIT);
 	if (fresh == NULL)
 		return (ENOMEM);
 	/* XXX holding ic lock */
 	rc = copyin(ireq->i_data, fresh->ie_data, ireq->i_len);
 	if (rc != 0) {
 		IEEE80211_FREE(fresh, M_80211_NODE_IE);
 		return (rc);
 	}
 	fresh->ie_len = ireq->i_len;
 	*aie = fresh;
 	if (old != NULL)
 		IEEE80211_FREE(old, M_80211_NODE_IE);
 	return (0);
 }
 
 /*
  * Record a WPA (vendor) or RSN information element in the vap.
  *
  * ie points at the start of an element (id byte, length byte, then
  * payload) and space is the number of bytes available from ie; it
  * may be zero or negative when the caller has run off the end of
  * its buffer.  The element is accepted only when its header and
  * payload fit in space.  An element id of IEEE80211_ELEMID_VENDOR
  * sets iv_wpa_ie and IEEE80211_ELEMID_RSN sets iv_rsn_ie; anything
  * else is ignored.
  */
 static void
 setwparsnie(struct ieee80211vap *vap, uint8_t *ie, int space)
 {
 	/*
 	 * Validate data is present as best we can.  NB: the two
 	 * byte header must fit before ie[1] may be looked at.
 	 */
 	if (space < 2 || 2+ie[1] > space)
 		return;
 	if (ie[0] == IEEE80211_ELEMID_VENDOR)
 		vap->iv_wpa_ie = ie;
 	else if (ie[0] == IEEE80211_ELEMID_RSN)
 		vap->iv_rsn_ie = ie;
 }
 
 /*
  * Set the application ie for one management frame subtype; the
  * com lock must be held (asserted below).
  *
  * fc0 selects the frame subtype the ie applies to:
  *   beacon		hostap and ibss vaps only; beacon code is notified
  *   probe response	any opmode
  *   assoc response	hostap vaps only
  *   probe request	any opmode
  *   assoc request	station vaps only
  *   IEEE80211_APPIE_WPA	stores the blob and re-derives the
  *			iv_wpa_ie/iv_rsn_ie pointers from it
  *
  * Returns EINVAL for an unknown subtype or an opmode mismatch,
  * ENETRESET after a successful WPA update on a hostap or ibss vap,
  * and otherwise whatever setappie() returns.
  */
 static int
 ieee80211_ioctl_setappie_locked(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq, int fc0)
 {
 	int error;
 
 	IEEE80211_LOCK_ASSERT(vap->iv_ic);
 
 	switch (fc0 & IEEE80211_FC0_SUBTYPE_MASK) {
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 		if (vap->iv_opmode != IEEE80211_M_HOSTAP &&
 		    vap->iv_opmode != IEEE80211_M_IBSS) {
 			error = EINVAL;
 			break;
 		}
 		error = setappie(&vap->iv_appie_beacon, ireq);
 		if (error == 0)
 			ieee80211_beacon_notify(vap, IEEE80211_BEACON_APPIE);
 		break;
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 		error = setappie(&vap->iv_appie_proberesp, ireq);
 		break;
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP)
 			error = setappie(&vap->iv_appie_assocresp, ireq);
 		else
 			error = EINVAL;
 		break;
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 		error = setappie(&vap->iv_appie_probereq, ireq);
 		break;
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 		if (vap->iv_opmode == IEEE80211_M_STA)
 			error = setappie(&vap->iv_appie_assocreq, ireq);
 		else
 			error = EINVAL;
 		break;
 	case (IEEE80211_APPIE_WPA & IEEE80211_FC0_SUBTYPE_MASK):
 		error = setappie(&vap->iv_appie_wpa, ireq);
 		if (error == 0) {
 			/*
 			 * Must split single blob of data into separate
 			 * WPA and RSN ie's because they go in different
 			 * locations in the mgt frames.
 			 * XXX use IEEE80211_IOC_WPA2 so user code does split
 			 */
 			vap->iv_wpa_ie = NULL;
 			vap->iv_rsn_ie = NULL;
 			if (vap->iv_appie_wpa != NULL) {
 				struct ieee80211_appie *appie =
 				    vap->iv_appie_wpa;
 				uint8_t *data = appie->ie_data;
 				/* NB: setappie stores at least 2 bytes */
 				int first = 2 + data[1];
 
 				/* XXX ie length validate is painful, cheat */
 				setwparsnie(vap, data, appie->ie_len);
 				/*
 				 * Only look for a second ie when a full
 				 * two byte header remains after the first;
 				 * otherwise the pointer and length would
 				 * refer to memory past the end of the blob.
 				 */
 				if (appie->ie_len - first >= 2)
 					setwparsnie(vap, data + first,
 					    appie->ie_len - first);
 			}
 			if (vap->iv_opmode == IEEE80211_M_HOSTAP ||
 			    vap->iv_opmode == IEEE80211_M_IBSS) {
 				/*
 				 * Must rebuild beacon frame as the update
 				 * mechanism doesn't handle WPA/RSN ie's.
 				 * Could extend it but it doesn't normally
 				 * change; this is just to deal with hostapd
 				 * plumbing the ie after the interface is up.
 				 */
 				error = ENETRESET;
 			}
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return error;
 }
 
 /*
  * Set an application ie.  The low byte of ireq->i_val is taken as
  * the frame control byte identifying the frame the ie is for; it
  * must be of management type or EINVAL is returned.  The real work
  * is done by ieee80211_ioctl_setappie_locked() under the com lock.
  */
 static int
 ieee80211_ioctl_setappie(struct ieee80211vap *vap,
 	const struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	const uint8_t fc0 = ireq->i_val & 0xff;
 	int rc;
 
 	if ((fc0 & IEEE80211_FC0_TYPE_MASK) != IEEE80211_FC0_TYPE_MGT)
 		return (EINVAL);
 	/* NB: could check iv_opmode and reject but hardly worth the effort */
 	IEEE80211_LOCK(ic);
 	rc = ieee80211_ioctl_setappie_locked(vap, ireq, fc0);
 	IEEE80211_UNLOCK(ic);
 	return (rc);
 }
 
 static int
 ieee80211_ioctl_chanswitch(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_chanswitch_req csr;
 	struct ieee80211_channel *c;
 	int error;
 
 	if (ireq->i_len != sizeof(csr))
 		return EINVAL;
 	error = copyin(ireq->i_data, &csr, sizeof(csr));
 	if (error != 0)
 		return error;
 	/* XXX adhoc mode not supported */
 	if (vap->iv_opmode != IEEE80211_M_HOSTAP ||
 	    (vap->iv_flags & IEEE80211_F_DOTH) == 0)
 		return EOPNOTSUPP;
 	c = ieee80211_find_channel(ic,
 	    csr.csa_chan.ic_freq, csr.csa_chan.ic_flags);
 	if (c == NULL)
 		return ENOENT;
 	IEEE80211_LOCK(ic);
 	if ((ic->ic_flags & IEEE80211_F_CSAPENDING) == 0)
 		ieee80211_csa_startswitch(ic, c, csr.csa_mode, csr.csa_count);
 	else if (csr.csa_count == 0)
 		ieee80211_csa_cancelswitch(ic);
 	else
 		error = EBUSY;
 	IEEE80211_UNLOCK(ic);
 	return error;
 }
 
 /*
  * Validate a scan request and either start the scan or stash the
  * request for the state machine.
  *
  * NB: *sr is modified in place: the duration and dwell times are
  *     passed through msecs_to_ticks(), the ssid count is clamped and
  *     the flags are masked.
  *
  * Returns EINVAL for a duration outside the allowed range or an
  * ssid longer than IEEE80211_NWID_LEN, ENXIO when ic_nrunning is
  * zero, EINPROGRESS when the check/start scan call returns 0, and
  * 0 in all other cases.
  */
 static int
 ieee80211_scanreq(struct ieee80211vap *vap, struct ieee80211_scan_req *sr)
 {
 /* the set of request flags that are honored; all others are dropped */
 #define	IEEE80211_IOC_SCAN_FLAGS \
 	(IEEE80211_IOC_SCAN_NOPICK | IEEE80211_IOC_SCAN_ACTIVE | \
 	 IEEE80211_IOC_SCAN_PICK1ST | IEEE80211_IOC_SCAN_BGSCAN | \
 	 IEEE80211_IOC_SCAN_ONCE | IEEE80211_IOC_SCAN_NOBCAST | \
 	 IEEE80211_IOC_SCAN_NOJOIN | IEEE80211_IOC_SCAN_FLUSH | \
 	 IEEE80211_IOC_SCAN_CHECK)
 	struct ieee80211com *ic = vap->iv_ic;
 	int error, i;
 
 	/* convert duration */
 	if (sr->sr_duration == IEEE80211_IOC_SCAN_FOREVER)
 		sr->sr_duration = IEEE80211_SCAN_FOREVER;
 	else {
 		if (sr->sr_duration < IEEE80211_IOC_SCAN_DURATION_MIN ||
 		    sr->sr_duration > IEEE80211_IOC_SCAN_DURATION_MAX)
 			return EINVAL;
 		sr->sr_duration = msecs_to_ticks(sr->sr_duration);
 	}
 	/* convert min/max channel dwell */
 	if (sr->sr_mindwell != 0)
 		sr->sr_mindwell = msecs_to_ticks(sr->sr_mindwell);
 	if (sr->sr_maxdwell != 0)
 		sr->sr_maxdwell = msecs_to_ticks(sr->sr_maxdwell);
 	/* NB: silently reduce ssid count to what is supported */
 	if (sr->sr_nssid > IEEE80211_SCAN_MAX_SSID)
 		sr->sr_nssid = IEEE80211_SCAN_MAX_SSID;
 	/* NB: the lengths checked here bound the memcpy's further down */
 	for (i = 0; i < sr->sr_nssid; i++)
 		if (sr->sr_ssid[i].len > IEEE80211_NWID_LEN)
 			return EINVAL;
 	/* cleanse flags just in case, could reject if invalid flags */
 	sr->sr_flags &= IEEE80211_IOC_SCAN_FLAGS;
 	/*
 	 * Add an implicit NOPICK if the vap is not marked UP.  This
 	 * allows applications to scan without joining a bss (or picking
 	 * a channel and setting up a bss) and without forcing manual
 	 * roaming mode--you just need to mark the parent device UP.
 	 */
 	if ((vap->iv_ifp->if_flags & IFF_UP) == 0)
 		sr->sr_flags |= IEEE80211_IOC_SCAN_NOPICK;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 	    "%s: flags 0x%x%s duration 0x%x mindwell %u maxdwell %u nssid %d\n",
 	    __func__, sr->sr_flags,
 	    (vap->iv_ifp->if_flags & IFF_UP) == 0 ? " (!IFF_UP)" : "",
 	    sr->sr_duration, sr->sr_mindwell, sr->sr_maxdwell, sr->sr_nssid);
 	/*
 	 * If we are in INIT state then the driver has never had a chance
 	 * to setup hardware state to do a scan; we must use the state
 	 * machine to get us up to the SCAN state but once we reach SCAN
 	 * state we then want to use the supplied params.  Stash the
 	 * parameters in the vap and mark IEEE80211_FEXT_SCANREQ; the
 	 * state machines will recognize this and use the stashed params
 	 * to issue the scan request.
 	 *
 	 * Otherwise just invoke the scan machinery directly.
 	 */
 	IEEE80211_LOCK(ic);
 	if (ic->ic_nrunning == 0) {
 		IEEE80211_UNLOCK(ic);
 		return ENXIO;
 	}
 
 	if (vap->iv_state == IEEE80211_S_INIT) {
 		/* NB: clobbers previous settings */
 		/* NB: only flags, duration and ssids are stashed; not dwell */
 		vap->iv_scanreq_flags = sr->sr_flags;
 		vap->iv_scanreq_duration = sr->sr_duration;
 		vap->iv_scanreq_nssid = sr->sr_nssid;
 		for (i = 0; i < sr->sr_nssid; i++) {
 			vap->iv_scanreq_ssid[i].len = sr->sr_ssid[i].len;
 			memcpy(vap->iv_scanreq_ssid[i].ssid,
 			    sr->sr_ssid[i].ssid, sr->sr_ssid[i].len);
 		}
 		vap->iv_flags_ext |= IEEE80211_FEXT_SCANREQ;
 		/* NB: the com lock is dropped before the state change */
 		IEEE80211_UNLOCK(ic);
 		ieee80211_new_state(vap, IEEE80211_S_SCAN, 0);
 	} else {
 		vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ;
 		/* NB: the com lock is dropped before calling the scan code */
 		IEEE80211_UNLOCK(ic);
 		if (sr->sr_flags & IEEE80211_IOC_SCAN_CHECK) {
 			error = ieee80211_check_scan(vap, sr->sr_flags,
 			    sr->sr_duration, sr->sr_mindwell, sr->sr_maxdwell,
 			    sr->sr_nssid,
 			    /* NB: cheat, we assume structures are compatible */
 			    (const struct ieee80211_scan_ssid *) &sr->sr_ssid[0]);
 		} else {
 			error = ieee80211_start_scan(vap, sr->sr_flags,
 			    sr->sr_duration, sr->sr_mindwell, sr->sr_maxdwell,
 			    sr->sr_nssid,
 			    /* NB: cheat, we assume structures are compatible */
 			    (const struct ieee80211_scan_ssid *) &sr->sr_ssid[0]);
 		}
 		/*
 		 * NB: "error" is the scan call's return value; a zero
 		 *     result is what gets reported as EINPROGRESS.
 		 *     NOTE(review): presumably zero means no new scan was
 		 *     started - confirm against the scan code.
 		 */
 		if (error == 0)
 			return EINPROGRESS;
 	}
 	return 0;
 #undef IEEE80211_IOC_SCAN_FLAGS
 }
 
 /*
  * ioctl entry point for a scan request: copy the caller's struct
  * ieee80211_scan_req into a temporary buffer and hand it to
  * ieee80211_scanreq().
  *
  * Returns EINVAL on a size mismatch, ENOMEM when the buffer cannot
  * be allocated, the copyin() error on a failed copy, and otherwise
  * the ieee80211_scanreq() result.
  */
 static int
 ieee80211_ioctl_scanreq(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_scan_req *req;
 	int rc;
 
 	if (ireq->i_len != sizeof(*req))
 		return (EINVAL);
 	req = IEEE80211_MALLOC(sizeof(*req), M_TEMP,
 	     IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (req == NULL)
 		return (ENOMEM);
 	rc = copyin(ireq->i_data, req, sizeof(*req));
 	if (rc == 0)
 		rc = ieee80211_scanreq(vap, req);
 	IEEE80211_FREE(req, M_TEMP);
 	return (rc);
 }
 
 static int
 ieee80211_ioctl_setstavlan(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_node *ni;
 	struct ieee80211req_sta_vlan vlan;
 	int error;
 
 	if (ireq->i_len != sizeof(vlan))
 		return EINVAL;
 	error = copyin(ireq->i_data, &vlan, sizeof(vlan));
 	if (error != 0)
 		return error;
 	if (!IEEE80211_ADDR_EQ(vlan.sv_macaddr, zerobssid)) {
 		ni = ieee80211_find_vap_node(&vap->iv_ic->ic_sta, vap,
 		    vlan.sv_macaddr);
 		if (ni == NULL)
 			return ENOENT;
 	} else
 		ni = ieee80211_ref_node(vap->iv_bss);
 	ni->ni_vlan = vlan.sv_vlan;
 	ieee80211_free_node(ni);
 	return error;
 }
 
 /*
  * Return 1 when the vap's bss node has a real channel (not
  * IEEE80211_CHAN_ANYC) and that channel is an "any g" channel,
  * else 0.
  */
 static int
 isvap11g(const struct ieee80211vap *vap)
 {
 	const struct ieee80211_node *node = vap->iv_bss;
 
 	if (node->ni_chan == IEEE80211_CHAN_ANYC)
 		return (0);
 	return (IEEE80211_IS_CHAN_ANYG(node->ni_chan) ? 1 : 0);
 }
 
 /*
  * Return 1 when the vap's bss node has a real channel (not
  * IEEE80211_CHAN_ANYC) and that channel is an HT channel, else 0.
  */
 static int
 isvapht(const struct ieee80211vap *vap)
 {
 	const struct ieee80211_node *node = vap->iv_bss;
 
 	if (node->ni_chan == IEEE80211_CHAN_ANYC)
 		return (0);
 	return (IEEE80211_IS_CHAN_HT(node->ni_chan) ? 1 : 0);
 }
 
 /*
  * Placeholder ioctl set handler; registering it guarantees the
  * linker set is defined.  It handles nothing and always answers
  * ENOSYS.
  */
 static int
 dummy_ioctl_set(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	return (ENOSYS);
 }
 IEEE80211_IOCTL_SET(dummy, dummy_ioctl_set);
 
 /*
  * Offer a set request to each handler in the ieee80211_ioctl_setset
  * linker set in turn.  The first handler that answers with anything
  * other than ENOSYS decides the result; EINVAL is returned when
  * every handler answers ENOSYS.
  */
 static int
 ieee80211_ioctl_setdefault(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	ieee80211_ioctl_setfunc * const *handler;
 	int rc;
 
 	SET_FOREACH(handler, ieee80211_ioctl_setset) {
 		rc = (*handler)(vap, ireq);
 		if (rc == ENOSYS)
 			continue;	/* not claimed, try the next one */
 		return (rc);
 	}
 	return (EINVAL);
 }
 
 static int
 ieee80211_ioctl_set80211(struct ieee80211vap *vap, u_long cmd, struct ieee80211req *ireq)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int error;
 	const struct ieee80211_authenticator *auth;
 	uint8_t tmpkey[IEEE80211_KEYBUF_SIZE];
 	char tmpssid[IEEE80211_NWID_LEN];
 	uint8_t tmpbssid[IEEE80211_ADDR_LEN];
 	struct ieee80211_key *k;
 	u_int kid;
 	uint32_t flags;
 
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_SSID:
 		if (ireq->i_val != 0 ||
 		    ireq->i_len > IEEE80211_NWID_LEN)
 			return EINVAL;
 		error = copyin(ireq->i_data, tmpssid, ireq->i_len);
 		if (error)
 			break;
 		memset(vap->iv_des_ssid[0].ssid, 0, IEEE80211_NWID_LEN);
 		vap->iv_des_ssid[0].len = ireq->i_len;
 		memcpy(vap->iv_des_ssid[0].ssid, tmpssid, ireq->i_len);
 		vap->iv_des_nssid = (ireq->i_len > 0);
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_WEP:
 		switch (ireq->i_val) {
 		case IEEE80211_WEP_OFF:
 			vap->iv_flags &= ~IEEE80211_F_PRIVACY;
 			vap->iv_flags &= ~IEEE80211_F_DROPUNENC;
 			break;
 		case IEEE80211_WEP_ON:
 			vap->iv_flags |= IEEE80211_F_PRIVACY;
 			vap->iv_flags |= IEEE80211_F_DROPUNENC;
 			break;
 		case IEEE80211_WEP_MIXED:
 			vap->iv_flags |= IEEE80211_F_PRIVACY;
 			vap->iv_flags &= ~IEEE80211_F_DROPUNENC;
 			break;
 		}
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_WEPKEY:
 		kid = (u_int) ireq->i_val;
 		if (kid >= IEEE80211_WEP_NKID)
 			return EINVAL;
 		k = &vap->iv_nw_keys[kid];
 		if (ireq->i_len == 0) {
 			/* zero-len =>'s delete any existing key */
 			(void) ieee80211_crypto_delkey(vap, k);
 			break;
 		}
 		if (ireq->i_len > sizeof(tmpkey))
 			return EINVAL;
 		memset(tmpkey, 0, sizeof(tmpkey));
 		error = copyin(ireq->i_data, tmpkey, ireq->i_len);
 		if (error)
 			break;
 		ieee80211_key_update_begin(vap);
 		k->wk_keyix = kid;	/* NB: force fixed key id */
 		if (ieee80211_crypto_newkey(vap, IEEE80211_CIPHER_WEP,
 		    IEEE80211_KEY_XMIT | IEEE80211_KEY_RECV, k)) {
 			k->wk_keylen = ireq->i_len;
 			memcpy(k->wk_key, tmpkey, sizeof(tmpkey));
 			IEEE80211_ADDR_COPY(k->wk_macaddr, vap->iv_myaddr);
 			if  (!ieee80211_crypto_setkey(vap, k))
 				error = EINVAL;
 		} else
 			error = EINVAL;
 		ieee80211_key_update_end(vap);
 		break;
 	case IEEE80211_IOC_WEPTXKEY:
 		kid = (u_int) ireq->i_val;
 		if (kid >= IEEE80211_WEP_NKID &&
 		    (uint16_t) kid != IEEE80211_KEYIX_NONE)
 			return EINVAL;
 		/*
 		 * Firmware devices may need to be told about an explicit
 		 * key index here, versus just inferring it from the
 		 * key set / change.  Since we may also need to pause
 		 * things like transmit before the key is updated,
 		 * give the driver a chance to flush things by tying
 		 * into key update begin/end.
 		 */
 		ieee80211_key_update_begin(vap);
 		ieee80211_crypto_set_deftxkey(vap, kid);
 		ieee80211_key_update_end(vap);
 		break;
 	case IEEE80211_IOC_AUTHMODE:
 		switch (ireq->i_val) {
 		case IEEE80211_AUTH_WPA:
 		case IEEE80211_AUTH_8021X:	/* 802.1x */
 		case IEEE80211_AUTH_OPEN:	/* open */
 		case IEEE80211_AUTH_SHARED:	/* shared-key */
 		case IEEE80211_AUTH_AUTO:	/* auto */
 			auth = ieee80211_authenticator_get(ireq->i_val);
 			if (auth == NULL)
 				return EINVAL;
 			break;
 		default:
 			return EINVAL;
 		}
 		switch (ireq->i_val) {
 		case IEEE80211_AUTH_WPA:	/* WPA w/ 802.1x */
 			vap->iv_flags |= IEEE80211_F_PRIVACY;
 			ireq->i_val = IEEE80211_AUTH_8021X;
 			break;
 		case IEEE80211_AUTH_OPEN:	/* open */
 			vap->iv_flags &= ~(IEEE80211_F_WPA|IEEE80211_F_PRIVACY);
 			break;
 		case IEEE80211_AUTH_SHARED:	/* shared-key */
 		case IEEE80211_AUTH_8021X:	/* 802.1x */
 			vap->iv_flags &= ~IEEE80211_F_WPA;
 			/* both require a key so mark the PRIVACY capability */
 			vap->iv_flags |= IEEE80211_F_PRIVACY;
 			break;
 		case IEEE80211_AUTH_AUTO:	/* auto */
 			vap->iv_flags &= ~IEEE80211_F_WPA;
 			/* XXX PRIVACY handling? */
 			/* XXX what's the right way to do this? */
 			break;
 		}
 		/* NB: authenticator attach/detach happens on state change */
 		vap->iv_bss->ni_authmode = ireq->i_val;
 		/* XXX mixed/mode/usage? */
 		vap->iv_auth = auth;
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_CHANNEL:
 		error = ieee80211_ioctl_setchannel(vap, ireq);
 		break;
 	case IEEE80211_IOC_POWERSAVE:
 		switch (ireq->i_val) {
 		case IEEE80211_POWERSAVE_OFF:
 			if (vap->iv_flags & IEEE80211_F_PMGTON) {
 				ieee80211_syncflag(vap, -IEEE80211_F_PMGTON);
 				error = ERESTART;
 			}
 			break;
 		case IEEE80211_POWERSAVE_ON:
 			if ((vap->iv_caps & IEEE80211_C_PMGT) == 0)
 				error = EOPNOTSUPP;
 			else if ((vap->iv_flags & IEEE80211_F_PMGTON) == 0) {
 				ieee80211_syncflag(vap, IEEE80211_F_PMGTON);
 				error = ERESTART;
 			}
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	case IEEE80211_IOC_POWERSAVESLEEP:
 		if (ireq->i_val < 0)
 			return EINVAL;
 		ic->ic_lintval = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_RTSTHRESHOLD:
 		if (!(IEEE80211_RTS_MIN <= ireq->i_val &&
 		      ireq->i_val <= IEEE80211_RTS_MAX))
 			return EINVAL;
 		vap->iv_rtsthreshold = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_PROTMODE:
 		if (ireq->i_val > IEEE80211_PROT_RTSCTS)
 			return EINVAL;
 		vap->iv_protmode = (enum ieee80211_protmode)ireq->i_val;
 		/* NB: if not operating in 11g this can wait */
 		if (ic->ic_bsschan != IEEE80211_CHAN_ANYC &&
 		    IEEE80211_IS_CHAN_ANYG(ic->ic_bsschan))
 			error = ERESTART;
 		/* driver callback for protection mode update */
 		ieee80211_vap_update_erp_protmode(vap);
 		break;
 	case IEEE80211_IOC_TXPOWER:
 		if ((ic->ic_caps & IEEE80211_C_TXPMGT) == 0)
 			return EOPNOTSUPP;
 		if (!(IEEE80211_TXPOWER_MIN <= ireq->i_val &&
 		      ireq->i_val <= IEEE80211_TXPOWER_MAX))
 			return EINVAL;
 		ic->ic_txpowlimit = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_ROAMING:
 		if (!(IEEE80211_ROAMING_DEVICE <= ireq->i_val &&
 		    ireq->i_val <= IEEE80211_ROAMING_MANUAL))
 			return EINVAL;
 		vap->iv_roaming = (enum ieee80211_roamingmode)ireq->i_val;
 		/* XXXX reset? */
 		break;
 	case IEEE80211_IOC_PRIVACY:
 		if (ireq->i_val) {
 			/* XXX check for key state? */
 			vap->iv_flags |= IEEE80211_F_PRIVACY;
 		} else
 			vap->iv_flags &= ~IEEE80211_F_PRIVACY;
 		/* XXX ERESTART? */
 		break;
 	case IEEE80211_IOC_DROPUNENCRYPTED:
 		if (ireq->i_val)
 			vap->iv_flags |= IEEE80211_F_DROPUNENC;
 		else
 			vap->iv_flags &= ~IEEE80211_F_DROPUNENC;
 		/* XXX ERESTART? */
 		break;
 	case IEEE80211_IOC_WPAKEY:
 		error = ieee80211_ioctl_setkey(vap, ireq);
 		break;
 	case IEEE80211_IOC_DELKEY:
 		error = ieee80211_ioctl_delkey(vap, ireq);
 		break;
 	case IEEE80211_IOC_MLME:
 		error = ieee80211_ioctl_setmlme(vap, ireq);
 		break;
 	case IEEE80211_IOC_COUNTERMEASURES:
 		if (ireq->i_val) {
 			if ((vap->iv_flags & IEEE80211_F_WPA) == 0)
 				return EOPNOTSUPP;
 			vap->iv_flags |= IEEE80211_F_COUNTERM;
 		} else
 			vap->iv_flags &= ~IEEE80211_F_COUNTERM;
 		/* XXX ERESTART? */
 		break;
 	case IEEE80211_IOC_WPA:
 		if (ireq->i_val > 3)
 			return EINVAL;
 		/* XXX verify ciphers available */
 		flags = vap->iv_flags & ~IEEE80211_F_WPA;
 		switch (ireq->i_val) {
 		case 0:
 			/* wpa_supplicant calls this to clear the WPA config */
 			break;
 		case 1:
 			if (!(vap->iv_caps & IEEE80211_C_WPA1))
 				return EOPNOTSUPP;
 			flags |= IEEE80211_F_WPA1;
 			break;
 		case 2:
 			if (!(vap->iv_caps & IEEE80211_C_WPA2))
 				return EOPNOTSUPP;
 			flags |= IEEE80211_F_WPA2;
 			break;
 		case 3:
 			if ((vap->iv_caps & IEEE80211_C_WPA) != IEEE80211_C_WPA)
 				return EOPNOTSUPP;
 			flags |= IEEE80211_F_WPA1 | IEEE80211_F_WPA2;
 			break;
 		default:	/*  Can't set any -> error */
 			return EOPNOTSUPP;
 		}
 		vap->iv_flags = flags;
 		error = ERESTART;	/* NB: can change beacon frame */
 		break;
 	case IEEE80211_IOC_WME:
 		if (ireq->i_val) {
 			if ((vap->iv_caps & IEEE80211_C_WME) == 0)
 				return EOPNOTSUPP;
 			ieee80211_syncflag(vap, IEEE80211_F_WME);
 		} else
 			ieee80211_syncflag(vap, -IEEE80211_F_WME);
 		error = ERESTART;	/* NB: can change beacon frame */
 		break;
 	case IEEE80211_IOC_HIDESSID:
 		if (ireq->i_val)
 			vap->iv_flags |= IEEE80211_F_HIDESSID;
 		else
 			vap->iv_flags &= ~IEEE80211_F_HIDESSID;
 		error = ERESTART;		/* XXX ENETRESET? */
 		break;
 	case IEEE80211_IOC_APBRIDGE:
 		if (ireq->i_val == 0)
 			vap->iv_flags |= IEEE80211_F_NOBRIDGE;
 		else
 			vap->iv_flags &= ~IEEE80211_F_NOBRIDGE;
 		break;
 	case IEEE80211_IOC_BSSID:
 		if (ireq->i_len != sizeof(tmpbssid))
 			return EINVAL;
 		error = copyin(ireq->i_data, tmpbssid, ireq->i_len);
 		if (error)
 			break;
 		IEEE80211_ADDR_COPY(vap->iv_des_bssid, tmpbssid);
 		if (IEEE80211_ADDR_EQ(vap->iv_des_bssid, zerobssid))
 			vap->iv_flags &= ~IEEE80211_F_DESBSSID;
 		else
 			vap->iv_flags |= IEEE80211_F_DESBSSID;
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_CHANLIST:
 		error = ieee80211_ioctl_setchanlist(vap, ireq);
 		break;
 #define	OLD_IEEE80211_IOC_SCAN_REQ	23
 #ifdef OLD_IEEE80211_IOC_SCAN_REQ
 	case OLD_IEEE80211_IOC_SCAN_REQ:
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 			"%s: active scan request\n", __func__);
 		/*
 		 * If we are in INIT state then the driver has never
 		 * had a chance to setup hardware state to do a scan;
 		 * use the state machine to get us up the SCAN state.
 		 * Otherwise just invoke the scan machinery to start
 		 * a one-time scan.
 		 */
 		if (vap->iv_state == IEEE80211_S_INIT)
 			ieee80211_new_state(vap, IEEE80211_S_SCAN, 0);
 		else
 			(void) ieee80211_start_scan(vap,
 				IEEE80211_SCAN_ACTIVE |
 				IEEE80211_SCAN_NOPICK |
 				IEEE80211_SCAN_ONCE,
 				IEEE80211_SCAN_FOREVER, 0, 0,
 				/* XXX use ioctl params */
 				vap->iv_des_nssid, vap->iv_des_ssid);
 		break;
 #endif /* OLD_IEEE80211_IOC_SCAN_REQ */
 	case IEEE80211_IOC_SCAN_REQ:
 		error = ieee80211_ioctl_scanreq(vap, ireq);
 		break;
 	case IEEE80211_IOC_SCAN_CANCEL:
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: cancel scan\n", __func__);
 		ieee80211_cancel_scan(vap);
 		break;
 	case IEEE80211_IOC_HTCONF:
 		if (ireq->i_val & 1)
 			ieee80211_syncflag_ht(vap, IEEE80211_FHT_HT);
 		else
 			ieee80211_syncflag_ht(vap, -IEEE80211_FHT_HT);
 		if (ireq->i_val & 2)
 			ieee80211_syncflag_ht(vap, IEEE80211_FHT_USEHT40);
 		else
 			ieee80211_syncflag_ht(vap, -IEEE80211_FHT_USEHT40);
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_ADDMAC:
 	case IEEE80211_IOC_DELMAC:
 		error = ieee80211_ioctl_macmac(vap, ireq);
 		break;
 	case IEEE80211_IOC_MACCMD:
 		error = ieee80211_ioctl_setmaccmd(vap, ireq);
 		break;
 	case IEEE80211_IOC_STA_STATS:
 		error = ieee80211_ioctl_setstastats(vap, ireq);
 		break;
 	case IEEE80211_IOC_STA_TXPOW:
 		error = ieee80211_ioctl_setstatxpow(vap, ireq);
 		break;
 	case IEEE80211_IOC_WME_CWMIN:		/* WME: CWmin */
 	case IEEE80211_IOC_WME_CWMAX:		/* WME: CWmax */
 	case IEEE80211_IOC_WME_AIFS:		/* WME: AIFS */
 	case IEEE80211_IOC_WME_TXOPLIMIT:	/* WME: txops limit */
 	case IEEE80211_IOC_WME_ACM:		/* WME: ACM (bss only) */
 	case IEEE80211_IOC_WME_ACKPOLICY:	/* WME: ACK policy (!bss only) */
 		error = ieee80211_ioctl_setwmeparam(vap, ireq);
 		break;
 	case IEEE80211_IOC_DTIM_PERIOD:
 		if (vap->iv_opmode != IEEE80211_M_HOSTAP &&
 		    vap->iv_opmode != IEEE80211_M_MBSS &&
 		    vap->iv_opmode != IEEE80211_M_IBSS)
 			return EINVAL;
 		if (IEEE80211_DTIM_MIN <= ireq->i_val &&
 		    ireq->i_val <= IEEE80211_DTIM_MAX) {
 			vap->iv_dtim_period = ireq->i_val;
 			error = ENETRESET;		/* requires restart */
 		} else
 			error = EINVAL;
 		break;
 	case IEEE80211_IOC_BEACON_INTERVAL:
 		if (vap->iv_opmode != IEEE80211_M_HOSTAP &&
 		    vap->iv_opmode != IEEE80211_M_MBSS &&
 		    vap->iv_opmode != IEEE80211_M_IBSS)
 			return EINVAL;
 		if (IEEE80211_BINTVAL_MIN <= ireq->i_val &&
 		    ireq->i_val <= IEEE80211_BINTVAL_MAX) {
 			ic->ic_bintval = ireq->i_val;
 			error = ENETRESET;		/* requires restart */
 		} else
 			error = EINVAL;
 		break;
 	case IEEE80211_IOC_PUREG:
 		if (ireq->i_val)
 			vap->iv_flags |= IEEE80211_F_PUREG;
 		else
 			vap->iv_flags &= ~IEEE80211_F_PUREG;
 		/* NB: reset only if we're operating on an 11g channel */
 		if (isvap11g(vap))
 			error = ENETRESET;
 		break;
 	case IEEE80211_IOC_QUIET:
 		vap->iv_quiet= ireq->i_val;
 		break;
 	case IEEE80211_IOC_QUIET_COUNT:
 		vap->iv_quiet_count=ireq->i_val;
 		break;
 	case IEEE80211_IOC_QUIET_PERIOD:
 		vap->iv_quiet_period=ireq->i_val;
 		break;
 	case IEEE80211_IOC_QUIET_OFFSET:
 		vap->iv_quiet_offset=ireq->i_val;
 		break;
 	case IEEE80211_IOC_QUIET_DUR:
 		if(ireq->i_val < vap->iv_bss->ni_intval)
 			vap->iv_quiet_duration = ireq->i_val;
 		else
 			error = EINVAL;
 		break;
 	case IEEE80211_IOC_BGSCAN:
 		if (ireq->i_val) {
 			if ((vap->iv_caps & IEEE80211_C_BGSCAN) == 0)
 				return EOPNOTSUPP;
 			vap->iv_flags |= IEEE80211_F_BGSCAN;
 		} else
 			vap->iv_flags &= ~IEEE80211_F_BGSCAN;
 		break;
 	case IEEE80211_IOC_BGSCAN_IDLE:
 		if (ireq->i_val >= IEEE80211_BGSCAN_IDLE_MIN)
 			vap->iv_bgscanidle = ireq->i_val*hz/1000;
 		else
 			error = EINVAL;
 		break;
 	case IEEE80211_IOC_BGSCAN_INTERVAL:
 		if (ireq->i_val >= IEEE80211_BGSCAN_INTVAL_MIN)
 			vap->iv_bgscanintvl = ireq->i_val*hz;
 		else
 			error = EINVAL;
 		break;
 	case IEEE80211_IOC_SCANVALID:
 		if (ireq->i_val >= IEEE80211_SCAN_VALID_MIN)
 			vap->iv_scanvalid = ireq->i_val*hz;
 		else
 			error = EINVAL;
 		break;
 	case IEEE80211_IOC_FRAGTHRESHOLD:
 		if ((vap->iv_caps & IEEE80211_C_TXFRAG) == 0 &&
 		    ireq->i_val != IEEE80211_FRAG_MAX)
 			return EOPNOTSUPP;
 		if (!(IEEE80211_FRAG_MIN <= ireq->i_val &&
 		      ireq->i_val <= IEEE80211_FRAG_MAX))
 			return EINVAL;
 		vap->iv_fragthreshold = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_BURST:
 		if (ireq->i_val) {
 			if ((vap->iv_caps & IEEE80211_C_BURST) == 0)
 				return EOPNOTSUPP;
 			ieee80211_syncflag(vap, IEEE80211_F_BURST);
 		} else
 			ieee80211_syncflag(vap, -IEEE80211_F_BURST);
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_BMISSTHRESHOLD:
 		if (!(IEEE80211_HWBMISS_MIN <= ireq->i_val &&
 		      ireq->i_val <= IEEE80211_HWBMISS_MAX))
 			return EINVAL;
 		vap->iv_bmissthreshold = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_CURCHAN:
 		error = ieee80211_ioctl_setcurchan(vap, ireq);
 		break;
 	case IEEE80211_IOC_SHORTGI:
 		if (ireq->i_val) {
 #define	IEEE80211_HTCAP_SHORTGI \
 	(IEEE80211_HTCAP_SHORTGI20 | IEEE80211_HTCAP_SHORTGI40)
 			if (((ireq->i_val ^ vap->iv_htcaps) & IEEE80211_HTCAP_SHORTGI) != 0)
 				return EINVAL;
 			if (ireq->i_val & IEEE80211_HTCAP_SHORTGI20)
 				vap->iv_flags_ht |= IEEE80211_FHT_SHORTGI20;
 			if (ireq->i_val & IEEE80211_HTCAP_SHORTGI40)
 				vap->iv_flags_ht |= IEEE80211_FHT_SHORTGI40;
 #undef IEEE80211_HTCAP_SHORTGI
 		} else
 			vap->iv_flags_ht &=
 			    ~(IEEE80211_FHT_SHORTGI20 | IEEE80211_FHT_SHORTGI40);
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_AMPDU:
 		if (ireq->i_val && (vap->iv_htcaps & IEEE80211_HTC_AMPDU) == 0)
 			return EINVAL;
 		if (ireq->i_val & 1)
 			vap->iv_flags_ht |= IEEE80211_FHT_AMPDU_TX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_AMPDU_TX;
 		if (ireq->i_val & 2)
 			vap->iv_flags_ht |= IEEE80211_FHT_AMPDU_RX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_AMPDU_RX;
 		/* NB: reset only if we're operating on an 11n channel */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_AMPDU_LIMIT:
 		/* XXX TODO: figure out ampdu_limit versus ampdu_rxmax */
 		if (!(IEEE80211_HTCAP_MAXRXAMPDU_8K <= ireq->i_val &&
 		      ireq->i_val <= IEEE80211_HTCAP_MAXRXAMPDU_64K))
 			return EINVAL;
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP)
 			vap->iv_ampdu_rxmax = ireq->i_val;
 		else
 			vap->iv_ampdu_limit = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_AMPDU_DENSITY:
 		if (!(IEEE80211_HTCAP_MPDUDENSITY_NA <= ireq->i_val &&
 		      ireq->i_val <= IEEE80211_HTCAP_MPDUDENSITY_16))
 			return EINVAL;
 		vap->iv_ampdu_density = ireq->i_val;
 		error = ERESTART;
 		break;
 	case IEEE80211_IOC_AMSDU:
 		if (ireq->i_val && (vap->iv_htcaps & IEEE80211_HTC_AMSDU) == 0)
 			return EINVAL;
 		if (ireq->i_val & 1)
 			vap->iv_flags_ht |= IEEE80211_FHT_AMSDU_TX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_AMSDU_TX;
 		if (ireq->i_val & 2)
 			vap->iv_flags_ht |= IEEE80211_FHT_AMSDU_RX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_AMSDU_RX;
 		/* NB: reset only if we're operating on an 11n channel */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_AMSDU_LIMIT:
 		/* XXX validate */
 		vap->iv_amsdu_limit = ireq->i_val;	/* XXX truncation? */
 		break;
 	case IEEE80211_IOC_PUREN:
 		if (ireq->i_val) {
 			if ((vap->iv_flags_ht & IEEE80211_FHT_HT) == 0)
 				return EINVAL;
 			vap->iv_flags_ht |= IEEE80211_FHT_PUREN;
 		} else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_PUREN;
 		/* NB: reset only if we're operating on an 11n channel */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_DOTH:
 		if (ireq->i_val) {
 #if 0
 			/* XXX no capability */
 			if ((vap->iv_caps & IEEE80211_C_DOTH) == 0)
 				return EOPNOTSUPP;
 #endif
 			vap->iv_flags |= IEEE80211_F_DOTH;
 		} else
 			vap->iv_flags &= ~IEEE80211_F_DOTH;
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_REGDOMAIN:
 		error = ieee80211_ioctl_setregdomain(vap, ireq);
 		break;
 	case IEEE80211_IOC_ROAM:
 		error = ieee80211_ioctl_setroam(vap, ireq);
 		break;
 	case IEEE80211_IOC_TXPARAMS:
 		error = ieee80211_ioctl_settxparams(vap, ireq);
 		break;
 	case IEEE80211_IOC_HTCOMPAT:
 		if (ireq->i_val) {
 			if ((vap->iv_flags_ht & IEEE80211_FHT_HT) == 0)
 				return EOPNOTSUPP;
 			vap->iv_flags_ht |= IEEE80211_FHT_HTCOMPAT;
 		} else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_HTCOMPAT;
 		/* NB: reset only if we're operating on an 11n channel */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_DWDS:
 		if (ireq->i_val) {
 			/* NB: DWDS only makes sense for WDS-capable devices */
 			if ((ic->ic_caps & IEEE80211_C_WDS) == 0)
 				return EOPNOTSUPP;
 			/* NB: DWDS is used only with ap+sta vaps */
 			if (vap->iv_opmode != IEEE80211_M_HOSTAP &&
 			    vap->iv_opmode != IEEE80211_M_STA)
 				return EINVAL;
 			vap->iv_flags |= IEEE80211_F_DWDS;
 			if (vap->iv_opmode == IEEE80211_M_STA)
 				vap->iv_flags_ext |= IEEE80211_FEXT_4ADDR;
 		} else {
 			vap->iv_flags &= ~IEEE80211_F_DWDS;
 			if (vap->iv_opmode == IEEE80211_M_STA)
 				vap->iv_flags_ext &= ~IEEE80211_FEXT_4ADDR;
 		}
 		break;
 	case IEEE80211_IOC_INACTIVITY:
 		if (ireq->i_val)
 			vap->iv_flags_ext |= IEEE80211_FEXT_INACT;
 		else
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_INACT;
 		break;
 	case IEEE80211_IOC_APPIE:
 		error = ieee80211_ioctl_setappie(vap, ireq);
 		break;
 	case IEEE80211_IOC_WPS:
 		if (ireq->i_val) {
 			if ((vap->iv_caps & IEEE80211_C_WPA) == 0)
 				return EOPNOTSUPP;
 			vap->iv_flags_ext |= IEEE80211_FEXT_WPS;
 		} else
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_WPS;
 		break;
 	case IEEE80211_IOC_TSN:
 		if (ireq->i_val) {
 			if ((vap->iv_caps & IEEE80211_C_WPA) == 0)
 				return EOPNOTSUPP;
 			vap->iv_flags_ext |= IEEE80211_FEXT_TSN;
 		} else
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_TSN;
 		break;
 	case IEEE80211_IOC_CHANSWITCH:
 		error = ieee80211_ioctl_chanswitch(vap, ireq);
 		break;
 	case IEEE80211_IOC_DFS:
 		if (ireq->i_val) {
 			if ((vap->iv_caps & IEEE80211_C_DFS) == 0)
 				return EOPNOTSUPP;
 			/* NB: DFS requires 11h support */
 			if ((vap->iv_flags & IEEE80211_F_DOTH) == 0)
 				return EINVAL;
 			vap->iv_flags_ext |= IEEE80211_FEXT_DFS;
 		} else
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_DFS;
 		break;
 	case IEEE80211_IOC_DOTD:
 		if (ireq->i_val)
 			vap->iv_flags_ext |= IEEE80211_FEXT_DOTD;
 		else
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_DOTD;
 		if (vap->iv_opmode == IEEE80211_M_STA)
 			error = ENETRESET;
 		break;
 	case IEEE80211_IOC_HTPROTMODE:
 		if (ireq->i_val > IEEE80211_PROT_RTSCTS)
 			return EINVAL;
 		vap->iv_htprotmode = ireq->i_val ?
 		    IEEE80211_PROT_RTSCTS : IEEE80211_PROT_NONE;
 		/* NB: if not operating in 11n this can wait */
 		if (isvapht(vap))
 			error = ERESTART;
 		/* Notify driver layer of HT protmode changes */
 		ieee80211_vap_update_ht_protmode(vap);
 		break;
 	case IEEE80211_IOC_STA_VLAN:
 		error = ieee80211_ioctl_setstavlan(vap, ireq);
 		break;
 	case IEEE80211_IOC_SMPS:
 		if ((ireq->i_val &~ IEEE80211_HTCAP_SMPS) != 0 ||
 		    ireq->i_val == 0x0008)	/* value of 2 is reserved */
 			return EINVAL;
 		if (ireq->i_val != IEEE80211_HTCAP_SMPS_OFF &&
 		    (vap->iv_htcaps & IEEE80211_HTC_SMPS) == 0)
 			return EOPNOTSUPP;
 		vap->iv_htcaps = (vap->iv_htcaps &~ IEEE80211_HTCAP_SMPS) |
 			ireq->i_val;
 		/* NB: if not operating in 11n this can wait */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_RIFS:
 		if (ireq->i_val != 0) {
 			if ((vap->iv_htcaps & IEEE80211_HTC_RIFS) == 0)
 				return EOPNOTSUPP;
 			vap->iv_flags_ht |= IEEE80211_FHT_RIFS;
 		} else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_RIFS;
 		/* NB: if not operating in 11n this can wait */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_STBC:
 		/* Check if we can do STBC TX/RX before changing the setting */
 		if ((ireq->i_val & 1) &&
 		    ((vap->iv_htcaps & IEEE80211_HTCAP_TXSTBC) == 0))
 			return EOPNOTSUPP;
 		if ((ireq->i_val & 2) &&
 		    ((vap->iv_htcaps & IEEE80211_HTCAP_RXSTBC) == 0))
 			return EOPNOTSUPP;
 
 		/* TX */
 		if (ireq->i_val & 1)
 			vap->iv_flags_ht |= IEEE80211_FHT_STBC_TX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_STBC_TX;
 
 		/* RX */
 		if (ireq->i_val & 2)
 			vap->iv_flags_ht |= IEEE80211_FHT_STBC_RX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_STBC_RX;
 
 		/* NB: reset only if we're operating on an 11n channel */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_LDPC:
 		/* Check if we can do LDPC TX/RX before changing the setting */
 		if ((ireq->i_val & 1) &&
 		    (vap->iv_htcaps & IEEE80211_HTC_TXLDPC) == 0)
 			return EOPNOTSUPP;
 		if ((ireq->i_val & 2) &&
 		    (vap->iv_htcaps & IEEE80211_HTCAP_LDPC) == 0)
 			return EOPNOTSUPP;
 
 		/* TX */
 		if (ireq->i_val & 1)
 			vap->iv_flags_ht |= IEEE80211_FHT_LDPC_TX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_LDPC_TX;
 
 		/* RX */
 		if (ireq->i_val & 2)
 			vap->iv_flags_ht |= IEEE80211_FHT_LDPC_RX;
 		else
 			vap->iv_flags_ht &= ~IEEE80211_FHT_LDPC_RX;
 
 		/* NB: reset only if we're operating on an 11n channel */
 		if (isvapht(vap))
 			error = ERESTART;
 		break;
 	case IEEE80211_IOC_UAPSD:
 		if ((vap->iv_caps & IEEE80211_C_UAPSD) == 0)
 			return EOPNOTSUPP;
 		if (ireq->i_val == 0)
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_UAPSD;
 		else if (ireq->i_val == 1)
 			vap->iv_flags_ext |= IEEE80211_FEXT_UAPSD;
 		else
 			return EINVAL;
 		break;
 
 	/* VHT */
 	case IEEE80211_IOC_VHTCONF:
 		if (ireq->i_val & IEEE80211_FVHT_VHT)
 			ieee80211_syncflag_vht(vap, IEEE80211_FVHT_VHT);
 		else
 			ieee80211_syncflag_vht(vap, -IEEE80211_FVHT_VHT);
 
 		if (ireq->i_val & IEEE80211_FVHT_USEVHT40)
 			ieee80211_syncflag_vht(vap, IEEE80211_FVHT_USEVHT40);
 		else
 			ieee80211_syncflag_vht(vap, -IEEE80211_FVHT_USEVHT40);
 
 		if (ireq->i_val & IEEE80211_FVHT_USEVHT80)
 			ieee80211_syncflag_vht(vap, IEEE80211_FVHT_USEVHT80);
 		else
 			ieee80211_syncflag_vht(vap, -IEEE80211_FVHT_USEVHT80);
 
 		if (ireq->i_val & IEEE80211_FVHT_USEVHT160)
 			ieee80211_syncflag_vht(vap, IEEE80211_FVHT_USEVHT160);
 		else
 			ieee80211_syncflag_vht(vap, -IEEE80211_FVHT_USEVHT160);
 
 		if (ireq->i_val & IEEE80211_FVHT_USEVHT80P80)
 			ieee80211_syncflag_vht(vap, IEEE80211_FVHT_USEVHT80P80);
 		else
 			ieee80211_syncflag_vht(vap, -IEEE80211_FVHT_USEVHT80P80);
 
 		error = ENETRESET;
 		break;
 
 	default:
 		error = ieee80211_ioctl_setdefault(vap, ireq);
 		break;
 	}
 	/*
 	 * The convention is that ENETRESET means an operation
 	 * requires a complete re-initialization of the device (e.g.
 	 * changing something that affects the association state).
 	 * ERESTART means the request may be handled with only a
 	 * reload of the hardware state.  We hand ERESTART requests
 	 * to the iv_reset callback so the driver can decide.  If
 	 * a device does not fillin iv_reset then it defaults to one
 	 * that returns ENETRESET.  Otherwise a driver may return
 	 * ENETRESET (in which case a full reset will be done) or
 	 * 0 to mean there's no need to do anything (e.g. when the
 	 * change has no effect on the driver/device).
 	 */
 	if (error == ERESTART)
 		error = IFNET_IS_UP_RUNNING(vap->iv_ifp) ?
 		    vap->iv_reset(vap, ireq->i_type) : 0;
 	if (error == ENETRESET) {
 		/* XXX need to re-think AUTO handling */
 		if (IS_UP_AUTO(vap))
 			ieee80211_init(vap);
 		error = 0;
 	}
 	return error;
 }
 
 /*
  * ioctl entry point installed on a vap's ifnet.
  *
  * Dispatches on cmd: interface flag changes (SIOCSIFFLAGS), multicast
  * list updates, media get/set, the net80211 get/set/stats requests,
  * MTU and address assignment are handled here; anything else is
  * offered to the driver's ic_ioctl method (when set) and then to
  * ether_ioctl().
  *
  * Every command except SIOCSIFMTU and SIOCG80211STATS is bracketed by
  * ieee80211_com_vincref()/ieee80211_com_vdecref() on the vap; if the
  * vincref call fails its error is returned and nothing else is done.
  *
  * Returns 0 on success or an errno value.
  */
 int
 ieee80211_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ieee80211vap *vap = ifp->if_softc;
 	struct ieee80211com *ic = vap->iv_ic;
 	int error = 0, wait = 0, ic_used;
 	struct ifreq *ifr;
 	struct ifaddr *ifa;			/* XXX */
 
 	/* NB: these two commands are serviced without the vincref/vdecref pair */
 	ic_used = (cmd != SIOCSIFMTU && cmd != SIOCG80211STATS);
 	if (ic_used && (error = ieee80211_com_vincref(vap)) != 0)
 		return (error);
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 		IEEE80211_LOCK(ic);
 		/* iv_ifflags shadows if_flags; act only on bits that differ */
 		if ((ifp->if_flags ^ vap->iv_ifflags) & IFF_PROMISC) {
 			/*
 			 * Enable promiscuous mode when:
 			 * 1. Interface is not a member of bridge, or
 			 * 2. Requested by user, or
 			 * 3. In monitor (or adhoc-demo) mode.
 			 */
 			if (ifp->if_bridge == NULL ||
 			    (ifp->if_flags & IFF_PPROMISC) != 0 ||
 			    vap->iv_opmode == IEEE80211_M_MONITOR ||
 			    (vap->iv_opmode == IEEE80211_M_AHDEMO &&
 			    (vap->iv_caps & IEEE80211_C_TDMA) == 0)) {
 				ieee80211_promisc(vap,
 				    ifp->if_flags & IFF_PROMISC);
 				vap->iv_ifflags ^= IFF_PROMISC;
 			}
 		}
 		if ((ifp->if_flags ^ vap->iv_ifflags) & IFF_ALLMULTI) {
 			ieee80211_allmulti(vap, ifp->if_flags & IFF_ALLMULTI);
 			vap->iv_ifflags ^= IFF_ALLMULTI;
 		}
 		if (ifp->if_flags & IFF_UP) {
 			/*
 			 * Bring ourself up unless we're already operational.
 			 * If we're the first vap and the parent is not up
 			 * then it will automatically be brought up as a
 			 * side-effect of bringing ourself up.
 			 */
 			if (vap->iv_state == IEEE80211_S_INIT) {
 				if (ic->ic_nrunning == 0)
 					wait = 1;
 				ieee80211_start_locked(vap);
 			}
 		} else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 			/*
 			 * Stop ourself.  If we are the last vap to be
 			 * marked down the parent will also be taken down.
 			 */
 			if (ic->ic_nrunning == 1)
 				wait = 1;
 			ieee80211_stop_locked(vap);
 		}
 		IEEE80211_UNLOCK(ic);
 		/* Wait for parent ioctl handler if it was queued */
 		if (wait) {
 			struct epoch_tracker et;
 
 			ieee80211_waitfor_parent(ic);
 
 			/*
 			 * Check if the MAC address was changed
 			 * via SIOCSIFLLADDR ioctl.
 			 *
 			 * NB: device may be detached during initialization;
 			 * use if_ioctl for existence check.
 			 */
 			NET_EPOCH_ENTER(et);
 			if (ifp->if_ioctl == ieee80211_ioctl &&
 			    (ifp->if_flags & IFF_UP) == 0 &&
 			    !IEEE80211_ADDR_EQ(vap->iv_myaddr, IF_LLADDR(ifp)))
 				IEEE80211_ADDR_COPY(vap->iv_myaddr,
 				    IF_LLADDR(ifp));
 			NET_EPOCH_EXIT(et);
 		}
 		break;
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* multicast list changes are pushed via ic_mcast_task */
 		ieee80211_runtask(ic, &ic->ic_mcast_task);
 		break;
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		ifr = (struct ifreq *)data;
 		error = ifmedia_ioctl(ifp, ifr, &vap->iv_media, cmd);
 		break;
 	case SIOCG80211:
 		error = ieee80211_ioctl_get80211(vap, cmd,
 				(struct ieee80211req *) data);
 		break;
 	case SIOCS80211:
 		/* set requests must pass the privilege check first */
 		error = ieee80211_priv_check_vap_manage(cmd, vap, ifp);
 		if (error == 0)
 			error = ieee80211_ioctl_set80211(vap, cmd,
 					(struct ieee80211req *) data);
 		break;
 	case SIOCG80211STATS:
 		ifr = (struct ifreq *)data;
 		/*
 		 * Propagate a copyout failure (e.g. a bad user pointer)
 		 * instead of reporting success with nothing delivered.
 		 */
 		error = copyout(&vap->iv_stats, ifr_data_get_ptr(ifr),
 		    sizeof (vap->iv_stats));
 		break;
 	case SIOCSIFMTU:
 		ifr = (struct ifreq *)data;
 		if (!(IEEE80211_MTU_MIN <= ifr->ifr_mtu &&
 		    ifr->ifr_mtu <= IEEE80211_MTU_MAX))
 			error = EINVAL;
 		else
 			ifp->if_mtu = ifr->ifr_mtu;
 		break;
 	case SIOCSIFADDR:
 		/*
 		 * XXX Handle this directly so we can suppress if_init calls.
 		 * XXX This should be done in ether_ioctl but for the moment
 		 * XXX there are too many other parts of the system that
 		 * XXX set IFF_UP and so suppress if_init being called when
 		 * XXX it should be.
 		 */
 		ifa = (struct ifaddr *) data;
 		switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if ((ifp->if_flags & IFF_UP) == 0) {
 				ifp->if_flags |= IFF_UP;
 				ifp->if_init(ifp->if_softc);
 			}
 			arp_ifinit(ifp, ifa);
 			break;
 #endif
 		default:
 			if ((ifp->if_flags & IFF_UP) == 0) {
 				ifp->if_flags |= IFF_UP;
 				ifp->if_init(ifp->if_softc);
 			}
 			break;
 		}
 		break;
 	case SIOCSIFLLADDR:
 		/*
 		 * NOTE(review): a passing privilege check ends the request
 		 * here with success; only a failing check falls through,
 		 * and its error is then overwritten by the driver or
 		 * ether_ioctl result below.  Confirm this is intended.
 		 */
 		error = ieee80211_priv_check_vap_setmac(cmd, vap, ifp);
 		if (error == 0)
 			break;
 		/* Fallthrough */
 	default:
 		/*
 		 * Pass unknown ioctls first to the driver, and if it
 		 * returns ENOTTY, then to the generic Ethernet handler.
 		 */
 		if (ic->ic_ioctl != NULL &&
 		    (error = ic->ic_ioctl(ic, cmd, data)) != ENOTTY)
 			break;
 		error = ether_ioctl(ifp, cmd, data);
 		break;
 	}
 
 	/* balance the vincref taken on entry */
 	if (ic_used)
 		ieee80211_com_vdecref(vap);
 
 	return (error);
 }
diff --git a/sys/net80211/ieee80211_mesh.c b/sys/net80211/ieee80211_mesh.c
index f4f0d79b3ac3..100a0a385f84 100644
--- a/sys/net80211/ieee80211_mesh.c
+++ b/sys/net80211/ieee80211_mesh.c
@@ -1,3616 +1,3617 @@
 /*- 
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 The FreeBSD Foundation 
  * 
  * This software was developed by Rui Paulo under sponsorship from the
  * FreeBSD Foundation. 
  *  
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions 
  * are met: 
  * 1. Redistributions of source code must retain the above copyright 
  *    notice, this list of conditions and the following disclaimer. 
  * 2. Redistributions in binary form must reproduce the above copyright 
  *    notice, this list of conditions and the following disclaimer in the 
  *    documentation and/or other materials provided with the distribution. 
  * 
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
  * SUCH DAMAGE. 
  */ 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11s Mesh Point (MBSS) support.
  *
  * Based on March 2009, D3.0 802.11s draft spec.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_action.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #include <net80211/ieee80211_input.h>
 #include <net80211/ieee80211_mesh.h>
 
 static void	mesh_rt_flush_invalid(struct ieee80211vap *);
 static int	mesh_select_proto_path(struct ieee80211vap *, const char *);
 static int	mesh_select_proto_metric(struct ieee80211vap *, const char *);
 static void	mesh_vattach(struct ieee80211vap *);
 static int	mesh_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static void	mesh_rt_cleanup_cb(void *);
 static void	mesh_gatemode_setup(struct ieee80211vap *);
 static void	mesh_gatemode_cb(void *);
 static void	mesh_linkchange(struct ieee80211_node *,
 		    enum ieee80211_mesh_mlstate);
 static void	mesh_checkid(void *, struct ieee80211_node *);
 static uint32_t	mesh_generateid(struct ieee80211vap *);
 static int	mesh_checkpseq(struct ieee80211vap *,
 		    const uint8_t [IEEE80211_ADDR_LEN], uint32_t);
 static void	mesh_transmit_to_gate(struct ieee80211vap *, struct mbuf *,
 		    struct ieee80211_mesh_route *);
 static void	mesh_forward(struct ieee80211vap *, struct mbuf *,
 		    const struct ieee80211_meshcntl *);
 static int	mesh_input(struct ieee80211_node *, struct mbuf *,
 		    const struct ieee80211_rx_stats *rxs, int, int);
 static void	mesh_recv_mgmt(struct ieee80211_node *, struct mbuf *, int,
 		    const struct ieee80211_rx_stats *rxs, int, int);
 static void	mesh_recv_ctl(struct ieee80211_node *, struct mbuf *, int);
 static void	mesh_peer_timeout_setup(struct ieee80211_node *);
 static void	mesh_peer_timeout_backoff(struct ieee80211_node *);
 static void	mesh_peer_timeout_cb(void *);
 static __inline void
 		mesh_peer_timeout_stop(struct ieee80211_node *);
 static int	mesh_verify_meshid(struct ieee80211vap *, const uint8_t *);
 static int	mesh_verify_meshconf(struct ieee80211vap *, const uint8_t *);
 static int	mesh_verify_meshpeer(struct ieee80211vap *, uint8_t,
     		    const uint8_t *);
 uint32_t	mesh_airtime_calc(struct ieee80211_node *);
 
 /*
  * Mesh tunables exported under the net.wlan.mesh sysctl node.
  *
  * Timeout values come from the specification.  The timer variables
  * below start out as -1 and are given their defaults by
  * ieee80211_mesh_init(), which stores them converted with
  * msecs_to_ticks().  The SYSCTL_PROC entries go through
  * ieee80211_sysctl_msecs_ticks; presumably that handler converts between
  * the milliseconds seen by userland and the stored tick counts (handler
  * is defined elsewhere -- confirm).
  */
 static SYSCTL_NODE(_net_wlan, OID_AUTO, mesh, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "IEEE 802.11s parameters");
 /* Period of the gate announcement timer armed by mesh_gatemode_setup(). */
 static int	ieee80211_mesh_gateint = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, gateint,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &ieee80211_mesh_gateint, 0, ieee80211_sysctl_msecs_ticks, "I",
     "mesh gate interval (ms)");
 static int ieee80211_mesh_retrytimeout = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, retrytimeout,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &ieee80211_mesh_retrytimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Retry timeout (msec)");
 static int ieee80211_mesh_holdingtimeout = -1;
 
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, holdingtimeout,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &ieee80211_mesh_holdingtimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Holding state timeout (msec)");
 static int ieee80211_mesh_confirmtimeout = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, confirmtimeout,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &ieee80211_mesh_confirmtimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Confirm state timeout (msec)");
 static int ieee80211_mesh_backofftimeout = -1;
 SYSCTL_PROC(_net_wlan_mesh, OID_AUTO, backofftimeout,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &ieee80211_mesh_backofftimeout, 0, ieee80211_sysctl_msecs_ticks, "I",
     "Backoff timeout (msec). This is to throutles peering forever when "
     "not receiving answer or is rejected by a neighbor");
 /* Plain integer knobs (counts, not times); exported without a handler. */
 static int ieee80211_mesh_maxretries = 2;
 SYSCTL_INT(_net_wlan_mesh, OID_AUTO, maxretries, CTLFLAG_RW,
     &ieee80211_mesh_maxretries, 0,
     "Maximum retries during peer link establishment");
 static int ieee80211_mesh_maxholding = 2;
 SYSCTL_INT(_net_wlan_mesh, OID_AUTO, maxholding, CTLFLAG_RW,
     &ieee80211_mesh_maxholding, 0,
     "Maximum times we are allowed to transition to HOLDING state before "
     "backinoff during peer link establishment");
 
 /* All-ones MAC address; mesh_rt_add_locked() refuses to add a route for it. */
 static const uint8_t broadcastaddr[IEEE80211_ADDR_LEN] =
 	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 /*
  * Forward declarations of the action frame receive handlers that
  * ieee80211_mesh_init() registers.
  */
 static	ieee80211_recv_action_func mesh_recv_action_meshpeering_open;
 static	ieee80211_recv_action_func mesh_recv_action_meshpeering_confirm;
 static	ieee80211_recv_action_func mesh_recv_action_meshpeering_close;
 static	ieee80211_recv_action_func mesh_recv_action_meshlmetric;
 static	ieee80211_recv_action_func mesh_recv_action_meshgate;
 
 /* Matching action frame send handlers, also registered at init time. */
 static	ieee80211_send_action_func mesh_send_action_meshpeering_open;
 static	ieee80211_send_action_func mesh_send_action_meshpeering_confirm;
 static	ieee80211_send_action_func mesh_send_action_meshpeering_close;
 static	ieee80211_send_action_func mesh_send_action_meshlmetric;
 static	ieee80211_send_action_func mesh_send_action_meshgate;
 
 /*
  * Built-in link metric; ieee80211_mesh_init() registers it and
  * mesh_vattach() selects it by its "AIRTIME" description.
  */
 static const struct ieee80211_mesh_proto_metric mesh_metric_airtime = {
 	.mpm_descr	= "AIRTIME",
 	.mpm_ie		= IEEE80211_MESHCONF_METRIC_AIRTIME,
 	.mpm_metric	= mesh_airtime_calc,
 };
 
 /*
  * Registration tables for path selection protocols and link metrics.
  * Each holds at most four entries; a slot is in use when its *_active
  * member is non-zero (see the register functions).
  */
 static struct ieee80211_mesh_proto_path		mesh_proto_paths[4];
 static struct ieee80211_mesh_proto_metric	mesh_proto_metrics[4];
 
 /*
  * Malloc types for path request, path reply and path error frames, going
  * by their descriptions.
  */
 MALLOC_DEFINE(M_80211_MESH_PREQ, "80211preq", "802.11 MESH Path Request frame");
 MALLOC_DEFINE(M_80211_MESH_PREP, "80211prep", "802.11 MESH Path Reply frame");
 MALLOC_DEFINE(M_80211_MESH_PERR, "80211perr", "802.11 MESH Path Error frame");
 
 /*
  * The longer one of the lifetimes should be stored as the new lifetime.
  * Arguments and result are fully parenthesized so that expression
  * arguments keep their meaning; note each argument may be evaluated twice.
  */
 #define MESH_ROUTE_LIFETIME_MAX(a, b)	((a) > (b) ? (a) : (b))
 
 /*
  * Malloc types for routing table entries (mesh_rt_add_locked/mesh_rt_del)
  * and for known-gate entries (ieee80211_mesh_mark_gate).
  */
 MALLOC_DEFINE(M_80211_MESH_RT, "80211mesh_rt", "802.11s routing table");
 MALLOC_DEFINE(M_80211_MESH_GT_RT, "80211mesh_gt", "802.11s known gates table");
 
 /*
  * Helper functions to manipulate the Mesh routing table.
  */
 
 static struct ieee80211_mesh_route *
 mesh_rt_find_locked(struct ieee80211_mesh_state *ms,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_route *rt;
 
 	MESH_RT_LOCK_ASSERT(ms);
 
 	TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 		if (IEEE80211_ADDR_EQ(dest, rt->rt_dest))
 			return rt;
 	}
 	return NULL;
 }
 
 /*
  * Allocate a zeroed route for dest and append it to the routing table.
  * The allocation also reserves mpp_privlen bytes of path-protocol private
  * data behind the (aligned) route structure; rt_priv points at it.
  * The routing table lock must be held (asserted).  Returns NULL when the
  * non-sleeping allocation fails.
  */
 static struct ieee80211_mesh_route *
 mesh_rt_add_locked(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *route;
 
 	KASSERT(!IEEE80211_ADDR_EQ(broadcastaddr, dest),
 	    ("%s: adding broadcast to the routing table", __func__));
 
 	MESH_RT_LOCK_ASSERT(ms);
 
 	route = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_route)) +
 	    ms->ms_ppath->mpp_privlen, M_80211_MESH_RT,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (route == NULL)
 		return NULL;
 
 	route->rt_vap = vap;
 	IEEE80211_ADDR_COPY(route->rt_dest, dest);
 	/* private area starts right after the aligned route header */
 	route->rt_priv = (void *)ALIGN(&route[1]);
 	MESH_RT_ENTRY_LOCK_INIT(route, "MBSS_RT");
 	callout_init(&route->rt_discovery, 1);
 	route->rt_updtime = ticks;	/* create time */
 	TAILQ_INSERT_TAIL(&ms->ms_routes, route, rt_next);
 	return route;
 }
 
 /*
  * Locked wrapper around mesh_rt_find_locked().
  * The routing table lock is only held for the duration of the lookup;
  * the returned entry (or NULL) is handed back after it was released.
  */
 struct ieee80211_mesh_route *
 ieee80211_mesh_rt_find(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *mesh;
 	struct ieee80211_mesh_route *found;
 
 	mesh = vap->iv_mesh;
 	MESH_RT_LOCK(mesh);
 	found = mesh_rt_find_locked(mesh, dest);
 	MESH_RT_UNLOCK(mesh);
 	return found;
 }
 
 /*
  * Locked wrapper around mesh_rt_add_locked().
  * Asserts (KASSERT) that dest is neither already in the table nor our own
  * address.  Returns the new route, or NULL if it could not be allocated.
  */
 struct ieee80211_mesh_route *
 ieee80211_mesh_rt_add(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *mesh;
 	struct ieee80211_mesh_route *added;
 
 	mesh = vap->iv_mesh;
 
 	KASSERT(ieee80211_mesh_rt_find(vap, dest) == NULL,
 	    ("%s: duplicate entry in the routing table", __func__));
 	KASSERT(!IEEE80211_ADDR_EQ(vap->iv_myaddr, dest),
 	    ("%s: adding self to the routing table", __func__));
 
 	MESH_RT_LOCK(mesh);
 	added = mesh_rt_add_locked(vap, dest);
 	MESH_RT_UNLOCK(mesh);
 	return added;
 }
 
 /*
  * Update the route lifetime and return the updated lifetime.
  *
  * rt		route to refresh; must not be NULL (asserted).
  * new_lifetime	proposed lifetime in msec.
  *
  * The time elapsed since the last update is subtracted from the stored
  * lifetime and the larger of the remainder and new_lifetime is kept.
  * If the route has already timed out it takes new_lifetime, or, when
  * new_lifetime is zero, it is marked invalid and its lifetime cleared.
  * A proxy entry with zero hops (gated by us) is left untouched.
  *
  * Every access to the route, including the value returned, happens under
  * the route entry lock.
  */
 int
 ieee80211_mesh_rt_update(struct ieee80211_mesh_route *rt, int new_lifetime)
 {
 	int timesince, now;
 	uint32_t lifetime = 0;
 
 	KASSERT(rt != NULL, ("route is NULL"));
 
 	now = ticks;
 	MESH_RT_ENTRY_LOCK(rt);
 
 	/* dont clobber a proxy entry gated by us */
 	if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY && rt->rt_nhops == 0) {
 		/* snapshot before unlocking so the read is not racy */
 		lifetime = rt->rt_lifetime;
 		MESH_RT_ENTRY_UNLOCK(rt);
 		return lifetime;
 	}
 
 	timesince = ticks_to_msecs(now - rt->rt_updtime);
 	rt->rt_updtime = now;
 	if (timesince >= rt->rt_lifetime) {
 		/* expired: restart with the new lifetime or invalidate */
 		if (new_lifetime != 0) {
 			rt->rt_lifetime = new_lifetime;
 		}
 		else {
 			rt->rt_flags &= ~IEEE80211_MESHRT_FLAGS_VALID;
 			rt->rt_lifetime = 0;
 		}
 	} else {
 		/* update what is left of lifetime */
 		rt->rt_lifetime = rt->rt_lifetime - timesince;
 		rt->rt_lifetime  = MESH_ROUTE_LIFETIME_MAX(
 			new_lifetime, rt->rt_lifetime);
 	}
 	lifetime = rt->rt_lifetime;
 	MESH_RT_ENTRY_UNLOCK(rt);
 
 	return lifetime;
 }
 
 /*
  * Add a proxy route (as needed) for the specified destination.
  *
  * With the routing table locked:
  *  - no route for dest: create one with ourselves as mesh gate and next
  *    hop and mark it VALID|PROXY (an allocation failure is only counted
  *    in is_mesh_rtaddfailed);
  *  - an existing but invalid route: repoint its next hop at ourselves,
  *    mark it VALID|PROXY and drain frames staged for dest;
  *  - an existing valid route: left untouched.
  */
 void
 ieee80211_mesh_proxy_check(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt;
 
 	MESH_RT_LOCK(ms);
 	rt = mesh_rt_find_locked(ms, dest);
 	if (rt == NULL) {
 		rt = mesh_rt_add_locked(vap, dest);
 		if (rt == NULL) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 			    "%s", "unable to add proxy entry");
 			vap->iv_stats.is_mesh_rtaddfailed++;
 		} else {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 			    "%s", "add proxy entry");
 			/* we act as both the gate and the next hop */
 			IEEE80211_ADDR_COPY(rt->rt_mesh_gate, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(rt->rt_nexthop, vap->iv_myaddr);
 			rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID
 				     |  IEEE80211_MESHRT_FLAGS_PROXY;
 		}
 	} else if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0) {
 		/* an invalid entry found here is expected to be a proxy one */
 		KASSERT(rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY,
 		    ("no proxy flag for poxy entry"));
 		struct ieee80211com *ic = vap->iv_ic;
 		/*
 		 * Fix existing entry created by received frames from
 		 * stations that have some memory of dest.  We also
 		 * flush any frames held on the staging queue; delivering
 		 * them is too much trouble right now.
 		 */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 		    "%s", "fix proxy entry");
 		IEEE80211_ADDR_COPY(rt->rt_nexthop, vap->iv_myaddr);
 		rt->rt_flags |= IEEE80211_MESHRT_FLAGS_VALID
 			     |  IEEE80211_MESHRT_FLAGS_PROXY;
 		/* XXX belongs in hwmp */
 		/* staged frames are keyed by the hash of the destination MAC */
 		ieee80211_ageq_drain_node(&ic->ic_stageq,
 		   (void *)(uintptr_t) ieee80211_mac_hash(ic, dest));
 		/* XXX stat? */
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 /*
  * Unlink a route from the routing table and release it: the discovery
  * callout is drained, the entry lock destroyed and the memory freed.
  * The table lock is not taken here; every caller in this file invokes
  * this between MESH_RT_LOCK and MESH_RT_UNLOCK.
  */
 static __inline void
 mesh_rt_del(struct ieee80211_mesh_state *ms, struct ieee80211_mesh_route *rt)
 {
 	TAILQ_REMOVE(&ms->ms_routes, rt, rt_next);
 	/*
 	 * Grab the lock before destroying it, to be sure no one else
 	 * is holding the route.
 	 */
 	MESH_RT_ENTRY_LOCK(rt);
 	/*
 	 * NOTE(review): the callout is drained while the entry lock is held;
 	 * confirm the discovery handler never acquires that lock.
 	 */
 	callout_drain(&rt->rt_discovery);
 	MESH_RT_ENTRY_LOCK_DESTROY(rt);
 	IEEE80211_FREE(rt, M_80211_MESH_RT);
 }
 
 /*
  * Remove the first route to dest, if any.
  * Before the entry is deleted the path protocol is asked to send an error
  * for it: "no proxy" for proxy routes, "destination unreachable" for all
  * others.  The whole operation runs under the routing table lock.
  */
 void
 ieee80211_mesh_rt_del(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) {
 		if (!IEEE80211_ADDR_EQ(rt->rt_dest, dest))
 			continue;
 
 		if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) == 0) {
 			ms->ms_ppath->mpp_senderror(vap, dest, rt,
 			    IEEE80211_REASON_MESH_PERR_DEST_UNREACH);
 		} else {
 			ms->ms_ppath->mpp_senderror(vap, dest, rt,
 			    IEEE80211_REASON_MESH_PERR_NO_PROXY);
 		}
 		mesh_rt_del(ms, rt);
 		/* only the first match is removed */
 		break;
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 /*
  * Delete every entry of the routing table.
  * Does nothing when the vap has no mesh state.
  */
 void
 ieee80211_mesh_rt_flush(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *head;
 
 	if (ms == NULL)
 		return;
 	MESH_RT_LOCK(ms);
 	/* mesh_rt_del() unlinks the entry, so keep popping the head */
 	while ((head = TAILQ_FIRST(&ms->ms_routes)) != NULL)
 		mesh_rt_del(ms, head);
 	MESH_RT_UNLOCK(ms);
 }
 
 void
 ieee80211_mesh_rt_flush_peer(struct ieee80211vap *vap,
     const uint8_t peer[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) {
 		if (IEEE80211_ADDR_EQ(rt->rt_nexthop, peer))
 			mesh_rt_del(ms, rt);
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 /*
  * Flush expired routing entries, i.e. those in invalid state for
  * some time.
  */
 static void
 mesh_rt_flush_invalid(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt, *next;
 
 	if (ms == NULL)
 		return;
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(rt, &ms->ms_routes, rt_next, next) {
 		/* Discover paths will be deleted by their own callout */
 		if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_DISCOVER)
 			continue;
 		ieee80211_mesh_rt_update(rt, 0);
 		if ((rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0)
 			mesh_rt_del(ms, rt);
 	}
 	MESH_RT_UNLOCK(ms);
 }
 
 int
 ieee80211_mesh_register_proto_path(const struct ieee80211_mesh_proto_path *mpp)
 {
 	int i, firstempty = -1;
 
 	for (i = 0; i < nitems(mesh_proto_paths); i++) {
 		if (strncmp(mpp->mpp_descr, mesh_proto_paths[i].mpp_descr,
 		    IEEE80211_MESH_PROTO_DSZ) == 0)
 			return EEXIST;
 		if (!mesh_proto_paths[i].mpp_active && firstempty == -1)
 			firstempty = i;
 	}
 	if (firstempty < 0)
 		return ENOSPC;
 	memcpy(&mesh_proto_paths[firstempty], mpp, sizeof(*mpp));
 	mesh_proto_paths[firstempty].mpp_active = 1;
 	return 0;
 }
 
 int
 ieee80211_mesh_register_proto_metric(const struct
     ieee80211_mesh_proto_metric *mpm)
 {
 	int i, firstempty = -1;
 
 	for (i = 0; i < nitems(mesh_proto_metrics); i++) {
 		if (strncmp(mpm->mpm_descr, mesh_proto_metrics[i].mpm_descr,
 		    IEEE80211_MESH_PROTO_DSZ) == 0)
 			return EEXIST;
 		if (!mesh_proto_metrics[i].mpm_active && firstempty == -1)
 			firstempty = i;
 	}
 	if (firstempty < 0)
 		return ENOSPC;
 	memcpy(&mesh_proto_metrics[firstempty], mpm, sizeof(*mpm));
 	mesh_proto_metrics[firstempty].mpm_active = 1;
 	return 0;
 }
 
 /*
  * Make the registered path protocol called name (compared without regard
  * to case) the one used by this vap.
  * Returns 0 on success or ENOENT when no slot has that description.
  */
 static int
 mesh_select_proto_path(struct ieee80211vap *vap, const char *name)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	int idx = 0;
 
 	while (idx < nitems(mesh_proto_paths)) {
 		if (strcasecmp(mesh_proto_paths[idx].mpp_descr, name) != 0) {
 			idx++;
 			continue;
 		}
 		ms->ms_ppath = &mesh_proto_paths[idx];
 		return 0;
 	}
 	return ENOENT;
 }
 
 /*
  * Make the registered link metric called name (compared without regard
  * to case) the one used by this vap.
  * Returns 0 on success or ENOENT when no slot has that description.
  */
 static int
 mesh_select_proto_metric(struct ieee80211vap *vap, const char *name)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	int idx = 0;
 
 	while (idx < nitems(mesh_proto_metrics)) {
 		if (strcasecmp(mesh_proto_metrics[idx].mpm_descr, name) != 0) {
 			idx++;
 			continue;
 		}
 		ms->ms_pmetric = &mesh_proto_metrics[idx];
 		return 0;
 	}
 	return ENOENT;
 }
 
 /*
  * (Re)arm or stop the gate announcement timer according to the mesh
  * flags: the timer only runs for a gate that is not also a root.
  */
 static void
 mesh_gatemode_setup(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	int announce;
 
 	/*
 	 * NB: When a mesh gate is running as a ROOT it shall
 	 * not send out periodic GANNs but instead mark the
 	 * mesh gate flag for the corresponding proactive PREQ
 	 * and RANN frames.
 	 */
 	announce = (ms->ms_flags & IEEE80211_MESHFLAGS_ROOT) == 0 &&
 	    (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) != 0;
 	if (announce) {
 		callout_reset(&ms->ms_gatetimer, ieee80211_mesh_gateint,
 		    mesh_gatemode_cb, vap);
 	} else {
 		callout_drain(&ms->ms_gatetimer);
 	}
 }
 
 /*
  * Gate timer callout (arg is the vap): build a gate announcement (GANN)
  * element describing ourselves, send it as a mesh action frame via the
  * bss node, then let mesh_gatemode_setup() decide whether to rearm.
  */
 static void
 mesh_gatemode_cb(void *arg)
 {
 	struct ieee80211vap *vap = (struct ieee80211vap *)arg;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshgann_ie gann;
 
 	gann.gann_flags = 0; /* Reserved */
 	gann.gann_hopcount = 0;
 	gann.gann_ttl = ms->ms_ttl;
 	IEEE80211_ADDR_COPY(gann.gann_addr, vap->iv_myaddr);
 	/* each announcement consumes one gate sequence number */
 	gann.gann_seq = ms->ms_gateseq++;
 	/*
 	 * NOTE(review): ieee80211_mesh_gateint is stored in ticks (see
 	 * ieee80211_mesh_init); confirm that is the unit the element expects.
 	 */
 	gann.gann_interval = ieee80211_mesh_gateint;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_MESH, vap->iv_bss,
 	    "send broadcast GANN (seq %u)", gann.gann_seq);
 
 	ieee80211_send_action(vap->iv_bss, IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN, &gann);
 	mesh_gatemode_setup(vap);
 }
 
 /*
  * One-time module initialisation, run through the SYSINIT below:
  * clears the protocol/metric registration tables, sets the default
  * timer values, registers the mesh action frame receive and send
  * handlers and registers the airtime link metric.
  */
 static void
 ieee80211_mesh_init(void)
 {
 
 	memset(mesh_proto_paths, 0, sizeof(mesh_proto_paths));
 	memset(mesh_proto_metrics, 0, sizeof(mesh_proto_metrics));
 
 	/*
 	 * Setup mesh parameters that depends on the clock frequency.
 	 */
 	/* defaults are given in milliseconds and stored as ticks */
 	ieee80211_mesh_gateint = msecs_to_ticks(10000);
 	ieee80211_mesh_retrytimeout = msecs_to_ticks(40);
 	ieee80211_mesh_holdingtimeout = msecs_to_ticks(40);
 	ieee80211_mesh_confirmtimeout = msecs_to_ticks(40);
 	ieee80211_mesh_backofftimeout = msecs_to_ticks(5000);
 
 	/*
 	 * Register action frame handlers.
 	 */
 	/* receive side: peering open/confirm/close, link metric, GANN */
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_OPEN,
 	    mesh_recv_action_meshpeering_open);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 	    mesh_recv_action_meshpeering_confirm);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE,
 	    mesh_recv_action_meshpeering_close);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_LMETRIC, mesh_recv_action_meshlmetric);
 	ieee80211_recv_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN, mesh_recv_action_meshgate);
 
 	/* send side: the same five actions */
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_OPEN,
 	    mesh_send_action_meshpeering_open);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 	    mesh_send_action_meshpeering_confirm);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_SELF_PROT,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE,
 	    mesh_send_action_meshpeering_close);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_LMETRIC,
 	    mesh_send_action_meshlmetric);
 	ieee80211_send_action_register(IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN,
 	    mesh_send_action_meshgate);
 
 	/*
 	 * Register Airtime Link Metric.
 	 */
 	/* the return value (0/EEXIST/ENOSPC) is not checked here */
 	ieee80211_mesh_register_proto_metric(&mesh_metric_airtime);
 
 }
 SYSINIT(wlan_mesh, SI_SUB_DRIVERS, SI_ORDER_FIRST, ieee80211_mesh_init, NULL);
 
 /*
  * Per-device attach hook: installs mesh_vattach as the vap attach
  * routine for the IEEE80211_M_MBSS operating mode.
  */
 void
 ieee80211_mesh_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_MBSS] = mesh_vattach;
 }
 
 /*
  * Per-device detach hook; intentionally empty, there is nothing to undo
  * for what ieee80211_mesh_attach() set up.
  */
 void
 ieee80211_mesh_detach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Node iterator callback used when tearing peers down (arg is unused).
  * A peer with an established link is sent a peering close carrying the
  * peer id, local id and a "peer link canceled" reason; in every case
  * the node's link timer is drained and frames staged for it are dropped.
  */
 static void
 mesh_vdetach_peers(void *arg, struct ieee80211_node *ni)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t args[3];
 
 	if (ni->ni_mlstate == IEEE80211_NODE_MESH_ESTABLISHED) {
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 	}
 	callout_drain(&ni->ni_mltimer);
 	/* XXX belongs in hwmp */
 	/* staged frames are keyed by the hash of the node's MAC address */
 	ieee80211_ageq_drain_node(&ic->ic_stageq,
 	   (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr));
 }
 
 /*
  * Vap detach handler (installed as iv_opdetach by mesh_vattach): stops
  * the cleanup timer, tears down all peers, flushes the routing table,
  * destroys the table lock, detaches the path protocol and finally frees
  * the mesh state.
  */
 static void
 mesh_vdetach(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	/*
 	 * NOTE(review): only ms_cleantimer is drained here; ms_gatetimer is
 	 * drained by mesh_newstate() when leaving RUN -- confirm it cannot
 	 * still be pending at this point.
 	 */
 	callout_drain(&ms->ms_cleantimer);
 	ieee80211_iterate_nodes(&vap->iv_ic->ic_sta, mesh_vdetach_peers,
 	    NULL);
 	ieee80211_mesh_rt_flush(vap);
 	MESH_RT_LOCK_DESTROY(ms);
 	/* the path protocol is detached while the mesh state still exists */
 	ms->ms_ppath->mpp_vdetach(vap);
 	IEEE80211_FREE(vap->iv_mesh, M_80211_VAP);
 	vap->iv_mesh = NULL;
 }
 
 /*
  * Vap attach handler for mesh (MBSS) vaps: installs the mesh state
  * machine, input, detach and management/control receive handlers,
  * allocates and initialises the per-vap mesh state, selects the
  * "AIRTIME" metric and the "HWMP" path protocol and attaches the latter.
  */
 static void
 mesh_vattach(struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *ms;
 	vap->iv_newstate = mesh_newstate;
 	vap->iv_input = mesh_input;
 	vap->iv_opdetach = mesh_vdetach;
 	vap->iv_recv_mgmt = mesh_recv_mgmt;
 	vap->iv_recv_ctl = mesh_recv_ctl;
 	ms = IEEE80211_MALLOC(sizeof(struct ieee80211_mesh_state), M_80211_VAP,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (ms == NULL) {
 		/*
 		 * NOTE(review): the handlers above stay installed while
 		 * iv_mesh is not set; mesh_vdetach() and mesh_newstate()
 		 * dereference iv_mesh unconditionally -- confirm callers
 		 * cannot reach them after this failure.
 		 */
 		printf("%s: couldn't alloc MBSS state\n", __func__);
 		return;
 	}
 	vap->iv_mesh = ms;
 	ms->ms_seq = 0;
 	ms->ms_flags = (IEEE80211_MESHFLAGS_AP | IEEE80211_MESHFLAGS_FWD);
 	ms->ms_ttl = IEEE80211_MESH_DEFAULT_TTL;
 	TAILQ_INIT(&ms->ms_known_gates);
 	TAILQ_INIT(&ms->ms_routes);
 	MESH_RT_LOCK_INIT(ms, "MBSS");
 	callout_init(&ms->ms_cleantimer, 1);
 	callout_init(&ms->ms_gatetimer, 1);
 	ms->ms_gateseq = 0;
 	/* return values are not checked; the KASSERTs catch a failed lookup */
 	mesh_select_proto_metric(vap, "AIRTIME");
 	KASSERT(ms->ms_pmetric, ("ms_pmetric == NULL"));
 	mesh_select_proto_path(vap, "HWMP");
 	KASSERT(ms->ms_ppath, ("ms_ppath == NULL"));
 	ms->ms_ppath->mpp_vattach(vap);
 }
 
 /*
  * IEEE80211_M_MBSS vap state machine handler.
  *
  * Records the transition to nstate, performs the work tied to the
  * (old state, new state) pair and finally notifies the path selection
  * protocol.  The com lock must be held (asserted).  Always returns 0.
  *
  * vap		the mesh vap changing state
  * nstate	the state being entered
  * arg		transition argument, logged and forwarded to mpp_newstate
  */
 static int
 mesh_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node *ni;
 	enum ieee80211_state ostate;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n",
 	    __func__, ieee80211_state_name[ostate],
 	    ieee80211_state_name[nstate], arg);
 	vap->iv_state = nstate;		/* state transition */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	ni = vap->iv_bss;			/* NB: no reference held */
 	/* leaving RUN: stop the route cleanup and gate announcement timers */
 	if (nstate != IEEE80211_S_RUN && ostate == IEEE80211_S_RUN) {
 		callout_drain(&ms->ms_cleantimer);
 		callout_drain(&ms->ms_gatetimer);
 	}
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		/* undo whatever the previous state had in progress */
 		switch (ostate) {
 		case IEEE80211_S_SCAN:
 			ieee80211_cancel_scan(vap);
 			break;
 		case IEEE80211_S_CAC:
 			ieee80211_dfs_cac_stop(vap);
 			break;
 		case IEEE80211_S_RUN:
 			ieee80211_iterate_nodes(&ic->ic_sta,
 			    mesh_vdetach_peers, NULL);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 			ieee80211_mesh_rt_flush(vap);
 		}
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			if (vap->iv_des_chan != IEEE80211_CHAN_ANYC &&
 			    !IEEE80211_IS_CHAN_RADAR(vap->iv_des_chan) &&
 			    ms->ms_idlen != 0) {
 				/*
 				 * Already have a channel and a mesh ID; bypass
 				 * the scan and startup immediately.
 				 */
 				ieee80211_create_ibss(vap, vap->iv_des_chan);
 				break;
 			}
 			/*
 			 * Initiate a scan.  We can come here as a result
 			 * of an IEEE80211_IOC_SCAN_REQ too in which case
 			 * the vap will be marked with IEEE80211_FEXT_SCANREQ
 			 * and the scan request parameters will be present
 			 * in iv_scanreq.  Otherwise we do the default.
 			*/
 			if (vap->iv_flags_ext & IEEE80211_FEXT_SCANREQ) {
 				ieee80211_check_scan(vap,
 				    vap->iv_scanreq_flags,
 				    vap->iv_scanreq_duration,
 				    vap->iv_scanreq_mindwell,
 				    vap->iv_scanreq_maxdwell,
 				    vap->iv_scanreq_nssid, vap->iv_scanreq_ssid);
 				/* the request has been consumed */
 				vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ;
 			} else
 				ieee80211_check_scan_current(vap);
 			break;
 		default:
 			break;
 		}
 		break;
 	case IEEE80211_S_CAC:
 		/*
 		 * Start CAC on a DFS channel.  We come here when starting
 		 * a bss on a DFS channel (see ieee80211_create_ibss).
 		 */
 		ieee80211_dfs_cac_start(vap);
 		break;
 	case IEEE80211_S_RUN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			/*
 			 * Already have a channel; bypass the
 			 * scan and startup immediately.
 			 * Note that ieee80211_create_ibss will call
 			 * back to do a RUN->RUN state change.
 			 */
 			ieee80211_create_ibss(vap,
 			    ieee80211_ht_adjust_channel(ic,
 				ic->ic_curchan, vap->iv_flags_ht));
 			/* NB: iv_bss is changed on return */
 			break;
 		case IEEE80211_S_CAC:
 			/*
 			 * NB: This is the normal state change when CAC
 			 * expires and no radar was detected; no need to
 			 * clear the CAC timer as it's already expired.
 			 */
 			/* fall thru... */
 		case IEEE80211_S_CSA:
 #if 0
 			/*
 			 * Shorten inactivity timer of associated stations
 			 * to weed out sta's that don't follow a CSA.
 			 */
 			ieee80211_iterate_nodes(&ic->ic_sta, sta_csa, vap);
 #endif
 			/*
 			 * Update bss node channel to reflect where
 			 * we landed after CSA.
 			 */
 			ieee80211_node_set_chan(ni,
 			    ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 				ieee80211_htchanflags(ni->ni_chan)));
 			/* XXX bypass debug msgs */
 			break;
 		case IEEE80211_S_SCAN:
 		case IEEE80211_S_RUN:
 #ifdef IEEE80211_DEBUG
 			/* debug only: report the mesh id and channel in use */
 			if (ieee80211_msg_debug(vap)) {
 				ieee80211_note(vap,
 				    "synchronized with %s meshid ",
 				    ether_sprintf(ni->ni_meshid));
 				ieee80211_print_essid(ni->ni_meshid,
 				    ni->ni_meshidlen);
 				/* XXX MCS/HT */
 				printf(" channel %d\n",
 				    ieee80211_chan2ieee(ic, ic->ic_curchan));
 			}
 #endif
 			break;
 		default:
 			break;
 		}
 		/*
 		 * NB: ni was sampled before the switch above; the INIT case
 		 * notes that iv_bss is changed by ieee80211_create_ibss.
 		 */
 		ieee80211_node_authorize(ni);
 		/* start periodic route aging and, if applicable, gate mode */
 		callout_reset(&ms->ms_cleantimer, ms->ms_ppath->mpp_inact,
                     mesh_rt_cleanup_cb, vap);
 		mesh_gatemode_setup(vap);
 		break;
 	default:
 		break;
 	}
 	/* NB: ostate not nstate */
 	ms->ms_ppath->mpp_newstate(vap, ostate, arg);
 	return 0;
 }
 
 /*
  * Cleanup timer callout (arg is the vap): age the routing table, then
  * rearm the timer with the path protocol's inactivity interval.
  */
 static void
 mesh_rt_cleanup_cb(void *arg)
 {
 	struct ieee80211vap *vap;
 	struct ieee80211_mesh_state *mesh;
 
 	vap = arg;
 	mesh = vap->iv_mesh;
 
 	mesh_rt_flush_invalid(vap);
 	callout_reset(&mesh->ms_cleantimer, mesh->ms_ppath->mpp_inact,
 	    mesh_rt_cleanup_cb, vap);
 }
 
 /*
  * Mark a mesh STA as gate and return a pointer to it.
  * If this is first time, we create a new gate route.
  * Always update the path route to this mesh gate.
  */
 struct ieee80211_mesh_gate_route *
 ieee80211_mesh_mark_gate(struct ieee80211vap *vap, const uint8_t *addr,
     struct ieee80211_mesh_route *rt)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_gate_route *gr = NULL, *next;
 	int found = 0;
 
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, next) {
 		if (IEEE80211_ADDR_EQ(gr->gr_addr, addr)) {
 			found = 1;
 			break;
 		}
 	}
 
 	if (!found) {
 		/* New mesh gate add it to known table. */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, addr,
 		    "%s", "stored new gate information from pro-PREQ.");
 		gr = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_gate_route)),
 		    M_80211_MESH_GT_RT,
 		    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 		IEEE80211_ADDR_COPY(gr->gr_addr, addr);
 		TAILQ_INSERT_TAIL(&ms->ms_known_gates, gr, gr_next);
 	}
 	gr->gr_route = rt;
 	/* TODO: link from path route to gate route */
 	MESH_RT_UNLOCK(ms);
 
 	return gr;
 }
 
 /*
  * Helper function to note the Mesh Peer Link FSM change.
  *
  * Moves node ni to the given peer link state.  Entering or leaving
  * ESTABLISHED adjusts the neighbor count and requests a beacon update
  * for the mesh configuration.  After the state is stored, HOLDING tells
  * the path protocol the peer is down and ESTABLISHED starts a path
  * discovery towards the peer.
  */
 static void
 mesh_linkchange(struct ieee80211_node *ni, enum ieee80211_mesh_mlstate state)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 #ifdef IEEE80211_DEBUG
 	/* state names, only referenced by the debug note below */
 	static const char *meshlinkstates[] = {
 		[IEEE80211_NODE_MESH_IDLE]		= "IDLE",
 		[IEEE80211_NODE_MESH_OPENSNT]		= "OPEN SENT",
 		[IEEE80211_NODE_MESH_OPENRCV]		= "OPEN RECEIVED",
 		[IEEE80211_NODE_MESH_CONFIRMRCV]	= "CONFIRM RECEIVED",
 		[IEEE80211_NODE_MESH_ESTABLISHED]	= "ESTABLISHED",
 		[IEEE80211_NODE_MESH_HOLDING]		= "HOLDING"
 	};
 #endif
 	IEEE80211_NOTE(vap, IEEE80211_MSG_MESH,
 	    ni, "peer link: %s -> %s",
 	    meshlinkstates[ni->ni_mlstate], meshlinkstates[state]);
 
 	/* track neighbor count */
 	if (state == IEEE80211_NODE_MESH_ESTABLISHED &&
 	    ni->ni_mlstate != IEEE80211_NODE_MESH_ESTABLISHED) {
 		KASSERT(ms->ms_neighbors < 65535, ("neighbor count overflow"));
 		ms->ms_neighbors++;
 		ieee80211_beacon_notify(vap, IEEE80211_BEACON_MESHCONF);
 	} else if (ni->ni_mlstate == IEEE80211_NODE_MESH_ESTABLISHED &&
 	    state != IEEE80211_NODE_MESH_ESTABLISHED) {
 		KASSERT(ms->ms_neighbors > 0, ("neighbor count 0"));
 		ms->ms_neighbors--;
 		ieee80211_beacon_notify(vap, IEEE80211_BEACON_MESHCONF);
 	}
 	/* commit the new state before running the per-state actions */
 	ni->ni_mlstate = state;
 	switch (state) {
 	case IEEE80211_NODE_MESH_HOLDING:
 		ms->ms_ppath->mpp_peerdown(ni);
 		break;
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		ieee80211_mesh_discover(vap, ni->ni_macaddr, NULL);
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Node iterator callback used while generating a unique local link ID
  * for mesh peer establishment.  arg points at the 16-bit candidate ID;
  * it is reset to 0 when the node already uses that value as its local
  * link ID.
  */
 static void
 mesh_checkid(void *arg, struct ieee80211_node *ni)
 {
 	uint16_t *candidate;
 
 	candidate = arg;
 	if (ni->ni_mllid == *candidate)
 		*candidate = 0;
 }
 
 /*
  * Pick a random 16-bit local link ID that no node in the station table
  * is using.  Up to four candidates are tried; mesh_checkid() zeroes a
  * candidate that collides, so 0 is returned if every attempt failed.
  */
 static uint32_t
 mesh_generateid(struct ieee80211vap *vap)
 {
 	uint16_t id;
 	int attempts;
 
 	for (attempts = 4; ; ) {
 		net80211_get_random_bytes(&id, 2);
 		ieee80211_iterate_nodes(&vap->iv_ic->ic_sta, mesh_checkid, &id);
 		attempts--;
 		if (id != 0 || attempts <= 0)
 			break;
 	}
 	return id;
 }
 
 /*
  * Duplicate detection based on the mesh sequence number.
  * The last sequence number seen from source is kept in its route; a
  * route is created on the fly for a source seen for the first time.
  * Returns 0 if the frame is to be accepted, 1 otherwise (old sequence
  * number, or the route could not be added).
  */
 static int
 mesh_checkpseq(struct ieee80211vap *vap,
     const uint8_t source[IEEE80211_ADDR_LEN], uint32_t seq)
 {
 	struct ieee80211_mesh_route *route;
 
 	route = ieee80211_mesh_rt_find(vap, source);
 	if (route != NULL) {
 		/* known source: accept only a newer sequence number */
 		if (IEEE80211_MESH_SEQ_GEQ(route->rt_lastmseq, seq))
 			return 1;
 		route->rt_lastmseq = seq;
 		return 0;
 	}
 
 	route = ieee80211_mesh_rt_add(vap, source);
 	if (route == NULL) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, source,
 		    "%s", "add mcast route failed");
 		vap->iv_stats.is_mesh_rtaddfailed++;
 		return 1;
 	}
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, source,
 	    "add mcast route, mesh seqno %d", seq);
 	route->rt_lastmseq = seq;
 	return 0;
 }
 
 /*
  * Resolve the node to transmit through in order to reach dest.
  * The route to dest must exist and be valid; for a proxy route the
  * lookup continues with the route to its mesh gate, which must be valid
  * as well.  Returns the node for the resulting next hop, or NULL when no
  * usable route exists.
  */
 struct ieee80211_node *
 ieee80211_mesh_find_txnode(struct ieee80211vap *vap,
     const uint8_t dest[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_mesh_route *route;
 
 	route = ieee80211_mesh_rt_find(vap, dest);
 	if (route == NULL)
 		return NULL;
 	if (!(route->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 		    "%s: !valid, flags 0x%x", __func__, route->rt_flags);
 		/* XXX stat */
 		return NULL;
 	}
 	/* direct mesh route: its next hop is the answer */
 	if (!(route->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY))
 		return ieee80211_find_txnode(vap, route->rt_nexthop);
 
 	/* proxy route: go through the mesh gate instead */
 	route = ieee80211_mesh_rt_find(vap, route->rt_mesh_gate);
 	if (route == NULL)
 		return NULL;
 	if (!(route->rt_flags & IEEE80211_MESHRT_FLAGS_VALID)) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, dest,
 		    "%s: meshgate !valid, flags 0x%x", __func__,
 		    route->rt_flags);
 		/* XXX stat */
 		return NULL;
 	}
 	return ieee80211_find_txnode(vap, route->rt_nexthop);
 }
 
 /*
  * Transmit one frame towards the mesh gate described by rt_gate.
  * Must be called without the TX lock held.  When no transmit node can
  * be found for the gate the frame is freed and counted as an output
  * error on the vap's interface.
  */
 static void
 mesh_transmit_to_gate(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_mesh_route *rt_gate)
 {
 	struct ieee80211_node *ni;
 
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	ni = ieee80211_mesh_find_txnode(vap, rt_gate->rt_dest);
 	if (ni != NULL) {
 		/*
 		 * Hand the frame to the VAP packet transmit path, which
 		 * consumes both the node reference obtained above and
 		 * the mbuf, whether or not the send succeeds.
 		 */
 		(void) ieee80211_vap_pkt_send_dest(vap, m, ni);
 		return;
 	}
 
 	if_inc_counter(vap->iv_ifp, IFCOUNTER_OERRORS, 1);
 	m_freem(m);
 }
 
 /*
  * Forward the queued frames to known valid mesh gates.
  * Assume destination to be outside the MBSS (i.e. proxy entry),
  * If no valid mesh gates are known silently discard queued frames.
  * After transmitting frames to all known valid mesh gates, this route
  * will be marked invalid, and a new path discovery will happen in the hopes
  * that (at least) one of the mesh gates have a new proxy entry for us to use.
  *
  * vap:     the mesh vap the frames were queued on.
  * rt_dest: route entry of the destination being discovered; asserted to
  *          carry exactly IEEE80211_MESHRT_FLAGS_DISCOVER on entry and left
  *          with rt_flags == 0 on return.
  *
  * Must be called without the TX lock held.  The mesh routing table lock
  * is taken here and dropped around each transmit.
  */
 void
 ieee80211_mesh_forward_to_gates(struct ieee80211vap *vap,
     struct ieee80211_mesh_route *rt_dest)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_route *rt_gate;
 	struct ieee80211_mesh_gate_route *gr = NULL, *gr_next;
 	struct mbuf *m, *mcopy, *next;
 
 	IEEE80211_TX_UNLOCK_ASSERT(ic);
 
 	KASSERT( rt_dest->rt_flags == IEEE80211_MESHRT_FLAGS_DISCOVER,
 	    ("Route is not marked with IEEE80211_MESHRT_FLAGS_DISCOVER"));
 
 	/* XXX: send to more than one valid mesh gate */
 	MESH_RT_LOCK(ms);
 
 	/*
 	 * Pull the frames that were parked on the staging queue for this
 	 * destination; the queue key is the MAC hash of the destination
 	 * address cast to a node pointer.
 	 */
 	m = ieee80211_ageq_remove(&ic->ic_stageq,
 	    (struct ieee80211_node *)(uintptr_t)
 	    ieee80211_mac_hash(ic, rt_dest->rt_dest));
 
 	TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, gr_next) {
 		rt_gate = gr->gr_route;
 		if (rt_gate == NULL) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP,
 				rt_dest->rt_dest,
 				"mesh gate with no path %6D",
 				gr->gr_addr, ":");
 			continue;
 		}
 		/* Skip gates whose route is not currently valid. */
 		if ((rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) == 0)
 			continue;
 		KASSERT(rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_GATE,
 		    ("route not marked as a mesh gate"));
 		KASSERT((rt_gate->rt_flags &
 			IEEE80211_MESHRT_FLAGS_PROXY) == 0,
 			("found mesh gate that is also marked porxy"));
 		/*
 		 * Convert the route to a proxy route gated by the current
 		 * mesh gate; this is needed so encap can build the data
 		 * frame with the correct addresses.
 		 */
 		rt_dest->rt_flags = IEEE80211_MESHRT_FLAGS_PROXY |
 			IEEE80211_MESHRT_FLAGS_VALID;
 		rt_dest->rt_ext_seq = 1; /* random value */
 		IEEE80211_ADDR_COPY(rt_dest->rt_mesh_gate, rt_gate->rt_dest);
 		IEEE80211_ADDR_COPY(rt_dest->rt_nexthop, rt_gate->rt_nexthop);
 		rt_dest->rt_metric = rt_gate->rt_metric;
 		rt_dest->rt_nhops = rt_gate->rt_nhops;
 		ieee80211_mesh_rt_update(rt_dest, ms->ms_ppath->mpp_inact);
 		/*
 		 * The routing table lock is dropped for the transmit and
 		 * re-taken before the next list element is visited.
 		 * NOTE(review): gr_next was sampled while the lock was held;
 		 * confirm the gate list cannot change while it is released.
 		 */
 		MESH_RT_UNLOCK(ms);
 		/* XXX: lock?? */
 		/*
 		 * Each gate gets its own copy; the original m is kept for
 		 * the remaining gates and freed after the loop.
 		 * NOTE(review): the loop below walks m_nextpkt of the copy;
 		 * confirm whether m_dup() carries the m_nextpkt chain over,
 		 * otherwise only the first queued frame is sent.
 		 */
 		mcopy = m_dup(m, IEEE80211_M_NOWAIT);
 		for (; mcopy != NULL; mcopy = next) {
 			next = mcopy->m_nextpkt;
 			mcopy->m_nextpkt = NULL;
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_HWMP,
 			    rt_dest->rt_dest,
 			    "flush queued frame %p len %d", mcopy,
 			    mcopy->m_pkthdr.len);
 			mesh_transmit_to_gate(vap, mcopy, rt_gate);
 		}
 		MESH_RT_LOCK(ms);
 	}
 	rt_dest->rt_flags = 0; /* Mark invalid */
 	/*
 	 * NOTE(review): confirm m_freem() also releases any further frames
 	 * linked through m_nextpkt that ieee80211_ageq_remove() returned.
 	 */
 	m_freem(m);
 	MESH_RT_UNLOCK(ms);
 }
 
 /*
  * Forward the specified frame.
  * Decrement the TTL and set TA to our MAC address.
  */
 static void
 mesh_forward(struct ieee80211vap *vap, struct mbuf *m,
     const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ifnet *ifp = vap->iv_ifp;
 	const struct ieee80211_frame *wh =
 	    mtod(m, const struct ieee80211_frame *);
 	struct mbuf *mcopy;
 	struct ieee80211_meshcntl *mccopy;
 	struct ieee80211_frame *whcopy;
 	struct ieee80211_node *ni;
 	int err;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(ic);
 
 	/*
 	 * mesh ttl of 1 means we are the last one receiving it,
 	 * according to amendment we decrement and then check if
 	 * 0, if so we dont forward.
 	 */
 	if (mc->mc_ttl < 1) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, ttl 1");
 		vap->iv_stats.is_mesh_fwd_ttl++;
 		return;
 	}
 	if (!(ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, fwding disabled");
 		vap->iv_stats.is_mesh_fwd_disabled++;
 		return;
 	}
 	mcopy = m_dup(m, IEEE80211_M_NOWAIT);
 	if (mcopy == NULL) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, cannot dup");
 		vap->iv_stats.is_mesh_fwd_nobuf++;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return;
 	}
 	mcopy = m_pullup(mcopy, ieee80211_hdrspace(ic, wh) +
 	    sizeof(struct ieee80211_meshcntl));
 	if (mcopy == NULL) {
 		IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 		    "%s", "frame not fwd'd, too short");
 		vap->iv_stats.is_mesh_fwd_tooshort++;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(mcopy);
 		return;
 	}
 	whcopy = mtod(mcopy, struct ieee80211_frame *);
 	mccopy = (struct ieee80211_meshcntl *)
 	    (mtod(mcopy, uint8_t *) + ieee80211_hdrspace(ic, wh));
 	/* XXX clear other bits? */
 	whcopy->i_fc[1] &= ~IEEE80211_FC1_RETRY;
 	IEEE80211_ADDR_COPY(whcopy->i_addr2, vap->iv_myaddr);
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 		ni = ieee80211_ref_node(vap->iv_bss);
 		mcopy->m_flags |= M_MCAST;
 	} else {
 		ni = ieee80211_mesh_find_txnode(vap, whcopy->i_addr3);
 		if (ni == NULL) {
 			/*
 			 * [Optional] any of the following three actions:
 			 * o silently discard
 			 * o trigger a path discovery
 			 * o inform TA that meshDA is unknown.
 			 */
 			IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_MESH, wh,
 			    "%s", "frame not fwd'd, no path");
 			ms->ms_ppath->mpp_senderror(vap, whcopy->i_addr3, NULL,
 			    IEEE80211_REASON_MESH_PERR_NO_FI);
 			vap->iv_stats.is_mesh_fwd_nopath++;
 			m_freem(mcopy);
 			return;
 		}
 		IEEE80211_ADDR_COPY(whcopy->i_addr1, ni->ni_macaddr);
 	}
 	KASSERT(mccopy->mc_ttl > 0, ("%s called with wrong ttl", __func__));
 	mccopy->mc_ttl--;
 
 	/* XXX calculate priority so drivers can find the tx queue */
 	M_WME_SETAC(mcopy, WME_AC_BE);
 
 	/* XXX do we know m_nextpkt is NULL? */
 	MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mcopy->m_pkthdr.rcvif = (void *) ni;
 
 	/*
 	 * XXX this bypasses all of the VAP TX handling; it passes frames
 	 * directly to the parent interface.
 	 *
 	 * Because of this, there's no TX lock being held as there's no
 	 * encaps state being used.
 	 *
 	 * Doing a direct parent transmit may not be the correct thing
 	 * to do here; we'll have to re-think this soon.
 	 */
 	IEEE80211_TX_LOCK(ic);
 	err = ieee80211_parent_xmitpkt(ic, mcopy);
 	IEEE80211_TX_UNLOCK(ic);
 	if (!err)
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 }
 
 /*
  * Strip the 802.11 header and mesh control field from a received mesh
  * data frame and replace them with an ethernet header.
  *
  * vap:       receiving vap (statistics and debug output).
  * m:         frame to convert; consumed on failure.
  * hdrlen:    length of the 802.11 header plus the mesh control field.
  * meshdrlen: length of the mesh control field alone.
  *
  * Returns the converted mbuf, or NULL when the frame was discarded.
  */
 static struct mbuf *
 mesh_decap(struct ieee80211vap *vap, struct mbuf *m, int hdrlen, int meshdrlen)
 {
 #define	WHDIR(wh)	((wh)->i_fc[1] & IEEE80211_FC1_DIR_MASK)
 #define	MC01(mc)	((const struct ieee80211_meshcntl_ae01 *)mc)
 	uint8_t b[sizeof(struct ieee80211_qosframe_addr4) +
 		  sizeof(struct ieee80211_meshcntl_ae10)];
 	const struct ieee80211_qosframe_addr4 *wh;
 	const struct ieee80211_meshcntl_ae10 *mc;
 	struct ether_header *eh;
 	struct llc *llc;
 	int ae;
 
 	/*
 	 * The headers are copied into b[] below.  Refuse lengths that do
 	 * not fit: a reserved address extension value on a 4-address
 	 * frame yields a header larger than b[] and would overrun it.
 	 */
 	if (meshdrlen > hdrlen || hdrlen > (int)sizeof(b)) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY,
 		    "discard data frame: %s", "header too large");
 		m_freem(m);
 		return NULL;
 	}
 	if (m->m_len < hdrlen + sizeof(*llc) &&
 	    (m = m_pullup(m, hdrlen + sizeof(*llc))) == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY,
 		    "discard data frame: %s", "m_pullup failed");
 		vap->iv_stats.is_rx_tooshort++;
 		return NULL;
 	}
 	/* Keep a private copy of the headers; m_adj() below discards them. */
 	memcpy(b, mtod(m, caddr_t), hdrlen);
 	wh = (const struct ieee80211_qosframe_addr4 *)&b[0];
 	mc = (const struct ieee80211_meshcntl_ae10 *)&b[hdrlen - meshdrlen];
 	KASSERT(WHDIR(wh) == IEEE80211_FC1_DIR_FROMDS ||
 		WHDIR(wh) == IEEE80211_FC1_DIR_DSTODS,
 	    ("bogus dir, fc 0x%x:0x%x", wh->i_fc[0], wh->i_fc[1]));
 
 	llc = (struct llc *)(mtod(m, caddr_t) + hdrlen);
 	if (llc->llc_dsap == LLC_SNAP_LSAP && llc->llc_ssap == LLC_SNAP_LSAP &&
 	    llc->llc_control == LLC_UI && llc->llc_snap.org_code[0] == 0 &&
 	    llc->llc_snap.org_code[1] == 0 && llc->llc_snap.org_code[2] == 0 &&
 	    /* NB: preserve AppleTalk frames that have a native SNAP hdr */
 	    !(llc->llc_snap.ether_type == htons(ETHERTYPE_AARP) ||
 	      llc->llc_snap.ether_type == htons(ETHERTYPE_IPX))) {
 		/*
 		 * Drop the LLC/SNAP header too; llc == NULL records that
 		 * the ethernet type field must not be rewritten below.
 		 */
 		m_adj(m, hdrlen + sizeof(struct llc) - sizeof(*eh));
 		llc = NULL;
 	} else {
 		m_adj(m, hdrlen - sizeof(*eh));
 	}
 	eh = mtod(m, struct ether_header *);
 	ae = mc->mc_flags & IEEE80211_MESH_AE_MASK;
 	if (WHDIR(wh) == IEEE80211_FC1_DIR_FROMDS) {
 		IEEE80211_ADDR_COPY(eh->ether_dhost, wh->i_addr1);
 		if (ae == IEEE80211_MESH_AE_00) {
 			IEEE80211_ADDR_COPY(eh->ether_shost, wh->i_addr3);
 		} else if (ae == IEEE80211_MESH_AE_01) {
 			IEEE80211_ADDR_COPY(eh->ether_shost,
 			    MC01(mc)->mc_addr4);
 		} else {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    (const struct ieee80211_frame *)wh, NULL,
 			    "bad AE %d", ae);
 			vap->iv_stats.is_mesh_badae++;
 			m_freem(m);
 			return NULL;
 		}
 	} else {
 		if (ae == IEEE80211_MESH_AE_00) {
 			IEEE80211_ADDR_COPY(eh->ether_dhost, wh->i_addr3);
 			IEEE80211_ADDR_COPY(eh->ether_shost, wh->i_addr4);
 		} else if (ae == IEEE80211_MESH_AE_10) {
 			IEEE80211_ADDR_COPY(eh->ether_dhost, mc->mc_addr5);
 			IEEE80211_ADDR_COPY(eh->ether_shost, mc->mc_addr6);
 		} else {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    (const struct ieee80211_frame *)wh, NULL,
 			    "bad AE %d", ae);
 			vap->iv_stats.is_mesh_badae++;
 			m_freem(m);
 			return NULL;
 		}
 	}
 #ifndef __NO_STRICT_ALIGNMENT
 	if (!ALIGNED_POINTER(mtod(m, caddr_t) + sizeof(*eh), uint32_t)) {
 		m = ieee80211_realign(vap, m, sizeof(*eh));
 		if (m == NULL)
 			return NULL;
 	}
 #endif /* !__NO_STRICT_ALIGNMENT */
 	if (llc != NULL) {
 		/* LLC header kept: the type field carries the length. */
 		eh = mtod(m, struct ether_header *);
 		eh->ether_type = htons(m->m_pkthdr.len - sizeof(*eh));
 	}
 	return m;
 #undef	WHDIR
 #undef	MC01
 }
 
 /*
  * Return non-zero if the unicast mesh data frame should be processed
  * locally.  Frames that are not proxy'd have our address, otherwise
  * we need to consult the routing table to look for a proxy entry.
  */
 static __inline int
 mesh_isucastforme(struct ieee80211vap *vap, const struct ieee80211_frame *wh,
     const struct ieee80211_meshcntl *mc)
 {
 	int ae = mc->mc_flags & 3;
 
 	KASSERT((wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) == IEEE80211_FC1_DIR_DSTODS,
 	    ("bad dir 0x%x:0x%x", wh->i_fc[0], wh->i_fc[1]));
 	KASSERT(ae == IEEE80211_MESH_AE_00 || ae == IEEE80211_MESH_AE_10,
 	    ("bad AE %d", ae));
 	if (ae == IEEE80211_MESH_AE_10) {	/* ucast w/ proxy */
 		const struct ieee80211_meshcntl_ae10 *mc10 =
 		    (const struct ieee80211_meshcntl_ae10 *) mc;
 		struct ieee80211_mesh_route *rt =
 		    ieee80211_mesh_rt_find(vap, mc10->mc_addr5);
 		/* check for proxy route to ourself */
 		return (rt != NULL &&
 		    (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY));
 	} else					/* ucast w/o proxy */
 		return IEEE80211_ADDR_EQ(wh->i_addr3, vap->iv_myaddr);
 }
 
 /*
  * Handle an individually addressed mesh data frame whose mesh
  * destination (addr3) is another station: refresh the lifetime of the
  * routes to the mesh destination and mesh source, then forward it.
  *
  * Return value, as seen by mesh_input():
  *  > 0  the frame was handed to mesh_forward(); no local processing
  *  < 0  no route to the mesh destination; the frame is to be discarded
  * This function never returns 0.
  */
 static int
 mesh_recv_indiv_data_to_fwrd(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_qosframe_addr4 *qwh =
 	    (struct ieee80211_qosframe_addr4 *)wh;
 	struct ieee80211_mesh_route *rt;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	/*
 	 * TODO:
 	 * o verify addr2 is  a legitimate transmitter
 	 * o lifetime of precursor of addr3 (addr2) is max(init, curr)
 	 * o lifetime of precursor of addr4 (nexthop) is max(init, curr)
 	 */
 
 	/* addr3 (meshDA): reset the route lifetime to its initial value */
 	rt = ieee80211_mesh_rt_find(vap, qwh->i_addr3);
 	if (rt == NULL) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, qwh->i_addr2,
 		    "no route to meshDA(%6D)", qwh->i_addr3, ":");
 		/*
 		 * [Optional] any of the following three actions:
 		 * o silently discard 				[X]
 		 * o trigger a path discovery			[ ]
 		 * o inform TA that meshDA is unknown.		[ ]
 		 */
 		/* XXX: stats */
 		return (-1);
 	}
 	ieee80211_mesh_rt_update(rt,
 	    ticks_to_msecs(ms->ms_ppath->mpp_inact));
 
 	/* addr4 (meshSA): likewise */
 	rt = ieee80211_mesh_rt_find(vap, qwh->i_addr4);
 	KASSERT(rt != NULL, ("no route"));
 	ieee80211_mesh_rt_update(rt,
 	    ticks_to_msecs(ms->ms_ppath->mpp_inact));
 
 	mesh_forward(vap, m, mc);
 	return (1); /* dont process locally */
 }
 
 /*
  * Handle an individually addressed mesh data frame whose addr3 is our
  * own address.  The lifetime of the route to the mesh source (addr4)
  * is refreshed.  With address extension mode AE_10 the frame may
  * still be destined elsewhere: either onto another mesh path (it is
  * then forwarded here) or out to the DS (it is then delivered locally
  * like any other frame).
  *
  * Return value, as seen by mesh_input():
  *  > 0   the frame was forwarded; no local processing
  *  == 0  the frame is to be processed locally
  * This function never returns a negative value.
  */
 static int
 mesh_recv_indiv_data_to_me(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_qosframe_addr4 *qwh =
 	    (struct ieee80211_qosframe_addr4 *)wh;
 	const struct ieee80211_meshcntl_ae10 *mc10 =
 	    (const struct ieee80211_meshcntl_ae10 *)mc;
 	struct ieee80211_mesh_route *rt;
 	int ae;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	/*
 	 * TODO:
 	 * o verify addr2 is  a legitimate transmitter
 	 * o lifetime of precursor entry is max(init, curr)
 	 */
 
 	/* addr4 (meshSA): reset the route lifetime to its initial value */
 	rt = ieee80211_mesh_rt_find(vap, qwh->i_addr4);
 	KASSERT(rt != NULL, ("no route"));
 	ieee80211_mesh_rt_update(rt, ticks_to_msecs(ms->ms_ppath->mpp_inact));
 
 	ae = mc10->mc_flags & IEEE80211_MESH_AE_MASK;
 	KASSERT(ae == IEEE80211_MESH_AE_00 ||
 	    ae == IEEE80211_MESH_AE_10, ("bad AE %d", ae));
 
 	/* Without extended addresses the frame is simply ours. */
 	if (ae != IEEE80211_MESH_AE_10)
 		return (0); /* process locally */
 
 	/* Extended DA equal to addr3: still ours. */
 	if (IEEE80211_ADDR_EQ(mc10->mc_addr5, qwh->i_addr3))
 		return (0); /* process locally */
 
 	rt = ieee80211_mesh_rt_find(vap, mc10->mc_addr5);
 	if (rt != NULL &&
 	    (rt->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) &&
 	    (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) == 0) {
 		/*
 		 * Forward on another mesh-path, according to
 		 * amendment as specified in 9.32.4.1
 		 */
 		IEEE80211_ADDR_COPY(qwh->i_addr3, mc10->mc_addr5);
 		mesh_forward(vap, m,
 		    (const struct ieee80211_meshcntl *)mc10);
 		return (1); /* dont process locally */
 	}
 
 	/*
 	 * All other cases: forward of MSDUs from the MBSS to DS indiv.
 	 * addressed according to 13.11.3.2.
 	 */
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_OUTPUT, qwh->i_addr2,
 	    "forward frame to DS, SA(%6D) DA(%6D)",
 	    mc10->mc_addr6, ":", mc10->mc_addr5, ":");
 	return (0); /* process locally */
 }
 
 /*
  * Handle group addressed mesh data: try to forward it on to other
  * mesh STAs, and let it be delivered locally (which is also how it
  * reaches the DS when this vap is a member of a bridge).
  *
  * Always returns 0, which mesh_input() takes to mean "process
  * locally"; forwarding failures are not reported to the caller.
  */
 static int
 mesh_recv_group_data(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_frame *wh, const struct ieee80211_meshcntl *mc)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(vap->iv_ic);
 
 	mesh_forward(vap, m, mc);
 
 	/*
 	 * Forward of MSDUs from the MBSS to DS group addressed
 	 * (according to 13.11.3.2)
 	 * This happens by delivering the packet, and a bridge
 	 * will send it on another port member.  Only a debug note
 	 * is emitted here.
 	 */
 	if (mc->mc_ttl > 0 &&
 	    (mc->mc_flags & IEEE80211_MESH_AE_01) &&
 	    (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) &&
 	    (ms->ms_flags & IEEE80211_MESHFLAGS_FWD)) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH,
 		    ((const struct ieee80211_meshcntl_ae01 *)mc)->mc_addr4,
 		    "%s", "forward from MBSS to the DS");
 	}
 	return (0); /* process locally */
 }
 
 /*
  * Input handler for frames received on a mesh vap.
  *
  * ni:   node the frame was received from.
  * m:    the received frame.  Ownership passes to this function: the
  *       mbuf is either handed on (data delivery, A-MSDU decap) or
  *       freed before returning.
  * rxs:  optional receive status, passed through to the sequence check
  *       and the management frame handler.
  * rssi: receive signal strength, nf: noise floor.
  *
  * Data frames are validated, defragmented, checked for duplicates via
  * the mesh sequence number, forwarded and/or decapsulated and
  * delivered.  Management frames go to vap->iv_recv_mgmt; control
  * frames are only counted.
  *
  * Returns the frame type taken from the frame control field, or the
  * "undefined" marker type is initialized with when the frame was
  * rejected before the type could be read.
  */
 static int
 mesh_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 #define	HAS_SEQ(type)	((type & 0x4) == 0)
 #define	MC01(mc)	((const struct ieee80211_meshcntl_ae01 *)mc)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	const struct ieee80211_meshcntl *mc;
 	int hdrspace, meshdrlen, need_tap, error;
 	uint8_t dir, type, subtype, ae;
 	uint32_t seq;
 	const uint8_t *addr;
 	uint8_t qos[2];
 
 	KASSERT(ni != NULL, ("null node"));
 	ni->ni_inact = ni->ni_inact_reload;
 
 	need_tap = 1;			/* mbuf need to be tapped. */
 	type = -1;			/* undefined */
 
 	/* This is called from the RX path - don't hold this lock */
 	IEEE80211_TX_UNLOCK_ASSERT(ic);
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields past outside
 	 * ieee80211_frame_min w/o first validating the data is
 	 * present.
 	*/
 	wh = mtod(m, struct ieee80211_frame *);
 
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version %x", wh->i_fc[0]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		/* Not scanning: update link statistics and check rx seq. */
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		if (HAS_SEQ(type)) {
 			uint8_t tid = ieee80211_gettid(wh);
 
 			if (IEEE80211_QOS_HAS_SEQ(wh) &&
 			    TID_TO_WME_AC(tid) >= WME_AC_VI)
 				ic->ic_wme.wme_hipri_traffic++;
 			if (! ieee80211_check_rxseq(ni, wh, wh->i_addr1, rxs))
 				goto out;
 		}
 	}
 #ifdef IEEE80211_DEBUG
 	/*
 	 * It's easier, but too expensive, to simulate different mesh
 	 * topologies by consulting the ACL policy very early, so do this
 	 * only under DEBUG.
 	 *
 	 * NB: this check is also done upon peering link initiation.
 	 */
 	if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) {
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL,
 		    wh, NULL, "%s", "disallowed by ACL");
 		vap->iv_stats.is_rx_acl++;
 		goto out;
 	}
 #endif
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		if (ni == vap->iv_bss)
 			goto out;
 		if (ni->ni_mlstate != IEEE80211_NODE_MESH_ESTABLISHED) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL,
 			    "peer link not yet established (%d)",
 			    ni->ni_mlstate);
 			vap->iv_stats.is_mesh_nolink++;
 			goto out;
 		}
 		if (dir != IEEE80211_FC1_DIR_FROMDS &&
 		    dir != IEEE80211_FC1_DIR_DSTODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 
 		/* All Mesh data frames are QoS subtype */
 		if (!HAS_SEQ(type)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect subtype 0x%x", subtype);
 			vap->iv_stats.is_rx_badsubtype++;
 			goto err;
 		}
 
 		/*
 		 * Next up, any fragmentation.
 		 * XXX: we defrag before we even try to forward,
 		 * Mesh Control field is not present in sub-sequent
 		 * fragmented frames. This is in contrast to Draft 4.0.
 		 */
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace, 0);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = mtod(m, struct ieee80211_frame *); /* NB: after defrag */
 
 		/*
 		 * Now we have a complete Mesh Data frame.
 		 */
 
 		/*
 		 * Only fromDStoDS data frames use 4 address qos frames
 		 * as specified in amendment. Otherwise addr4 is located
 		 * in the Mesh Control field and a 3 address qos frame
 		 * is used.
 		 */
 		*(uint16_t *)qos = *(uint16_t *)ieee80211_getqos(wh);
 
 		/*
 		 * NB: The mesh STA sets the Mesh Control Present
 		 * subfield to 1 in the Mesh Data frame containing
 		 * an unfragmented MSDU, an A-MSDU, or the first
 		 * fragment of an MSDU.
 		 * After defrag it should always be present.
 		 */
 		if (!(qos[1] & IEEE80211_QOS_MC)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL,
 			    "%s", "Mesh control field not present");
 			vap->iv_stats.is_rx_elem_missing++; /* XXX: kinda */
 			goto err;
 		}
 
 		/* pull up enough to get to the mesh control */
 		if (m->m_len < hdrspace + sizeof(struct ieee80211_meshcntl) &&
 		    (m = m_pullup(m, hdrspace +
 		        sizeof(struct ieee80211_meshcntl))) == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		/* NB: m_pullup may have replaced the leading mbuf */
 		wh = mtod(m, struct ieee80211_frame *);
 		/*
 		 * Now calculate the full extent of the headers. Note
 		 * mesh_decap will pull up anything we didn't get
 		 * above when it strips the 802.11 headers.
 		 */
 		mc = (const struct ieee80211_meshcntl *)
 		    (mtod(m, const uint8_t *) + hdrspace);
 		ae = mc->mc_flags & IEEE80211_MESH_AE_MASK;
 		meshdrlen = sizeof(struct ieee80211_meshcntl) +
 		    ae * IEEE80211_ADDR_LEN;
 		hdrspace += meshdrlen;
 
 		/* pull complete hdrspace = ieee80211_hdrspace + meshcontrol */
 		if ((meshdrlen > sizeof(struct ieee80211_meshcntl)) &&
 		    (m->m_len < hdrspace) &&
 		    ((m = m_pullup(m, hdrspace)) == NULL)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		/*
 		 * The second pullup may also have moved the data, so
 		 * re-derive the header and mesh control pointers from
 		 * the current mbuf before they are used again.
 		 */
 		wh = mtod(m, struct ieee80211_frame *);
 		mc = (const struct ieee80211_meshcntl *)
 		    (mtod(m, const uint8_t *) + hdrspace - meshdrlen);
 
 		seq = le32dec(mc->mc_seq);
 		/* Pick the mesh source address the sequence number belongs to */
 		if (IEEE80211_IS_MULTICAST(wh->i_addr1))
 			addr = wh->i_addr3;
 		else if (ae == IEEE80211_MESH_AE_01)
 			addr = MC01(mc)->mc_addr4;
 		else
 			addr = ((struct ieee80211_qosframe_addr4 *)wh)->i_addr4;
 		if (IEEE80211_ADDR_EQ(vap->iv_myaddr, addr)) {
 			/* Our own frame came back to us. */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    addr, "data", "%s", "not to me");
 			vap->iv_stats.is_rx_wrongbss++;	/* XXX kinda */
 			goto out;
 		}
 		if (mesh_checkpseq(vap, addr, seq) != 0) {
 			vap->iv_stats.is_rx_dup++;
 			goto out;
 		}
 
 		/* This code "routes" the frame to the right control path */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			if (IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr3))
 				error =
 				    mesh_recv_indiv_data_to_me(vap, m, wh, mc);
 			else if (IEEE80211_IS_MULTICAST(wh->i_addr3))
 				error = mesh_recv_group_data(vap, m, wh, mc);
 			else
 				error = mesh_recv_indiv_data_to_fwrd(vap, m,
 				    wh, mc);
 		} else
 			error = mesh_recv_group_data(vap, m, wh, mc);
 		/* < 0: discard as error, > 0: forwarded only, 0: deliver */
 		if (error < 0)
 			goto err;
 		else if (error > 0)
 			goto out;
 
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		need_tap = 0;
 
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = mesh_decap(vap, m, hdrspace, meshdrlen);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		if (qos[0] & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		}
 		ieee80211_deliver_data(vap, ni, m);
 		return type;
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "mgt", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if ((ieee80211_msg_debug(vap) && 
 		    (vap->iv_ic->ic_flags & IEEE80211_F_SCAN)) ||
 		    ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 		if (IEEE80211_IS_PROTECTED(wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "WEP set but not permitted");
 			vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 			goto out;
 		}
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		goto out;
 	case IEEE80211_FC0_TYPE_CTL:
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		goto out;
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "bad", "frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	/* m is NULL here when a callee already consumed the frame. */
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 #undef	HAS_SEQ
 #undef	MC01
 }
 
 static void
 mesh_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_channel *rxchan = ic->ic_curchan;
 	struct ieee80211_frame *wh;
 	struct ieee80211_mesh_route *rt;
 	uint8_t *frm, *efrm;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (uint8_t *)&wh[1];
 	efrm = mtod(m0, uint8_t *) + m0->m_len;
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 	{
 		struct ieee80211_scanparams scan;
 		struct ieee80211_channel *c;
 		/*
 		 * We process beacon/probe response
 		 * frames to discover neighbors.
 		 */
 		if (rxs != NULL) {
 			c = ieee80211_lookup_channel_rxstatus(vap, rxs);
 			if (c != NULL)
 				rxchan = c;
 		}
 		if (ieee80211_parse_beacon(ni, m0, rxchan, &scan) != 0)
 			return;
 		/*
 		 * Count frame now that we know it's to be processed.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_BEACON) {
 			vap->iv_stats.is_rx_beacon++;	/* XXX remove */
 			IEEE80211_NODE_STAT(ni, rx_beacons);
 		} else
 			IEEE80211_NODE_STAT(ni, rx_proberesp);
 		/*
 		 * If scanning, just pass information to the scan module.
 		 */
 		if (ic->ic_flags & IEEE80211_F_SCAN) {
 			if (ic->ic_flags_ext & IEEE80211_FEXT_PROBECHAN) {
 				/*
 				 * Actively scanning a channel marked passive;
 				 * send a probe request now that we know there
 				 * is 802.11 traffic present.
 				 *
 				 * XXX check if the beacon we recv'd gives
 				 * us what we need and suppress the probe req
 				 */
 				ieee80211_probe_curchan(vap, 1);
 				ic->ic_flags_ext &= ~IEEE80211_FEXT_PROBECHAN;
 			}
 			ieee80211_add_scan(vap, rxchan, &scan, wh,
 			    subtype, rssi, nf);
 			return;
 		}
 
 		/* The rest of this code assumes we are running */
 		if (vap->iv_state != IEEE80211_S_RUN)
 			return;
 		/*
 		 * Ignore non-mesh STAs.
 		 */
 		if ((scan.capinfo &
 		     (IEEE80211_CAPINFO_ESS|IEEE80211_CAPINFO_IBSS)) ||
 		    scan.meshid == NULL || scan.meshconf == NULL) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "beacon", "%s", "not a mesh sta");
 			vap->iv_stats.is_mesh_wrongmesh++;
 			return;
 		}
 		/*
 		 * Ignore STAs for other mesh networks.
 		 */
 		if (memcmp(scan.meshid+2, ms->ms_id, ms->ms_idlen) != 0 ||
 		    mesh_verify_meshconf(vap, scan.meshconf)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "beacon", "%s", "not for our mesh");
 			vap->iv_stats.is_mesh_wrongmesh++;
 			return;
 		}
 		/*
 		 * Peer only based on the current ACL policy.
 		 */
 		if (vap->iv_acl != NULL && !vap->iv_acl->iac_check(vap, wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ACL,
 			    wh, NULL, "%s", "disallowed by ACL");
 			vap->iv_stats.is_rx_acl++;
 			return;
 		}
 		/*
 		 * Do neighbor discovery.
 		 */
 		if (!IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_macaddr)) {
 			/*
 			 * Create a new entry in the neighbor table.
 			 */
 			ni = ieee80211_add_neighbor(vap, wh, &scan);
 		}
 		/*
 		 * Automatically peer with discovered nodes if possible.
 		 */
 		if (ni != vap->iv_bss &&
 		    (ms->ms_flags & IEEE80211_MESHFLAGS_AP)) {
 			switch (ni->ni_mlstate) {
 			case IEEE80211_NODE_MESH_IDLE:
 			{
 				uint16_t args[1];
 
 				/* Wait for backoff callout to reset counter */
 				if (ni->ni_mlhcnt >= ieee80211_mesh_maxholding)
 					return;
 
 				ni->ni_mlpid = mesh_generateid(vap);
 				if (ni->ni_mlpid == 0)
 					return;
 				mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENSNT);
 				args[0] = ni->ni_mlpid;
 				ieee80211_send_action(ni,
 				IEEE80211_ACTION_CAT_SELF_PROT,
 				IEEE80211_ACTION_MESHPEERING_OPEN, args);
 				ni->ni_mlrcnt = 0;
 				mesh_peer_timeout_setup(ni);
 				break;
 			}
 			case IEEE80211_NODE_MESH_ESTABLISHED:
 			{
 				/*
 				 * Valid beacon from a peer mesh STA
 				 * bump TA lifetime
 				 */
 				rt = ieee80211_mesh_rt_find(vap, wh->i_addr2);
 				if(rt != NULL) {
 					ieee80211_mesh_rt_update(rt,
 					    ticks_to_msecs(
 					    ms->ms_ppath->mpp_inact));
 				}
 				break;
 			}
 			default:
 				break; /* ignore */
 			}
 		}
 		break;
 	}
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 	{
 		uint8_t *ssid, *meshid, *rates, *xrates;
 
 		if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (IEEE80211_IS_MULTICAST(wh->i_addr2)) {
 			/* frame must be directed */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not unicast");
 			vap->iv_stats.is_rx_mgtdiscard++;	/* XXX stat */
 			return;
 		}
 		/*
 		 * prreq frame format
 		 *      [tlv] ssid
 		 *      [tlv] supported rates
 		 *      [tlv] extended supported rates
 		 *	[tlv] mesh id
 		 */
 		ssid = meshid = rates = xrates = NULL;
 		while (efrm - frm > 1) {
 			IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return);
 			switch (*frm) {
 			case IEEE80211_ELEMID_SSID:
 				ssid = frm;
 				break;
 			case IEEE80211_ELEMID_RATES:
 				rates = frm;
 				break;
 			case IEEE80211_ELEMID_XRATES:
 				xrates = frm;
 				break;
 			case IEEE80211_ELEMID_MESHID:
 				meshid = frm;
 				break;
 			}
 			frm += frm[1] + 2;
 		}
 		IEEE80211_VERIFY_ELEMENT(ssid, IEEE80211_NWID_LEN, return);
 		IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return);
 		if (xrates != NULL)
 			IEEE80211_VERIFY_ELEMENT(xrates,
 			    IEEE80211_RATE_MAXSIZE - rates[1], return);
 		if (meshid != NULL) {
 			IEEE80211_VERIFY_ELEMENT(meshid,
 			    IEEE80211_MESHID_LEN, return);
 			/* NB: meshid, not ssid */
 			IEEE80211_VERIFY_SSID(vap->iv_bss, meshid, return);
 		}
 
 		/* XXX find a better class or define it's own */
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_INPUT, wh->i_addr2,
 		    "%s", "recv probe req");
 		/*
 		 * Some legacy 11b clients cannot hack a complete
 		 * probe response frame.  When the request includes
 		 * only a bare-bones rate set, communicate this to
 		 * the transmit side.
 		 */
 		ieee80211_send_proberesp(vap, wh->i_addr2, 0);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "unknown node");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1) &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
 
 /*
  * Control frame input for a mesh vap.  Only Block Ack Request frames are
  * acted upon (handed to ieee80211_recv_bar()); any other subtype is
  * dropped without further processing.
  */
 static void
 mesh_recv_ctl(struct ieee80211_node *ni, struct mbuf *m, int subtype)
 {
 
 	if (subtype == IEEE80211_FC0_SUBTYPE_BAR)
 		ieee80211_recv_bar(ni, m);
 }
 
 /*
  * Parse meshpeering action ie's for MPM frames.
  *
  * frm/efrm delimit the element area of the received frame, subtype is the
  * peering action (open, confirm or close) and mp is caller-provided
  * storage that receives a host-order copy of the Mesh Peering Management
  * element.
  *
  * Returns mp when the frame passes validation.  Returns NULL when an
  * element overruns the frame or when validation fails; in the latter
  * case a CLOSE may be sent to the peer, depending on the current link
  * state of ni.
  */
 static const struct ieee80211_meshpeer_ie *
 mesh_parse_meshpeering_action(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,	/* XXX for VERIFY_LENGTH */
 	const uint8_t *frm, const uint8_t *efrm,
 	struct ieee80211_meshpeer_ie *mp, uint8_t subtype)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	const struct ieee80211_meshpeer_ie *mpie;
 	uint16_t args[3];
 	const uint8_t *meshid, *meshconf;
 	uint8_t sendclose = 0; /* 1 = MPM frame rejected, close will be sent */
 
 	/*
 	 * Start from a zeroed result so that a frame carrying no MPM
 	 * element is validated against known values instead of whatever
 	 * happened to be in the caller's storage.
 	 */
 	memset(mp, 0, sizeof(*mp));
 
 	meshid = meshconf = NULL;
 	while (efrm - frm > 1) {
 		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return NULL);
 		switch (*frm) {
 		case IEEE80211_ELEMID_MESHID:
 			meshid = frm;
 			break;
 		case IEEE80211_ELEMID_MESHCONF:
 			meshconf = frm;
 			break;
 		case IEEE80211_ELEMID_MESHPEER:
 			mpie = (const struct ieee80211_meshpeer_ie *) frm;
 			memset(mp, 0, sizeof(*mp));
 			mp->peer_len = mpie->peer_len;
 			/*
 			 * Decode only the fields the element is long enough
 			 * to hold; anything else stays zero and peer_len is
 			 * left for mesh_verify_meshpeer() to judge.
 			 * NOTE(review): assumes IEEE80211_MPM_BASE_SZ covers
 			 * the protocol and local link ID fields -- confirm.
 			 */
 			if (mpie->peer_len < IEEE80211_MPM_BASE_SZ)
 				break;
 			mp->peer_proto = le16dec(&mpie->peer_proto);
 			mp->peer_llinkid = le16dec(&mpie->peer_llinkid);
 			if (mpie->peer_len < IEEE80211_MPM_BASE_SZ + 2)
 				break;
 			switch (subtype) {
 			case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 				mp->peer_linkid =
 				    le16dec(&mpie->peer_linkid);
 				break;
 			case IEEE80211_ACTION_MESHPEERING_CLOSE:
 				/* NB: peer link ID is optional */
 				if (mpie->peer_len ==
 				    (IEEE80211_MPM_BASE_SZ + 2)) {
 					mp->peer_linkid = 0;
 					mp->peer_rcode =
 					    le16dec(&mpie->peer_linkid);
 				} else if (mpie->peer_len >=
 				    (IEEE80211_MPM_BASE_SZ + 4)) {
 					mp->peer_linkid =
 					    le16dec(&mpie->peer_linkid);
 					mp->peer_rcode =
 					    le16dec(&mpie->peer_rcode);
 				}
 				break;
 			}
 			break;
 		}
 		frm += frm[1] + 2;
 	}
 
 	/*
 	 * Verify the contents of the frame.
 	 * If it fails validation, close the peer link.
 	 */
 	if (mesh_verify_meshpeer(vap, subtype, (const uint8_t *)mp)) {
 		sendclose = 1;
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "%s", "MPM validation failed");
 	}
 
 	/* If meshid is not the same reject any frames type. */
 	if (sendclose == 0 && mesh_verify_meshid(vap, meshid)) {
 		sendclose = 1;
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "%s", "not for our mesh");
 		if (subtype == IEEE80211_ACTION_MESHPEERING_CLOSE) {
 			/*
 			 * Standard not clear about this, if we dont ignore
 			 * there will be an endless loop between nodes sending
 			 * CLOSE frames between each other with wrong meshid.
 			 * Discard and timers will bring FSM to IDLE state.
 			 */
 			return NULL;
 		}
 	}
 
 	/*
 	 * Close frames are accepted if meshid is the same.
 	 * Verify the other two types.
 	 */
 	if (sendclose == 0 && subtype != IEEE80211_ACTION_MESHPEERING_CLOSE &&
 	    mesh_verify_meshconf(vap, meshconf)) {
 		sendclose = 1;
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "%s", "configuration mismatch");
 	}
 
 	if (sendclose) {
 		vap->iv_stats.is_rx_mgtdiscard++;
 		switch (ni->ni_mlstate) {
 		case IEEE80211_NODE_MESH_IDLE:
 		case IEEE80211_NODE_MESH_ESTABLISHED:
 		case IEEE80211_NODE_MESH_HOLDING:
 			/* ignore */
 			break;
 		case IEEE80211_NODE_MESH_OPENSNT:
 		case IEEE80211_NODE_MESH_OPENRCV:
 		case IEEE80211_NODE_MESH_CONFIRMRCV:
 			args[0] = ni->ni_mlpid;
 			args[1] = ni->ni_mllid;
 			/* Reason codes for rejection */
 			switch (subtype) {
 			case IEEE80211_ACTION_MESHPEERING_OPEN:
 				args[2] = IEEE80211_REASON_MESH_CPVIOLATION;
 				break;
 			case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 			default:
 				/*
 				 * A rejected CLOSE used to reach the send
 				 * below with an uninitialized reason code.
 				 * NOTE(review): reason chosen for that case
 				 * is a best guess -- confirm.
 				 */
 				args[2] = IEEE80211_REASON_MESH_INCONS_PARAMS;
 				break;
 			}
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			/* state must change first: the timeout depends on it */
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		return NULL;
 	}
 
 	return (const struct ieee80211_meshpeer_ie *) mp;
 }
 
 static int
 mesh_recv_action_meshpeering_open(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshpeer_ie ie;
 	const struct ieee80211_meshpeer_ie *meshpeer;
 	uint16_t args[3];
 
 	/* +2+2 for action + code + capabilites */
 	meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2+2, efrm, &ie,
 	    IEEE80211_ACTION_MESHPEERING_OPEN);
 	if (meshpeer == NULL) {
 		return 0;
 	}
 
 	/* XXX move up */
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "recv PEER OPEN, lid 0x%x", meshpeer->peer_llinkid);
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_IDLE:
 		/* Reject open request if reached our maximum neighbor count */
 		if (ms->ms_neighbors >= IEEE80211_MESH_MAX_NEIGHBORS) {
 			args[0] = meshpeer->peer_llinkid;
 			args[1] = 0;
 			args[2] = IEEE80211_REASON_MESH_MAX_PEERS;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			/* stay in IDLE state */
 			return (0);
 		}
 		/* Open frame accepted */
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENRCV);
 		ni->ni_mllid = meshpeer->peer_llinkid;
 		ni->ni_mlpid = mesh_generateid(vap);
 		if (ni->ni_mlpid == 0)
 			return 0;		/* XXX */
 		args[0] = ni->ni_mlpid;
 		/* Announce we're open too... */
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_OPEN, args);
 		/* ...and confirm the link. */
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_OPENRCV:
 		/* Wrong Link ID */
 		if (ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mllid;
 			args[1] = ni->ni_mlpid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		/* Duplicate open, confirm again. */
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		break;
 	case IEEE80211_NODE_MESH_OPENSNT:
 		ni->ni_mllid = meshpeer->peer_llinkid;
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_OPENRCV);
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		/* NB: don't setup/clear any timeout */
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		if (ni->ni_mlpid != meshpeer->peer_linkid ||
 		    ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mlpid;
 			args[1] = ni->ni_mllid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni,
 			    IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_ESTABLISHED);
 		ni->ni_mllid = meshpeer->peer_llinkid;
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		mesh_peer_timeout_stop(ni);
 		break;
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		if (ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mllid;
 			args[1] = ni->ni_mlpid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 			break;
 		}
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		args[0] = ni->ni_mlpid;
 		args[1] = meshpeer->peer_llinkid;
 		/* Standard not clear about what the reaason code should be */
 		args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 		break;
 	}
 	return 0;
 }
 
 static int
 mesh_recv_action_meshpeering_confirm(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_meshpeer_ie ie;
 	const struct ieee80211_meshpeer_ie *meshpeer;
 	uint16_t args[3];
 
 	/* +2+2+2+2 for action + code + capabilites + status code + AID */
 	meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2+2+2+2, efrm, &ie,
 	    IEEE80211_ACTION_MESHPEERING_CONFIRM);
 	if (meshpeer == NULL) {
 		return 0;
 	}
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "recv PEER CONFIRM, local id 0x%x, peer id 0x%x",
 	    meshpeer->peer_llinkid, meshpeer->peer_linkid);
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_OPENRCV:
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_ESTABLISHED);
 		mesh_peer_timeout_stop(ni);
 		break;
 	case IEEE80211_NODE_MESH_OPENSNT:
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_CONFIRMRCV);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		args[0] = ni->ni_mlpid;
 		args[1] = meshpeer->peer_llinkid;
 		/* Standard not clear about what the reaason code should be */
 		args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		if (ni->ni_mllid != meshpeer->peer_llinkid) {
 			args[0] = ni->ni_mlpid;
 			args[1] = ni->ni_mllid;
 			args[2] = IEEE80211_REASON_PEER_LINK_CANCELED;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE,
 			    args);
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 		}
 		break;
 	default:
 		IEEE80211_DISCARD(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    wh, NULL, "received confirm in invalid state %d",
 		    ni->ni_mlstate);
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 	}
 	return 0;
 }
 
 static int
 mesh_recv_action_meshpeering_close(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211_meshpeer_ie ie;
 	const struct ieee80211_meshpeer_ie *meshpeer;
 	uint16_t args[3];
 
 	/* +2 for action + code */
 	meshpeer = mesh_parse_meshpeering_action(ni, wh, frm+2, efrm, &ie,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE);
 	if (meshpeer == NULL) {
 		return 0;
 	}
 
 	/*
 	 * XXX: check reason code, for example we could receive
 	 * IEEE80211_REASON_MESH_MAX_PEERS then we should not attempt
 	 * to peer again.
 	 */
 
 	IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 	    ni, "%s", "recv PEER CLOSE");
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_IDLE:
 		/* ignore */
 		break;
 	case IEEE80211_NODE_MESH_OPENRCV:
 	case IEEE80211_NODE_MESH_OPENSNT:
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		args[0] = ni->ni_mlpid;
 		args[1] = ni->ni_mllid;
 		args[2] = IEEE80211_REASON_MESH_CLOSE_RCVD;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE,
 		    args);
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_IDLE);
 		mesh_peer_timeout_stop(ni);
 		break;
 	}
 	return 0;
 }
 
 /*
  * Link Metric handling.
  */
 static int
 mesh_recv_action_meshlmetric(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	const struct ieee80211_meshlmetric_ie *ie =
 	    (const struct ieee80211_meshlmetric_ie *)
 	    (frm+2); /* action + code */
 	struct ieee80211_meshlmetric_ie lm_rep;
 
 	if (ie->lm_flags & IEEE80211_MESH_LMETRIC_FLAGS_REQ) {
 		lm_rep.lm_flags = 0;
 		lm_rep.lm_metric = mesh_airtime_calc(ni);
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_MESH,
 		    IEEE80211_ACTION_MESH_LMETRIC,
 		    &lm_rep);
 	}
 	/* XXX: else do nothing for now */
 	return 0;
 }
 
 /*
  * Parse meshgate action ie's for GANN frames.
  * Returns -1 if parsing fails, otherwise 0.
  */
 static int
 mesh_parse_meshgate_action(struct ieee80211_node *ni,
     const struct ieee80211_frame *wh,	/* XXX for VERIFY_LENGTH */
     struct ieee80211_meshgann_ie *ie, const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	const struct ieee80211_meshgann_ie *gannie;
 
 	while (efrm - frm > 1) {
 		IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return -1);
 		switch (*frm) {
 		case IEEE80211_ELEMID_MESHGANN:
 			gannie = (const struct ieee80211_meshgann_ie *) frm;
 			memset(ie, 0, sizeof(*ie));
 			ie->gann_ie = gannie->gann_ie;
 			ie->gann_len = gannie->gann_len;
 			ie->gann_flags = gannie->gann_flags;
 			ie->gann_hopcount = gannie->gann_hopcount;
 			ie->gann_ttl = gannie->gann_ttl;
 			IEEE80211_ADDR_COPY(ie->gann_addr, gannie->gann_addr);
 			ie->gann_seq = le32dec(&gannie->gann_seq);
 			ie->gann_interval = le16dec(&gannie->gann_interval);
 			break;
 		}
 		frm += frm[1] + 2;
 	}
 
 	return 0;
 }
 
 /*
  * Mesh Gate Announcement handling.
  *
  * Parses the GANN element, records the announcing gate (and the sequence
  * number of its latest announcement) in the list of known gates, marks an
  * existing valid route to it as a gate route, and finally re-broadcasts
  * the announcement with hop count and TTL adjusted.  Announcements for
  * our own address, with a stale sequence number, or that fail to parse
  * are dropped.  Always returns 0.
  */
 static int
 mesh_recv_action_meshgate(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const uint8_t *frm, const uint8_t *efrm)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_mesh_gate_route *gr, *next;
 	struct ieee80211_mesh_route *rt_gate;
 	struct ieee80211_meshgann_ie pgann;
 	struct ieee80211_meshgann_ie ie;
 	int found = 0;
 
 	/* +2 for action + code */
 	if (mesh_parse_meshgate_action(ni, wh, &ie, frm+2, efrm) != 0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 		    ni->ni_macaddr, NULL, "%s",
 		    "GANN parsing failed");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		return (0);
 	}
 
 	if (IEEE80211_ADDR_EQ(vap->iv_myaddr, ie.gann_addr))
 		return 0;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ni->ni_macaddr,
 	    "received GANN, meshgate: %6D (seq %u)", ie.gann_addr, ":",
 	    ie.gann_seq);
 
 	if (ms == NULL)
 		return (0);
 	MESH_RT_LOCK(ms);
 	TAILQ_FOREACH_SAFE(gr, &ms->ms_known_gates, gr_next, next) {
 		if (!IEEE80211_ADDR_EQ(gr->gr_addr, ie.gann_addr))
 			continue;
 		if (ie.gann_seq <= gr->gr_lastseq) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL,
 			    "GANN old seqno %u <= %u",
 			    ie.gann_seq, gr->gr_lastseq);
 			MESH_RT_UNLOCK(ms);
 			return (0);
 		}
 		/* corresponding mesh gate found & GANN accepted */
 		found = 1;
 		break;
 	}
 	if (found == 0) {
 		/* this GANN is from a new mesh Gate add it to known table. */
 		gr = IEEE80211_MALLOC(ALIGN(sizeof(struct ieee80211_mesh_gate_route)),
 		    M_80211_MESH_GT_RT,
 		    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 		if (gr == NULL) {
 			/* a no-wait allocation may fail; drop the frame */
 			MESH_RT_UNLOCK(ms);
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_MESH,
 			    ni->ni_macaddr, NULL, "%s",
 			    "GANN dropped, no memory for gate entry");
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return (0);
 		}
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ie.gann_addr,
 		    "stored new GANN information, seq %u.", ie.gann_seq);
 		IEEE80211_ADDR_COPY(gr->gr_addr, ie.gann_addr);
 		TAILQ_INSERT_TAIL(&ms->ms_known_gates, gr, gr_next);
 	}
 	gr->gr_lastseq = ie.gann_seq;
 
 	/* check if we have a path to this gate */
 	rt_gate = mesh_rt_find_locked(ms, gr->gr_addr);
 	if (rt_gate != NULL &&
 	    rt_gate->rt_flags & IEEE80211_MESHRT_FLAGS_VALID) {
 		gr->gr_route = rt_gate;
 		rt_gate->rt_flags |= IEEE80211_MESHRT_FLAGS_GATE;
 	}
 
 	MESH_RT_UNLOCK(ms);
 
 	/*
 	 * Propagate only if decremented ttl >= 1 && forwarding is enabled,
 	 * i.e. stop as soon as either condition fails.  Testing both with
 	 * && would forward an expired announcement whenever forwarding is
 	 * on, and let the TTL computed below wrap around.
 	 */
 	if ((ie.gann_ttl - 1) < 1 || !(ms->ms_flags & IEEE80211_MESHFLAGS_FWD))
 		return 0;
 	/* fields not assigned below are passed on as zero, not garbage */
 	memset(&pgann, 0, sizeof(pgann));
 	pgann.gann_flags = ie.gann_flags; /* Reserved */
 	pgann.gann_hopcount = ie.gann_hopcount + 1;
 	pgann.gann_ttl = ie.gann_ttl - 1;
 	IEEE80211_ADDR_COPY(pgann.gann_addr, ie.gann_addr);
 	pgann.gann_seq = ie.gann_seq;
 	pgann.gann_interval = ie.gann_interval;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_MESH, ie.gann_addr,
 	    "%s", "propagate GANN");
 
 	ieee80211_send_action(vap->iv_bss, IEEE80211_ACTION_CAT_MESH,
 	    IEEE80211_ACTION_MESH_GANN, &pgann);
 
 	return 0;
 }
 
 /*
  * Common transmit path for mesh action frames.
  *
  * Prepends an 802.11 management header to m (source sa, destination da),
  * fills in the raw transmit parameters and hands the frame to
  * ieee80211_raw_output().  On the failure paths handled here the node
  * reference held on ni is released; m is freed when sending is blocked
  * in CAC state.
  *
  * Returns EIO when the vap is in CAC state, ENOMEM when no room for the
  * header could be obtained, otherwise the result of
  * ieee80211_raw_output().
  */
 static int
 mesh_send_action(struct ieee80211_node *ni,
     const uint8_t sa[IEEE80211_ADDR_LEN],
     const uint8_t da[IEEE80211_ADDR_LEN],
     struct mbuf *m)
 {
 	struct ieee80211vap *vap;
 	struct ieee80211com *ic;
 	struct ieee80211_bpf_params params;
 	int ret;
 
 	/* assert before the first dereference so the check can fire */
 	KASSERT(ni != NULL, ("null node"));
 	vap = ni->ni_vap;
 	ic = ni->ni_ic;
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni,
 		    "block %s frame in CAC state", "Mesh action");
 		vap->iv_stats.is_tx_badstate++;
 		ieee80211_free_node(ni);
 		m_freem(m);
 		return EIO;		/* XXX */
 	}
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), IEEE80211_M_NOWAIT);
 	if (m == NULL) {
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 	ieee80211_send_setup(ni, m,
 	     IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_ACTION,
 	     IEEE80211_NONQOS_TID, sa, da, sa);
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	memset(&params, 0, sizeof(params));
 	params.ibp_pri = WME_AC_VO;
 	params.ibp_rate0 = ni->ni_txparms->mgmtrate;
 	/* group addressed frames get a single transmit attempt */
 	if (IEEE80211_IS_MULTICAST(da))
 		params.ibp_try0 = 1;
 	else
 		params.ibp_try0 = ni->ni_txparms->maxretry;
 	params.ibp_power = ni->ni_txpower;
 
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	ret = ieee80211_raw_output(vap, ni, m, &params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 /*
  * Append a 16-bit (ADDSHORT) or 32-bit (ADDWORD) value to the buffer at
  * frm in little-endian byte order and advance frm past it.  frm must be
  * a modifiable pointer to bytes.  The value is evaluated exactly once,
  * so it may be a function call or an expression with side effects.
  */
 #define	ADDSHORT(frm, v) do {			\
 	const uint16_t _v16 = (v);		\
 	(frm)[0] = _v16 & 0xff;			\
 	(frm)[1] = (_v16 >> 8) & 0xff;		\
 	(frm) += 2;				\
 } while (0)
 #define	ADDWORD(frm, v) do {			\
 	const uint32_t _v32 = (v);		\
 	(frm)[0] = _v32 & 0xff;			\
 	(frm)[1] = (_v32 >> 8) & 0xff;		\
 	(frm)[2] = (_v32 >> 16) & 0xff;		\
 	(frm)[3] = (_v32 >> 24) & 0xff;		\
 	(frm) += 4;				\
 } while (0)
 
 static int
 mesh_send_action_meshpeering_open(struct ieee80211_node *ni,
 	int category, int action, void *args0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t *args = args0;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "send PEER OPEN action: localid 0x%x", args[0]);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t)	/* action+category */
 	    + sizeof(uint16_t)	/* capabilites */
 	    + 2 + IEEE80211_RATE_SIZE
 	    + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	    + 2 + IEEE80211_MESHID_LEN
 	    + sizeof(struct ieee80211_meshconf_ie)
 	    + sizeof(struct ieee80211_meshpeer_ie)
 	);
 	if (m != NULL) {
 		/*
 		 * mesh peer open action frame format:
 		 *   [1] category
 		 *   [1] action
 		 *   [2] capabilities
 		 *   [tlv] rates
 		 *   [tlv] xrates
 		 *   [tlv] mesh id
 		 *   [tlv] mesh conf
 		 *   [tlv] mesh peer link mgmt
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		ADDSHORT(frm, ieee80211_getcapinfo(vap, ni->ni_chan));
 		rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 		frm = ieee80211_add_rates(frm, rs);
 		frm = ieee80211_add_xrates(frm, rs);
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshconf(frm, vap);
 		frm = ieee80211_add_meshpeer(frm, IEEE80211_ACTION_MESHPEERING_OPEN,
 		    args[0], 0, 0);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 static int
 mesh_send_action_meshpeering_confirm(struct ieee80211_node *ni,
 	int category, int action, void *args0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	uint16_t *args = args0;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "send PEER CONFIRM action: localid 0x%x, peerid 0x%x",
 	    args[0], args[1]);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t)	/* action+category */
 	    + sizeof(uint16_t)	/* capabilites */
 	    + sizeof(uint16_t)	/* status code */
 	    + sizeof(uint16_t)	/* AID */
 	    + 2 + IEEE80211_RATE_SIZE
 	    + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	    + 2 + IEEE80211_MESHID_LEN
 	    + sizeof(struct ieee80211_meshconf_ie)
 	    + sizeof(struct ieee80211_meshpeer_ie)
 	);
 	if (m != NULL) {
 		/*
 		 * mesh peer confirm action frame format:
 		 *   [1] category
 		 *   [1] action
 		 *   [2] capabilities
 		 *   [2] status code
 		 *   [2] association id (peer ID)
 		 *   [tlv] rates
 		 *   [tlv] xrates
 		 *   [tlv] mesh id
 		 *   [tlv] mesh conf
 		 *   [tlv] mesh peer link mgmt
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		ADDSHORT(frm, ieee80211_getcapinfo(vap, ni->ni_chan));
 		ADDSHORT(frm, 0);		/* status code */
 		ADDSHORT(frm, args[1]);		/* AID */
 		rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 		frm = ieee80211_add_rates(frm, rs);
 		frm = ieee80211_add_xrates(frm, rs);
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshconf(frm, vap);
 		frm = ieee80211_add_meshpeer(frm,
 		    IEEE80211_ACTION_MESHPEERING_CONFIRM,
 		    args[0], args[1], 0);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 /*
  * Build and transmit a Mesh Peering Close action frame to ni.
  *
  * args0 points to an array of three uint16_t: the local link ID, the
  * peer link ID and the reason code.  A node reference is taken for the
  * transmit path and dropped again here if no frame buffer can be
  * obtained.
  *
  * Returns ENOMEM when no buffer is available, otherwise the result of
  * mesh_send_action().
  */
 static int
 mesh_send_action_meshpeering_close(struct ieee80211_node *ni,
 	int category, int action, void *args0)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211vap *vap = ni->ni_vap;
 	const uint16_t *ids = args0;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH, ni,
 	    "send PEER CLOSE action: localid 0x%x, peerid 0x%x reason %d (%s)",
 	    ids[0], ids[1], ids[2], ieee80211_reason_to_string(ids[2]));
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t)	/* action+category */
 	    + sizeof(uint16_t)	/* reason code */
 	    + 2 + IEEE80211_MESHID_LEN
 	    + sizeof(struct ieee80211_meshpeer_ie)
 	);
 	if (m == NULL) {
 		/* no buffer: account for it and give the reference back */
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	/*
 	 * mesh peer close action frame format:
 	 *   [1] category
 	 *   [1] action
 	 *   [tlv] mesh id
 	 *   [tlv] mesh peer link mgmt
 	 */
 	*frm++ = category;
 	*frm++ = action;
 	frm = ieee80211_add_meshid(frm, vap);
 	frm = ieee80211_add_meshpeer(frm,
 	    IEEE80211_ACTION_MESHPEERING_CLOSE,
 	    ids[0], ids[1], ids[2]);
 	/* frame length is however far the write pointer has advanced */
 	m->m_len = frm - mtod(m, uint8_t *);
 	m->m_pkthdr.len = m->m_len;
 	return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 }
 
 /*
  * Build and transmit a Mesh Link Metric action frame to ni.
  *
  * arg0 points to a struct ieee80211_meshlmetric_ie whose lm_flags and
  * lm_metric fields supply the element contents.  A node reference is
  * taken for the transmit path and dropped again here if no frame buffer
  * can be obtained.
  *
  * Returns ENOMEM when no buffer is available, otherwise the result of
  * mesh_send_action().
  */
 static int
 mesh_send_action_meshlmetric(struct ieee80211_node *ni,
 	int category, int action, void *arg0)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_meshlmetric_ie *lm = arg0;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	if ((lm->lm_flags & IEEE80211_MESH_LMETRIC_FLAGS_REQ) == 0) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    ni, "send LINK METRIC REPLY action: metric 0x%x",
 		    lm->lm_metric);
 	} else {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    ni, "%s", "send LINK METRIC REQUEST action");
 	}
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t) +	/* action+category */
 	    sizeof(struct ieee80211_meshlmetric_ie)
 	);
 	if (m == NULL) {
 		/* no buffer: account for it and give the reference back */
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	/*
 	 * mesh link metric
 	 *   [1] category
 	 *   [1] action
 	 *   [tlv] mesh link metric
 	 */
 	*frm++ = category;
 	*frm++ = action;
 	frm = ieee80211_add_meshlmetric(frm,
 	    lm->lm_flags, lm->lm_metric);
 	/* frame length is however far the write pointer has advanced */
 	m->m_len = frm - mtod(m, uint8_t *);
 	m->m_pkthdr.len = m->m_len;
 	return mesh_send_action(ni, vap->iv_myaddr, ni->ni_macaddr, m);
 }
 
 static int
 mesh_send_action_meshgate(struct ieee80211_node *ni,
 	int category, int action, void *arg0)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_meshgann_ie *ie = arg0;
 	struct mbuf *m;
 	uint8_t *frm;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n", __func__, __LINE__,
 	    ni, ether_sprintf(ni->ni_macaddr), ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame),
 	    sizeof(uint16_t) +	/* action+category */
 	    IEEE80211_MESHGANN_BASE_SZ
 	);
 	if (m != NULL) {
 		/*
 		 * mesh link metric
 		 *   [1] category
 		 *   [1] action
 		 *   [tlv] mesh gate announcement
 		 */
 		*frm++ = category;
 		*frm++ = action;
 		frm = ieee80211_add_meshgate(frm, ie);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		return mesh_send_action(ni, vap->iv_myaddr, broadcastaddr, m);
 	} else {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 }
 
 /*
  * Select the peer link timer value that matches the current link state
  * of ni and (re)arm the timer with it.  A resulting value of zero, as
  * chosen for the IDLE state, leaves the timer untouched.
  */
 static void
 mesh_peer_timeout_setup(struct ieee80211_node *ni)
 {
 	if (ni->ni_mlstate == IEEE80211_NODE_MESH_HOLDING)
 		ni->ni_mltval = ieee80211_mesh_holdingtimeout;
 	else if (ni->ni_mlstate == IEEE80211_NODE_MESH_CONFIRMRCV)
 		ni->ni_mltval = ieee80211_mesh_confirmtimeout;
 	else if (ni->ni_mlstate == IEEE80211_NODE_MESH_IDLE)
 		ni->ni_mltval = 0;
 	else
 		ni->ni_mltval = ieee80211_mesh_retrytimeout;
 
 	if (ni->ni_mltval == 0)
 		return;
 	callout_reset(&ni->ni_mltimer, ni->ni_mltval,
 	    mesh_peer_timeout_cb, ni);
 }
 
 /*
  * Re-arm the peer link timer with a randomized back-off: the current
  * timer value grows by a random amount smaller than itself, i.e. by 50%
  * on average.
  *
  * A timer value of zero means no timeout is configured (see
  * mesh_peer_timeout_setup(), which does not arm the timer in that case);
  * nothing is armed then, which also keeps the modulo below from dividing
  * by zero.
  */
 static void
 mesh_peer_timeout_backoff(struct ieee80211_node *ni)
 {
 	uint32_t r;
 
 	if (ni->ni_mltval == 0)
 		return;
 
 	r = arc4random();
 	ni->ni_mltval += r % ni->ni_mltval;
 	callout_reset(&ni->ni_mltimer, ni->ni_mltval, mesh_peer_timeout_cb,
 	    ni);
 }
 
 /*
  * Stop the peer link management timer of ni by draining its callout.
  */
 static __inline void
 mesh_peer_timeout_stop(struct ieee80211_node *ni)
 {
 	callout_drain(&ni->ni_mltimer);
 }
 
 /*
  * Callout handler run once the back-off period has expired; arg is the
  * node whose holding counter is cleared so that automatic peering may be
  * attempted again.
  */
 static void
 mesh_peer_backoff_cb(void *arg)
 {
 	struct ieee80211_node *peer = arg;
 
 	peer->ni_mlhcnt = 0;
 }
 
 /*
  * Mesh Peer Link Management FSM timeout handling.
  *
  * Callout handler; arg is the node whose peer link timer fired.
  * Depending on the link state the OPEN is retransmitted with a
  * randomized back-off, the link is closed and moved to HOLDING, or a
  * link in HOLDING returns to IDLE.
  *
  * args[] passed to ieee80211_send_action(): [0] local link ID,
  * [1] peer link ID, [2] reason code (CLOSE only).
  */
 static void
 mesh_peer_timeout_cb(void *arg)
 {
 	struct ieee80211_node *ni = (struct ieee80211_node *)arg;
 	uint16_t args[3];
 
 	IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_MESH,
 	    ni, "mesh link timeout, state %d, retry counter %d",
 	    ni->ni_mlstate, ni->ni_mlrcnt);
 
 	switch (ni->ni_mlstate) {
 	case IEEE80211_NODE_MESH_IDLE:
 	case IEEE80211_NODE_MESH_ESTABLISHED:
 		break;
 	case IEEE80211_NODE_MESH_OPENSNT:
 	case IEEE80211_NODE_MESH_OPENRCV:
 		if (ni->ni_mlrcnt == ieee80211_mesh_maxretries) {
 			args[0] = ni->ni_mlpid;
 			/* the CLOSE sender reads the peer ID as well */
 			args[1] = ni->ni_mllid;
 			args[2] = IEEE80211_REASON_MESH_MAX_RETRIES;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_CLOSE, args);
 			ni->ni_mlrcnt = 0;
 			/* state must change first: the timeout depends on it */
 			mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 			mesh_peer_timeout_setup(ni);
 		} else {
 			/* the OPEN sender only looks at the local ID */
 			args[0] = ni->ni_mlpid;
 			ieee80211_send_action(ni,
 			    IEEE80211_ACTION_CAT_SELF_PROT,
 			    IEEE80211_ACTION_MESHPEERING_OPEN, args);
 			ni->ni_mlrcnt++;
 			mesh_peer_timeout_backoff(ni);
 		}
 		break;
 	case IEEE80211_NODE_MESH_CONFIRMRCV:
 		args[0] = ni->ni_mlpid;
 		/* the CLOSE sender reads the peer ID as well */
 		args[1] = ni->ni_mllid;
 		args[2] = IEEE80211_REASON_MESH_CONFIRM_TIMEOUT;
 		ieee80211_send_action(ni,
 		    IEEE80211_ACTION_CAT_SELF_PROT,
 		    IEEE80211_ACTION_MESHPEERING_CLOSE, args);
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_HOLDING);
 		mesh_peer_timeout_setup(ni);
 		break;
 	case IEEE80211_NODE_MESH_HOLDING:
 		ni->ni_mlhcnt++;
 		/* too many holding rounds: pause automatic peering */
 		if (ni->ni_mlhcnt >= ieee80211_mesh_maxholding)
 			callout_reset(&ni->ni_mlhtimer,
 			    ieee80211_mesh_backofftimeout,
 			    mesh_peer_backoff_cb, ni);
 		mesh_linkchange(ni, IEEE80211_NODE_MESH_IDLE);
 		break;
 	}
 }
 
 /*
  * Compare a received Mesh ID element against the vap's configured mesh ID.
  * Returns 0 on a match; non-zero when ie is NULL, the lengths differ, or
  * the ID bytes differ.
  */
 static int
 mesh_verify_meshid(struct ieee80211vap *vap, const uint8_t *ie)
 {
 	struct ieee80211_mesh_state *mesh = vap->iv_mesh;
 
 	if (ie == NULL)
 		return 1;
 	/* ie[1] is the element length, ie[2..] the ID bytes. */
 	if (ie[1] != mesh->ms_idlen)
 		return 1;
 	return memcmp(mesh->ms_id, &ie[2], mesh->ms_idlen);
 }
 
 /*
  * Check if we are using the same algorithms for this mesh.
  *
  * ie points at a received Mesh Configuration element.  Returns 0 when the
  * peer's path selection and path metric identifiers match the ones
  * configured on this vap, congestion control and authentication are 0,
  * the sync identifier is IEEE80211_MESHCONF_SYNC_NEIGHOFF and the peer
  * advertises IEEE80211_MESHCONF_CAP_AP.  Returns 1 otherwise, including
  * when ie is NULL.
  */
 static int
 mesh_verify_meshconf(struct ieee80211vap *vap, const uint8_t *ie)
 {
 	const struct ieee80211_meshconf_ie *meshconf =
 	    (const struct ieee80211_meshconf_ie *) ie;
 	const struct ieee80211_mesh_state *ms = vap->iv_mesh;
 
 	if (meshconf == NULL)
 		return 1;
 	if (meshconf->conf_pselid != ms->ms_ppath->mpp_ie) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown path selection algorithm: 0x%x\n",
 		    meshconf->conf_pselid);
 		return 1;
 	}
 	if (meshconf->conf_pmetid != ms->ms_pmetric->mpm_ie) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown path metric algorithm: 0x%x\n",
 		    meshconf->conf_pmetid);
 		return 1;
 	}
 	if (meshconf->conf_ccid != 0) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown congestion control algorithm: 0x%x\n",
 		    meshconf->conf_ccid);
 		return 1;
 	}
 	if (meshconf->conf_syncid != IEEE80211_MESHCONF_SYNC_NEIGHOFF) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown sync algorithm: 0x%x\n",
 		    meshconf->conf_syncid);
 		return 1;
 	}
 	if (meshconf->conf_authid != 0) {
 		/* Report the offending auth id, not the path selection id. */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "unknown auth algorithm: 0x%x\n",
 		    meshconf->conf_authid);
 		return 1;
 	}
 	/* Not accepting peers */
 	if (!(meshconf->conf_cap & IEEE80211_MESHCONF_CAP_AP)) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_MESH,
 		    "not accepting peers: 0x%x\n", meshconf->conf_cap);
 		return 1;
 	}
 	return 0;
 }
 
 /*
  * Sanity check a Mesh Peering Management element for the given action
  * subtype.  Returns 0 when the element is acceptable and 1 otherwise
  * (NULL element, length out of range, protocol other than MPM, or a
  * length/field combination not allowed for the subtype).  Subtypes other
  * than OPEN/CONFIRM/CLOSE only get the generic checks.
  */
 static int
 mesh_verify_meshpeer(struct ieee80211vap *vap, uint8_t subtype,
     const uint8_t *ie)
 {
 	const struct ieee80211_meshpeer_ie *mp;
 
 	mp = (const struct ieee80211_meshpeer_ie *) ie;
 	if (mp == NULL)
 		return 1;
 	if (mp->peer_len < IEEE80211_MPM_BASE_SZ ||
 	    mp->peer_len > IEEE80211_MPM_MAX_SZ)
 		return 1;
 	if (mp->peer_proto != IEEE80211_MPPID_MPM) {
 		IEEE80211_DPRINTF(vap,
 		    IEEE80211_MSG_ACTION | IEEE80211_MSG_MESH,
 		    "Only MPM protocol is supported (proto: 0x%02X)",
 		    mp->peer_proto);
 		return 1;
 	}
 
 	if (subtype == IEEE80211_ACTION_MESHPEERING_OPEN) {
 		/* OPEN carries exactly the base fields. */
 		return (mp->peer_len != IEEE80211_MPM_BASE_SZ);
 	}
 	if (subtype == IEEE80211_ACTION_MESHPEERING_CONFIRM) {
 		/* CONFIRM carries the base fields plus two more bytes. */
 		return (mp->peer_len != IEEE80211_MPM_BASE_SZ + 2);
 	}
 	if (subtype == IEEE80211_ACTION_MESHPEERING_CLOSE) {
 		if (mp->peer_len < IEEE80211_MPM_BASE_SZ + 2 ||
 		    (mp->peer_len == (IEEE80211_MPM_BASE_SZ + 2) &&
 		     mp->peer_linkid != 0) ||
 		    mp->peer_rcode == 0)
 			return 1;
 	}
 	return 0;
 }
 
 /*
  * Append a Mesh ID element (id, length, ID bytes) built from the vap's
  * mesh state at frm.  Returns the address just past the element.
  */
 uint8_t *
 ieee80211_add_meshid(uint8_t *frm, struct ieee80211vap *vap)
 {
 	struct ieee80211_mesh_state *mesh = vap->iv_mesh;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a mbss vap"));
 
 	frm[0] = IEEE80211_ELEMID_MESHID;
 	frm[1] = mesh->ms_idlen;
 	frm += 2;
 	memcpy(frm, mesh->ms_id, mesh->ms_idlen);
 	frm += mesh->ms_idlen;
 	return frm;
 }
 
 /*
  * Append a Mesh Configuration element at frm and return the address just
  * past it.  The path selection and link metric identifiers come from the
  * protocols configured on the vap; congestion control and authentication
  * are advertised as disabled and synchronization as neighbor offset.
  */
 uint8_t *
 ieee80211_add_meshconf(uint8_t *frm, struct ieee80211vap *vap)
 {
 	const struct ieee80211_mesh_state *mesh = vap->iv_mesh;
 	uint8_t formation;
 	uint16_t capability;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a MBSS vap"));
 
 	/* Formation info: neighbor count (capped) shifted past the gate bit. */
 	formation = (mesh->ms_neighbors > IEEE80211_MESH_MAX_NEIGHBORS ?
 	    IEEE80211_MESH_MAX_NEIGHBORS : mesh->ms_neighbors) << 1;
 	if (mesh->ms_flags & IEEE80211_MESHFLAGS_GATE)
 		formation |= IEEE80211_MESHCONF_FORM_GATE;
 
 	/* Capability bits mirror the AP and forwarding mesh flags. */
 	capability = 0;
 	if (mesh->ms_flags & IEEE80211_MESHFLAGS_AP)
 		capability |= IEEE80211_MESHCONF_CAP_AP;
 	if (mesh->ms_flags & IEEE80211_MESHFLAGS_FWD)
 		capability |= IEEE80211_MESHCONF_CAP_FWRD;
 
 	frm[0] = IEEE80211_ELEMID_MESHCONF;
 	frm[1] = IEEE80211_MESH_CONF_SZ;
 	frm[2] = mesh->ms_ppath->mpp_ie;	/* path selection */
 	frm[3] = mesh->ms_pmetric->mpm_ie;	/* link metric */
 	frm[4] = IEEE80211_MESHCONF_CC_DISABLED;
 	frm[5] = IEEE80211_MESHCONF_SYNC_NEIGHOFF;
 	frm[6] = IEEE80211_MESHCONF_AUTH_DISABLED;
 	frm[7] = formation;
 	frm[8] = capability;
 	return frm + 9;
 }
 
 /*
  * Append a Mesh Peering Management element for the given action subtype
  * at frm and return the address just past what was written.
  *
  * OPEN carries protocol and local link ID; CONFIRM adds the peer link ID;
  * CLOSE carries the peer link ID only when peerid is non-zero and always
  * ends with the reason code.  For any other subtype only the element ID
  * byte is written.
  */
 uint8_t *
 ieee80211_add_meshpeer(uint8_t *frm, uint8_t subtype, uint16_t localid,
     uint16_t peerid, uint16_t reason)
 {
 	int with_peerid, with_reason;
 
 	KASSERT(localid != 0, ("localid == 0"));
 
 	*frm++ = IEEE80211_ELEMID_MESHPEER;
 
 	/* Emit the length byte and decide which optional fields follow. */
 	switch (subtype) {
 	case IEEE80211_ACTION_MESHPEERING_OPEN:
 		*frm++ = IEEE80211_MPM_BASE_SZ;
 		with_peerid = 0;
 		with_reason = 0;
 		break;
 	case IEEE80211_ACTION_MESHPEERING_CONFIRM:
 		KASSERT(peerid != 0, ("sending peer confirm without peer id"));
 		*frm++ = IEEE80211_MPM_BASE_SZ + 2;
 		with_peerid = 1;
 		with_reason = 0;
 		break;
 	case IEEE80211_ACTION_MESHPEERING_CLOSE:
 		if (peerid != 0)
 			*frm++ = IEEE80211_MPM_MAX_SZ;
 		else
 			*frm++ = IEEE80211_MPM_BASE_SZ + 2;
 		with_peerid = (peerid != 0);
 		with_reason = 1;
 		break;
 	default:
 		return frm;
 	}
 
 	ADDSHORT(frm, IEEE80211_MPPID_MPM);	/* proto */
 	ADDSHORT(frm, localid);			/* local ID */
 	if (with_peerid)
 		ADDSHORT(frm, peerid);		/* peer ID */
 	if (with_reason)
 		ADDSHORT(frm, reason);
 	return frm;
 }
 
 /*
  * Compute an Airtime Link Metric for the link with this node.
  *
  * Based on Draft 3.0 spec (11B.10, p.149).
  */
 /*
  * Max 802.11s overhead.
  */
 #define IEEE80211_MESH_MAXOVERHEAD \
 	(sizeof(struct ieee80211_qosframe_addr4) \
 	 + sizeof(struct ieee80211_meshcntl_ae10) \
 	+ sizeof(struct llc) \
 	+ IEEE80211_ADDR_LEN \
 	+ IEEE80211_WEP_IVLEN \
 	+ IEEE80211_WEP_KIDLEN \
 	+ IEEE80211_WEP_CRCLEN \
 	+ IEEE80211_WEP_MICLEN \
 	+ IEEE80211_CRC_LEN)
 uint32_t
 mesh_airtime_calc(struct ieee80211_node *ni)
 {
 #define M_BITS 8
 #define S_FACTOR (2 * M_BITS)
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = ni->ni_vap->iv_ifp;
 	const static int nbits = 8192 << M_BITS;
 	uint32_t overhead, rate, errrate;
 	uint64_t res;
 
 	/* Time to transmit a frame */
 	rate = ni->ni_txrate;
 	overhead = ieee80211_compute_duration(ic->ic_rt,
 	    ifp->if_mtu + IEEE80211_MESH_MAXOVERHEAD, rate, 0) << M_BITS;
 	/* Error rate in percentage */
 	/* XXX assuming small failures are ok */
 	errrate = (((ifp->if_get_counter(ifp, IFCOUNTER_OERRORS) +
 	    ifp->if_get_counter(ifp, IFCOUNTER_IERRORS)) / 100) << M_BITS)
 	    / 100;
 	res = (overhead + (nbits / rate)) *
 	    ((1 << S_FACTOR) / ((1 << M_BITS) - errrate));
 
 	return (uint32_t)(res >> S_FACTOR);
 #undef M_BITS
 #undef S_FACTOR
 }
 
 /*
  * Append a Mesh Link Metric Report element (id, length 5, flags, metric)
  * at frm and return the address just past it.
  */
 uint8_t *
 ieee80211_add_meshlmetric(uint8_t *frm, uint8_t flags, uint32_t metric)
 {
 	frm[0] = IEEE80211_ELEMID_MESHLINK;
 	frm[1] = 5;
 	frm[2] = flags;
 	frm += 3;
 	ADDWORD(frm, metric);
 	return frm;
 }
 
 /*
  * Append a Mesh Gate Announcement element built from ie at frm and
  * return the address just past it.
  */
 uint8_t *
 ieee80211_add_meshgate(uint8_t *frm, struct ieee80211_meshgann_ie *ie)
 {
 	frm[0] = IEEE80211_ELEMID_MESHGANN;	/* ie */
 	frm[1] = IEEE80211_MESHGANN_BASE_SZ;	/* len */
 	frm[2] = ie->gann_flags;
 	frm[3] = ie->gann_hopcount;
 	frm[4] = ie->gann_ttl;
 	IEEE80211_ADDR_COPY(&frm[5], ie->gann_addr);
 	/* Skip the five header bytes and the six address bytes. */
 	frm += 5 + 6;
 	ADDWORD(frm, ie->gann_seq);
 	ADDSHORT(frm, ie->gann_interval);
 	return frm;
 }
 #undef ADDSHORT
 #undef ADDWORD
 
 /*
  * Set up the mesh-specific parts of a node: initialize the peer link
  * timer and the holding backoff timer, and flag the node as QoS.
  * The vap argument is not used here.
  */
 void
 ieee80211_mesh_node_init(struct ieee80211vap *vap, struct ieee80211_node *ni)
 {
 	callout_init(&ni->ni_mltimer, 1);
 	callout_init(&ni->ni_mlhtimer, 1);
 	ni->ni_flags |= IEEE80211_NODE_QOS;
 }
 
 /*
  * Tear down the mesh-specific parts of a node: drain both mesh timers
  * and, while the vap still has mesh state, tell the path selection
  * protocol that the peer went down.
  */
 void
 ieee80211_mesh_node_cleanup(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_mesh_state *mesh = vap->iv_mesh;
 
 	callout_drain(&ni->ni_mltimer);
 	callout_drain(&ni->ni_mlhtimer);
 
 	/* NB: short-circuit callbacks after mesh_vdetach */
 	if (vap->iv_mesh == NULL)
 		return;
 	mesh->ms_ppath->mpp_peerdown(ni);
 }
 
 void
 ieee80211_parse_meshid(struct ieee80211_node *ni, const uint8_t *ie)
 {
 	ni->ni_meshidlen = ie[1];
 	memcpy(ni->ni_meshid, ie + 2, ie[1]);
 }
 
 /*
  * Setup mesh-specific node state on neighbor discovery.
  */
 void
 ieee80211_mesh_init_neighbor(struct ieee80211_node *ni,
 	const struct ieee80211_frame *wh,
 	const struct ieee80211_scanparams *sp)
 {
 	ieee80211_parse_meshid(ni, sp->meshid);
 }
 
 /*
  * Refresh the Mesh Configuration element of a beacon in place when the
  * IEEE80211_BEACON_MESHCONF update bit is set, then clear that bit.
  */
 void
 ieee80211_mesh_update_beacon(struct ieee80211vap *vap,
 	struct ieee80211_beacon_offsets *bo)
 {
 	KASSERT(vap->iv_opmode == IEEE80211_M_MBSS, ("not a MBSS vap"));
 
 	/* Nothing to do unless a meshconf update was requested. */
 	if (!isset(bo->bo_flags, IEEE80211_BEACON_MESHCONF))
 		return;
 
 	(void)ieee80211_add_meshconf(bo->bo_meshconf, vap);
 	clrbit(bo->bo_flags, IEEE80211_BEACON_MESHCONF);
 }
 
 /*
  * Mesh "get" ioctl handler, registered through IEEE80211_IOCTL_GET below.
  *
  * Returns ENOSYS when the vap is not in MBSS mode or the request type
  * (or route sub-command) is not handled here, EINVAL/ENOMEM for
  * malformed or undersized requests, and otherwise 0 or the copyout()
  * error.  Simple settings are returned in ireq->i_val; variable-length
  * results are copied out to ireq->i_data with ireq->i_len set to the
  * number of bytes produced.
  */
 static int
 mesh_ioctl_get80211(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	uint8_t tmpmeshid[IEEE80211_NWID_LEN];
 	struct ieee80211_mesh_route *rt;
 	struct ieee80211req_mesh_route *imr;
 	size_t len, off;
 	uint8_t *p;
 	int error;
 
 	if (vap->iv_opmode != IEEE80211_M_MBSS)
 		return ENOSYS;
 
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_MESH_ID:
 		/* Copy the ID through a stack buffer before copyout(). */
 		ireq->i_len = ms->ms_idlen;
 		memcpy(tmpmeshid, ms->ms_id, ireq->i_len);
 		error = copyout(tmpmeshid, ireq->i_data, ireq->i_len);
 		break;
 	case IEEE80211_IOC_MESH_AP:
 		ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_AP) != 0;
 		break;
 	case IEEE80211_IOC_MESH_FWRD:
 		ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_FWD) != 0;
 		break;
 	case IEEE80211_IOC_MESH_GATE:
 		ireq->i_val = (ms->ms_flags & IEEE80211_MESHFLAGS_GATE) != 0;
 		break;
 	case IEEE80211_IOC_MESH_TTL:
 		ireq->i_val = ms->ms_ttl;
 		break;
 	case IEEE80211_IOC_MESH_RTCMD:
 		switch (ireq->i_val) {
 		case IEEE80211_MESH_RTCMD_LIST:
 			/* First pass: size the result under the route lock. */
 			len = 0;
 			MESH_RT_LOCK(ms);
 			TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 				len += sizeof(*imr);
 			}
 			MESH_RT_UNLOCK(ms);
 			/*
 			 * Caller's buffer is too small: report the size
 			 * needed in i_len and fail.
 			 */
 			if (len > ireq->i_len || ireq->i_len < sizeof(*imr)) {
 				ireq->i_len = len;
 				return ENOMEM;
 			}
 			ireq->i_len = len;
 			/* XXX M_WAIT? */
 			p = IEEE80211_MALLOC(len, M_TEMP,
 			    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 			if (p == NULL)
 				return ENOMEM;
 			/*
 			 * Second pass: fill the buffer.  The lock was dropped
 			 * in between, so stop once the buffer sized by the
 			 * first pass is full.
 			 */
 			off = 0;
 			MESH_RT_LOCK(ms);
 			TAILQ_FOREACH(rt, &ms->ms_routes, rt_next) {
 				if (off >= len)
 					break;
 				imr = (struct ieee80211req_mesh_route *)
 				    (p + off);
 				IEEE80211_ADDR_COPY(imr->imr_dest,
 				    rt->rt_dest);
 				IEEE80211_ADDR_COPY(imr->imr_nexthop,
 				    rt->rt_nexthop);
 				imr->imr_metric = rt->rt_metric;
 				imr->imr_nhops = rt->rt_nhops;
 				imr->imr_lifetime =
 				    ieee80211_mesh_rt_update(rt, 0);
 				imr->imr_lastmseq = rt->rt_lastmseq;
 				imr->imr_flags = rt->rt_flags; /* last */
 				off += sizeof(*imr);
 			}
 			MESH_RT_UNLOCK(ms);
 			error = copyout(p, (uint8_t *)ireq->i_data,
 			    ireq->i_len);
 			IEEE80211_FREE(p, M_TEMP);
 			break;
 		case IEEE80211_MESH_RTCMD_FLUSH:
 		case IEEE80211_MESH_RTCMD_ADD:
 		case IEEE80211_MESH_RTCMD_DELETE:
 			/* These sub-commands are rejected on the get path. */
 			return EINVAL;
 		default:
 			return ENOSYS;
 		}
 		break;
 	case IEEE80211_IOC_MESH_PR_METRIC:
 		/* NB: the description is copied out without its NUL. */
 		len = strlen(ms->ms_pmetric->mpm_descr);
 		if (ireq->i_len < len)
 			return EINVAL;
 		ireq->i_len = len;
 		error = copyout(ms->ms_pmetric->mpm_descr,
 		    (uint8_t *)ireq->i_data, len);
 		break;
 	case IEEE80211_IOC_MESH_PR_PATH:
 		/* NB: the description is copied out without its NUL. */
 		len = strlen(ms->ms_ppath->mpp_descr);
 		if (ireq->i_len < len)
 			return EINVAL;
 		ireq->i_len = len;
 		error = copyout(ms->ms_ppath->mpp_descr,
 		    (uint8_t *)ireq->i_data, len);
 		break;
 	default:
 		return ENOSYS;
 	}
 
 	return error;
 }
 IEEE80211_IOCTL_GET(mesh, mesh_ioctl_get80211);
 
 /*
  * Mesh "set" ioctl handler, registered through IEEE80211_IOCTL_SET below.
  *
  * Returns ENOSYS when the vap is not in MBSS mode or the request type
  * (or route sub-command) is not handled here, EINVAL for bad arguments,
  * a copyin() error, ENETRESET for the settings that assign it below
  * (mesh ID, AP flag, metric/path protocol), and 0 otherwise.
  */
 static int
 mesh_ioctl_set80211(struct ieee80211vap *vap, struct ieee80211req *ireq)
 {
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	uint8_t tmpmeshid[IEEE80211_NWID_LEN];
 	uint8_t tmpaddr[IEEE80211_ADDR_LEN];
 	char tmpproto[IEEE80211_MESH_PROTO_DSZ];
 	int error;
 
 	if (vap->iv_opmode != IEEE80211_M_MBSS)
 		return ENOSYS;
 
 	error = 0;
 	switch (ireq->i_type) {
 	case IEEE80211_IOC_MESH_ID:
 		if (ireq->i_val != 0 || ireq->i_len > IEEE80211_MESHID_LEN)
 			return EINVAL;
 		/* Stage in a stack buffer so a failed copyin changes nothing. */
 		error = copyin(ireq->i_data, tmpmeshid, ireq->i_len);
 		if (error != 0)
 			break;
 		memset(ms->ms_id, 0, IEEE80211_NWID_LEN);
 		ms->ms_idlen = ireq->i_len;
 		memcpy(ms->ms_id, tmpmeshid, ireq->i_len);
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_MESH_AP:
 		if (ireq->i_val)
 			ms->ms_flags |= IEEE80211_MESHFLAGS_AP;
 		else
 			ms->ms_flags &= ~IEEE80211_MESHFLAGS_AP;
 		error = ENETRESET;
 		break;
 	case IEEE80211_IOC_MESH_FWRD:
 		if (ireq->i_val)
 			ms->ms_flags |= IEEE80211_MESHFLAGS_FWD;
 		else
 			ms->ms_flags &= ~IEEE80211_MESHFLAGS_FWD;
 		/*
 		 * NOTE(review): gate mode setup is invoked when the
 		 * forwarding flag changes, not in the GATE case below --
 		 * confirm this is intended.
 		 */
 		mesh_gatemode_setup(vap);
 		break;
 	case IEEE80211_IOC_MESH_GATE:
 		if (ireq->i_val)
 			ms->ms_flags |= IEEE80211_MESHFLAGS_GATE;
 		else
 			ms->ms_flags &= ~IEEE80211_MESHFLAGS_GATE;
 		break;
 	case IEEE80211_IOC_MESH_TTL:
 		/* NB: i_val is truncated to 8 bits. */
 		ms->ms_ttl = (uint8_t) ireq->i_val;
 		break;
 	case IEEE80211_IOC_MESH_RTCMD:
 		switch (ireq->i_val) {
 		case IEEE80211_MESH_RTCMD_LIST:
 			/* Listing is rejected on the set path. */
 			return EINVAL;
 		case IEEE80211_MESH_RTCMD_FLUSH:
 			ieee80211_mesh_rt_flush(vap);
 			break;
 		case IEEE80211_MESH_RTCMD_ADD:
 			error = copyin(ireq->i_data, tmpaddr,
 			    IEEE80211_ADDR_LEN);
 			if (error != 0)
 				break;
 			/* Refuse our own address and the broadcast address. */
 			if (IEEE80211_ADDR_EQ(vap->iv_myaddr, tmpaddr) ||
 			    IEEE80211_ADDR_EQ(broadcastaddr, tmpaddr))
 				return EINVAL;
 			ieee80211_mesh_discover(vap, tmpaddr, NULL);
 			break;
 		case IEEE80211_MESH_RTCMD_DELETE:
 			error = copyin(ireq->i_data, tmpaddr,
 			    IEEE80211_ADDR_LEN);
 			if (error != 0)
 				break;
 			ieee80211_mesh_rt_del(vap, tmpaddr);
 			break;
 		default:
 			return ENOSYS;
 		}
 		break;
 	case IEEE80211_IOC_MESH_PR_METRIC:
 		/*
 		 * NOTE(review): a full sizeof(tmpproto) bytes are copied in
 		 * and passed on without forcing NUL termination here --
 		 * confirm mesh_select_proto_metric() copes with that.
 		 */
 		error = copyin(ireq->i_data, tmpproto, sizeof(tmpproto));
 		if (error == 0) {
 			error = mesh_select_proto_metric(vap, tmpproto);
 			if (error == 0)
 				error = ENETRESET;
 		}
 		break;
 	case IEEE80211_IOC_MESH_PR_PATH:
 		/*
 		 * NOTE(review): same as above -- tmpproto is not explicitly
 		 * NUL terminated before mesh_select_proto_path() sees it.
 		 */
 		error = copyin(ireq->i_data, tmpproto, sizeof(tmpproto));
 		if (error == 0) {
 			error = mesh_select_proto_path(vap, tmpproto);
 			if (error == 0)
 				error = ENETRESET;
 		}
 		break;
 	default:
 		return ENOSYS;
 	}
 	return error;
 }
 IEEE80211_IOCTL_SET(mesh, mesh_ioctl_set80211);
diff --git a/sys/net80211/ieee80211_output.c b/sys/net80211/ieee80211_output.c
index 07cc8ed1c3ed..bf3e48761684 100644
--- a/sys/net80211/ieee80211_output.c
+++ b/sys/net80211/ieee80211_output.c
@@ -1,4198 +1,4199 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Atsushi Onoe
  * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>   
 #include <sys/endian.h>
 
 #include <sys/socket.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_llc.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_regdomain.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 #include <net80211/ieee80211_tdma.h>
 #endif
 #include <net80211/ieee80211_wds.h>
 #include <net80211/ieee80211_mesh.h>
 #include <net80211/ieee80211_vht.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h> 
 #endif
 
 #ifdef INET
 #include <netinet/if_ether.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #endif
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 /* Copy one struct ether_header worth of bytes from src to dst. */
 #define	ETHER_HEADER_COPY(dst, src) \
 	memcpy(dst, src, sizeof(struct ether_header))
 
 /* Forward declarations for file-local helpers. */
 static int ieee80211_fragment(struct ieee80211vap *, struct mbuf *,
 	u_int hdrsize, u_int ciphdrsize, u_int mtu);
 static	void ieee80211_tx_mgt_cb(struct ieee80211_node *, void *, int);
 
 #ifdef IEEE80211_DEBUG
 /*
  * Debug filter for outbound management frames: returns non-zero when a
  * frame of the given subtype should be printed.  Probe responses are
  * only printed on IBSS vaps; every other subtype is always printed.
  */
 static __inline int
 doprint(struct ieee80211vap *vap, int subtype)
 {
 	if (subtype == IEEE80211_FC0_SUBTYPE_PROBE_RESP)
 		return (vap->iv_opmode == IEEE80211_M_IBSS);
 	return 1;
 }
 #endif
 
 /*
  * Transmit a frame to the given destination on the given VAP.
  *
  * It's up to the caller to figure out the details of who this
  * is going to and resolving the node.
  *
  * This routine takes care of queuing it for power save,
  * A-MPDU state stuff, fast-frames state stuff, encapsulation
  * if required, then passing it up to the driver layer.
  *
  * This routine (for now) consumes the mbuf and frees the node
  * reference; it ideally will return a TX status which reflects
  * whether the mbuf was consumed or not, so the caller can
  * free the mbuf (if appropriate) and the node reference (again,
  * if appropriate.)
  */
 int
 ieee80211_vap_pkt_send_dest(struct ieee80211vap *vap, struct mbuf *m,
     struct ieee80211_node *ni)
 {
 	/*
 	 * Return value summary: ENOBUFS only when ieee80211_encap() fails;
 	 * every other path below (queued for power save, classification
 	 * failure, staged for aggregation, handed to the parent) returns 0.
 	 */
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	int mcast;
 	int do_ampdu = 0;
 #ifdef IEEE80211_SUPPORT_SUPERG
 	int do_amsdu = 0;
 	int do_ampdu_amsdu = 0;
 	int no_ampdu = 1; /* Will be set to 0 if ampdu is active */
 	int do_ff = 0;
 #endif
 
 	if ((ni->ni_flags & IEEE80211_NODE_PWR_MGT) &&
 	    (m->m_flags & M_PWR_SAV) == 0) {
 		/*
 		 * Station in power save mode; pass the frame
 		 * to the 802.11 layer and continue.  We'll get
 		 * the frame back when the time is right.
 		 * XXX lose WDS vap linkage?
 		 */
 		if (ieee80211_pwrsave(ni, m) != 0)
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		ieee80211_free_node(ni);
 
 		/*
 		 * Tell the upper layer that we consumed it.
 		 * NB: 0 is returned even when ieee80211_pwrsave()
 		 * reported a failure; that case is only counted above.
 		 */
 		return (0);
 	}
 	/* calculate priority so drivers can find the tx queue */
 	if (ieee80211_classify(ni, m)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT,
 		    ni->ni_macaddr, NULL,
 		    "%s", "classification failure");
 		vap->iv_stats.is_tx_classify++;
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		m_freem(m);
 		ieee80211_free_node(ni);
 
 		/* XXX better status? */
 		return (0);
 	}
 	/*
 	 * Stash the node pointer.  Note that we do this after
 	 * any call to ieee80211_dwds_mcast because that code
 	 * uses any existing value for rcvif to identify the
 	 * interface it (might have been) received on.
 	 */
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)ni;
 	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1: 0;
 
 	BPF_MTAP(ifp, m);		/* 802.3 tx */
 
 	/*
 	 * Figure out if we can do A-MPDU, A-MSDU or FF.
 	 *
 	 * A-MPDU depends upon vap/node config.
 	 * A-MSDU depends upon vap/node config.
 	 * FF depends upon vap config, IE and whether
 	 *  it's 11abg (and not 11n/11ac/etc.)
 	 *
 	 * Note that these flags indicate whether we can do
 	 * it at all, rather than the situation (eg traffic type.)
 	 */
 	do_ampdu = ((ni->ni_flags & IEEE80211_NODE_AMPDU_TX) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_AMPDU_TX));
 #ifdef IEEE80211_SUPPORT_SUPERG
 	do_amsdu = ((ni->ni_flags & IEEE80211_NODE_AMSDU_TX) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_AMSDU_TX));
 	do_ff =
 	    ((ni->ni_flags & IEEE80211_NODE_HT) == 0) &&
 	    ((ni->ni_flags & IEEE80211_NODE_VHT) == 0) &&
 	    (IEEE80211_ATH_CAP(vap, ni, IEEE80211_NODE_FF));
 #endif
 
 	/*
 	 * Check if A-MPDU tx aggregation is setup or if we
 	 * should try to enable it.  The sta must be associated
 	 * with HT and A-MPDU enabled for use.  When the policy
 	 * routine decides we should enable A-MPDU we issue an
 	 * ADDBA request and wait for a reply.  The frame being
 	 * encapsulated will go out w/o using A-MPDU, or possibly
 	 * it might be collected by the driver and held/retransmit.
 	 * The default ic_ampdu_enable routine handles staggering
 	 * ADDBA requests in case the receiver NAK's us or we are
 	 * otherwise unable to establish a BA stream.
 	 *
 	 * Don't treat group-addressed frames as candidates for aggregation;
 	 * net80211 doesn't support 802.11aa-2012 and so group addressed
 	 * frames will always have sequence numbers allocated from the NON_QOS
 	 * TID.
 	 */
 	if (do_ampdu) {
 		if ((m->m_flags & M_EAPOL) == 0 && (! mcast)) {
 			int tid = WME_AC_TO_TID(M_WME_GETAC(m));
 			struct ieee80211_tx_ampdu *tap = &ni->ni_tx_ampdu[tid];
 
 			ieee80211_txampdu_count_packet(tap);
 			if (IEEE80211_AMPDU_RUNNING(tap)) {
 				/*
 				 * Operational, mark frame for aggregation.
 				 *
 				 * XXX do tx aggregation here
 				 */
 				m->m_flags |= M_AMPDU_MPDU;
 			} else if (!IEEE80211_AMPDU_REQUESTED(tap) &&
 			    ic->ic_ampdu_enable(ni, tap)) {
 				/*
 				 * Not negotiated yet, request service.
 				 */
 				ieee80211_ampdu_request(ni, tap);
 				/* XXX hold frame for reply? */
 			}
 			/*
 			 * Now update the no-ampdu flag.  A-MPDU may have been
 			 * started or administratively disabled above; so now we
 			 * know whether we're running yet or not.
 			 *
 			 * This will let us know whether we should be doing A-MSDU
 			 * at this point.  We only do A-MSDU if we're either not
 			 * doing A-MPDU, or A-MPDU is NACKed, or A-MPDU + A-MSDU
 			 * is available.
 			 *
 			 * Whilst here, update the amsdu-ampdu flag.  The above may
 			 * have also set or cleared the amsdu-in-ampdu txa_flags
 			 * combination so we can correctly do A-MPDU + A-MSDU.
 			 */
 #ifdef IEEE80211_SUPPORT_SUPERG
 			no_ampdu = (! IEEE80211_AMPDU_RUNNING(tap)
 			    || (IEEE80211_AMPDU_NACKED(tap)));
 			do_ampdu_amsdu = IEEE80211_AMPDU_RUNNING_AMSDU(tap);
 #endif
 		}
 	}
 
 #ifdef IEEE80211_SUPPORT_SUPERG
 	/*
 	 * Check for AMSDU/FF; queue for aggregation
 	 *
 	 * Note: we don't bother trying to do fast frames or
 	 * A-MSDU encapsulation for 802.3 drivers.  Now, we
 	 * likely could do it for FF (because it's a magic
 	 * atheros tunnel LLC type) but I don't think we're going
 	 * to really need to.  For A-MSDU we'd have to set the
 	 * A-MSDU QoS bit in the wifi header, so we just plain
 	 * can't do it.
 	 */
 	if (__predict_true((vap->iv_caps & IEEE80211_C_8023ENCAP) == 0)) {
 		if ((! mcast) &&
 		    (do_ampdu_amsdu || (no_ampdu && do_amsdu)) &&
 		    ieee80211_amsdu_tx_ok(ni)) {
 			m = ieee80211_amsdu_check(ni, m);
 			if (m == NULL) {
 				/* NB: any ni ref held on stageq */
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG,
 				    "%s: amsdu_check queued frame\n",
 				    __func__);
 				return (0);
 			}
 		} else if ((! mcast) && do_ff) {
 			m = ieee80211_ff_check(ni, m);
 			if (m == NULL) {
 				/* NB: any ni ref held on stageq */
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG,
 				    "%s: ff_check queued frame\n",
 				    __func__);
 				return (0);
 			}
 		}
 	}
 #endif /* IEEE80211_SUPPORT_SUPERG */
 
 	/*
 	 * Grab the TX lock - serialise the TX process from this
 	 * point (where TX state is being checked/modified)
 	 * through to driver queue.
 	 */
 	IEEE80211_TX_LOCK(ic);
 
 	/*
 	 * XXX make the encap and transmit code a separate function
 	 * so things like the FF (and later A-MSDU) path can just call
 	 * it for flushed frames.
 	 */
 	if (__predict_true((vap->iv_caps & IEEE80211_C_8023ENCAP) == 0)) {
 		/*
 		 * Encapsulate the packet in prep for transmission.
 		 */
 		m = ieee80211_encap(vap, ni, m);
 		if (m == NULL) {
 			/* NB: stat+msg handled in ieee80211_encap */
 			IEEE80211_TX_UNLOCK(ic);
 			ieee80211_free_node(ni);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		}
 	}
 	/*
 	 * NB: the result of ieee80211_parent_xmitpkt() is discarded;
 	 * this function reports success from here on regardless.
 	 */
 	(void) ieee80211_parent_xmitpkt(ic, m);
 
 	/*
 	 * Unlock at this point - no need to hold it across
 	 * ieee80211_free_node() (ie, the comlock)
 	 */
 	IEEE80211_TX_UNLOCK(ic);
 	ic->ic_lastdata = ticks;
 
 	return (0);
 }
 
 /*
  * Send the given mbuf through the given vap.
  *
  * This consumes the mbuf regardless of whether the transmit
  * was successful or not.
  *
  * This does none of the initial checks that ieee80211_start()
  * does (eg CAC timeout, interface wakeup) - the caller must
  * do this first.
  */
 static int
 ieee80211_start_pkt(struct ieee80211vap *vap, struct mbuf *m)
 {
 	/*
 	 * Return value summary: ENOBUFS for every drop or deferral path
 	 * below and when ieee80211_vap_pkt_send_dest() fails; 0 when the
 	 * frame was queued for power save or handed on successfully.
 	 */
 /* True for a WDS vap that is not flagged as legacy WDS. */
 #define	IS_DWDS(vap) \
 	(vap->iv_opmode == IEEE80211_M_WDS && \
 	 (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0)
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_node *ni;
 	struct ether_header *eh;
 
 	/*
 	 * Cancel any background scan.
 	 */
 	if (ic->ic_flags & IEEE80211_F_SCAN)
 		ieee80211_cancel_anyscan(vap);
 	/*
 	 * Find the node for the destination so we can do
 	 * things like power save and fast frames aggregation.
 	 *
 	 * NB: past this point various code assumes the first
 	 *     mbuf has the 802.3 header present (and contiguous).
 	 */
 	ni = NULL;
 	if (m->m_len < sizeof(struct ether_header) &&
 	   (m = m_pullup(m, sizeof(struct ether_header))) == NULL) {
 		/* NB: m is NULL here, so there is nothing to free. */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 		    "discard frame, %s\n", "m_pullup failed");
 		vap->iv_stats.is_tx_nobuf++;	/* XXX */
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		return (ENOBUFS);
 	}
 	eh = mtod(m, struct ether_header *);
 	if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 		if (IS_DWDS(vap)) {
 			/*
 			 * Only unicast frames from the above go out
 			 * DWDS vaps; multicast frames are handled by
 			 * dispatching the frame as it comes through
 			 * the AP vap (see below).
 			 */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_WDS,
 			    eh->ether_dhost, "mcast", "%s", "on DWDS");
 			vap->iv_stats.is_dwds_mcast++;
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP) {
 			/*
 			 * Spam DWDS vap's w/ multicast traffic.
 			 */
 			/* XXX only if dwds in use? */
 			ieee80211_dwds_mcast(vap, m);
 		}
 	}
 	/*
 	 * Node lookup: with mesh support compiled in, MBSS vaps take the
 	 * else branch further down; all other vaps (and all vaps when mesh
 	 * support is compiled out) use ieee80211_find_txnode().
 	 */
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode != IEEE80211_M_MBSS) {
 #endif
 		ni = ieee80211_find_txnode(vap, eh->ether_dhost);
 		if (ni == NULL) {
 			/* NB: ieee80211_find_txnode does stat+msg */
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 		if (ni->ni_associd == 0 &&
 		    (ni->ni_flags & IEEE80211_NODE_ASSOCID)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_OUTPUT,
 			    eh->ether_dhost, NULL,
 			    "sta not associated (type 0x%04x)",
 			    htons(eh->ether_type));
 			vap->iv_stats.is_tx_notassoc++;
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(m);
 			ieee80211_free_node(ni);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 #ifdef IEEE80211_SUPPORT_MESH
 	} else {
 		/* Source is not us: only forward when proxying is enabled. */
 		if (!IEEE80211_ADDR_EQ(eh->ether_shost, vap->iv_myaddr)) {
 			/*
 			 * Proxy station only if configured.
 			 */
 			if (!ieee80211_mesh_isproxyena(vap)) {
 				IEEE80211_DISCARD_MAC(vap,
 				    IEEE80211_MSG_OUTPUT |
 				    IEEE80211_MSG_MESH,
 				    eh->ether_dhost, NULL,
 				    "%s", "proxy not enabled");
 				vap->iv_stats.is_mesh_notproxy++;
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 				m_freem(m);
 				/* XXX better status? */
 				return (ENOBUFS);
 			}
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 			    "forward frame from DS SA(%6D), DA(%6D)\n",
 			    eh->ether_shost, ":",
 			    eh->ether_dhost, ":");
 			ieee80211_mesh_proxy_check(vap, eh->ether_shost);
 		}
 		ni = ieee80211_mesh_discover(vap, eh->ether_dhost, m);
 		if (ni == NULL) {
 			/*
 			 * NB: ieee80211_mesh_discover holds/disposes
 			 * frame (e.g. queueing on path discovery).
 			 */
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			/* XXX better status? */
 			return (ENOBUFS);
 		}
 	}
 #endif
 
 	/*
 	 * We've resolved the destination node, so attempt to transmit.
 	 */
 
 	if (vap->iv_state == IEEE80211_S_SLEEP) {
 		/*
 		 * In power save; queue frame and then  wakeup device
 		 * for transmit.
 		 */
 		ic->ic_lastdata = ticks;
 		if (ieee80211_pwrsave(ni, m) != 0)
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		ieee80211_free_node(ni);
 		ieee80211_new_state(vap, IEEE80211_S_RUN, 0);
 		return (0);
 	}
 
 	/* Any failure from the send path is reported as ENOBUFS. */
 	if (ieee80211_vap_pkt_send_dest(vap, m, ni) != 0)
 		return (ENOBUFS);
 	return (0);
 #undef	IS_DWDS
 }
 
 /*
  * Transmit entry point for vaps.  All packets from the stack come
  * through here; after the common state check and mbuf flag cleanup
  * they are dispatched via ieee80211_start_pkt().
  *
  * if_transmit() requires that the mbuf be consumed by this call
  * regardless of the return condition.
  */
 int
 ieee80211_vap_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 #define	VAP_CANNOT_TX(_vap) \
 	((_vap)->iv_state != IEEE80211_S_RUN && \
 	 (_vap)->iv_state != IEEE80211_S_SLEEP)
 	struct ieee80211vap *vap = ifp->if_softc;
 	struct ieee80211com *ic = vap->iv_ic;
 	int blocked;
 
 	/*
 	 * No data frames go out unless we're running.
 	 * Note in particular this covers CAC and CSA
 	 * states (though maybe we should check muting
 	 * for CSA).
 	 */
 	if (VAP_CANNOT_TX(vap)) {
 		IEEE80211_LOCK(ic);
 		/* re-check under the com lock to avoid races */
 		blocked = VAP_CANNOT_TX(vap);
 		if (blocked) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 			    "%s: ignore queue, in %s state\n",
 			    __func__, ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_tx_badstate++;
 		}
 		IEEE80211_UNLOCK(ic);
 		if (blocked) {
 			/* Drop the frame and report the interface as down. */
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			m_freem(m);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENETDOWN);
 		}
 	}
 
 	/*
 	 * Sanitize mbuf flags for net80211 use.  We cannot
 	 * clear M_PWR_SAV or M_MORE_DATA because these may
 	 * be set for frames that are re-submitted from the
 	 * power save queue.
 	 *
 	 * NB: This must be done before ieee80211_classify as
 	 *     it marks EAPOL in frames with M_EAPOL.
 	 */
 	m->m_flags &= ~(M_80211_TX - M_PWR_SAV - M_MORE_DATA);
 
 	/* Hand off to the packet transmission path; it consumes the mbuf. */
 	return (ieee80211_start_pkt(vap, m));
 #undef	VAP_CANNOT_TX
 }
 
 /*
  * Queue flush routine for a vap.  The body is empty, so calling it
  * has no effect; ifp is not used.
  * NOTE(review): presumably installed as the vap ifnet's if_qflush
  * method -- confirm at the attach site.
  */
 void
 ieee80211_vap_qflush(struct ifnet *ifp)
 {
 
 	/* Empty for now */
 }
 
 /*
  * 802.11 raw output routine.
  *
  * XXX TODO: this (and other send routines) should correctly
  * XXX keep the pwr mgmt bit set if it decides to call into the
  * XXX driver to send a frame whilst the state is SLEEP.
  *
  * Otherwise the peer may decide that we're awake and flood us
  * with traffic we are still too asleep to receive!
  */
 int
 ieee80211_raw_output(struct ieee80211vap *vap, struct ieee80211_node *ni,
     struct mbuf *m, const struct ieee80211_bpf_params *params)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int error;
 
 	/*
 	 * Set node - the caller has taken a reference, so ensure
 	 * that the mbuf has the same node value that
 	 * it would if it were going via the normal path.
 	 */
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (void *)ni;
 
 	/*
 	 * Attempt to add bpf transmit parameters.
 	 *
 	 * For now it's ok to fail; the raw_xmit api still takes
 	 * them as an option.
 	 *
 	 * Later on when ic_raw_xmit() has params removed,
 	 * they'll have to be added - so fail the transmit if
 	 * they can't be.
 	 */
 	if (params)
 		(void) ieee80211_add_xmit_params(m, params);
 
 	error = ic->ic_raw_xmit(ni, m, params);
 	if (error) {
 		if_inc_counter(vap->iv_ifp, IFCOUNTER_OERRORS, 1);
 		ieee80211_free_node(ni);
 	}
 	return (error);
 }
 
 /*
  * Sanity check the 802.11 header of a raw frame before it is sent.
  *
  * Returns 0 when the frame is acceptable and EINVAL otherwise.  As a
  * side effect the PROTECTED bit is set in the header when encryption
  * is requested through params (IEEE80211_BPF_CRYPTO) or the bit is
  * already present; params may be NULL.
  */
 static int
 ieee80211_validate_frame(struct mbuf *m,
     const struct ieee80211_bpf_params *params)
 {
 	struct ieee80211_frame *wh;
 	int type, subtype, protect;
 
 	/* The packet must at least hold an ACK-sized header. */
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_ack))
 		return (EINVAL);
 
 	wh = mtod(m, struct ieee80211_frame *);
 
 	/* Only protocol version 0 is accepted. */
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0)
 		return (EINVAL);
 
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 
 	if (type != IEEE80211_FC0_TYPE_DATA) {
 		const int dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 		const int morefrag = wh->i_fc[1] & IEEE80211_FC1_MORE_FRAG;
 
 		/*
 		 * Non-data frames must not carry DS bits, and only
 		 * management frames may have More Fragments set.
 		 */
 		if (dir != IEEE80211_FC1_DIR_NODS ||
 		    (type != IEEE80211_FC0_TYPE_MGT && morefrag != 0))
 			return (EINVAL);
 
 		/* XXX skip other field checks? */
 	}
 
 	protect = (params && (params->ibp_flags & IEEE80211_BPF_CRYPTO) != 0) ||
 	    (IEEE80211_IS_PROTECTED(wh));
 	if (protect) {
 		/*
 		 * See IEEE Std 802.11-2012,
 		 * 8.2.4.1.9 'Protected Frame field'
 		 */
 		/* XXX no support for robust management frames yet. */
 		if (type != IEEE80211_FC0_TYPE_DATA &&
 		    (type != IEEE80211_FC0_TYPE_MGT ||
 		     subtype != IEEE80211_FC0_SUBTYPE_AUTH))
 			return (EINVAL);
 
 		wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 	}
 
 	/* Finally the whole header for this frame type must be present. */
 	if (m->m_pkthdr.len < ieee80211_anyhdrsize(wh))
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Check a single transmit rate code against what the device reports.
  *
  * A legacy rate must be valid in the current rate table (ic_rt).  An
  * HT rate requires IEEE80211_HTC_HT and its MCS index is bounded as
  * follows:
  *   0..31  at most ic_txstream * 8 - 1
  *   32     requires IEEE80211_HTC_TXMCS32
  *   33+    requires IEEE80211_HTC_TXUNEQUAL and at least two tx
  *          streams; the upper bound is 38, 52 or 76 for 2, 3 or
  *          4 (and more) streams
  *
  * Returns 0 if the rate may be used, EINVAL otherwise.
  */
 static int
 ieee80211_validate_rate(struct ieee80211_node *ni, uint8_t rate)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	uint8_t mcs;
 	int mcs_limit;
 
 	if (!IEEE80211_IS_HT_RATE(rate)) {
 		/* Legacy rate: look it up in the rate table. */
 		if (ieee80211_isratevalid(ic->ic_rt, rate))
 			return (0);
 		return (EINVAL);
 	}
 
 	if ((ic->ic_htcaps & IEEE80211_HTC_HT) == 0)
 		return (EINVAL);
 
 	mcs = IEEE80211_RV(rate);
 
 	if (mcs <= 31)
 		return (mcs > ic->ic_txstream * 8 - 1 ? EINVAL : 0);
 
 	if (mcs == 32) {
 		return ((ic->ic_htcaps & IEEE80211_HTC_TXMCS32) == 0 ?
 		    EINVAL : 0);
 	}
 
 	/* Everything above MCS 32 needs the TXUNEQUAL capability. */
 	if ((ic->ic_htcaps & IEEE80211_HTC_TXUNEQUAL) == 0)
 		return (EINVAL);
 
 	switch (ic->ic_txstream) {
 	case 0:
 	case 1:
 		return (EINVAL);
 	case 2:
 		mcs_limit = 38;
 		break;
 	case 3:
 		mcs_limit = 52;
 		break;
 	case 4:
 	default:
 		mcs_limit = 76;
 		break;
 	}
 
 	return (mcs > mcs_limit ? EINVAL : 0);
 }
 
 static int
 ieee80211_sanitize_rates(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_bpf_params *params)
 {
 	int error;
 
 	if (!params)
 		return (0);	/* nothing to do */
 
 	/* NB: most drivers assume that ibp_rate0 is set (!= 0). */
 	if (params->ibp_rate0 != 0) {
 		error = ieee80211_validate_rate(ni, params->ibp_rate0);
 		if (error != 0)
 			return (error);
 	} else {
 		/* XXX pre-setup some default (e.g., mgmt / mcast) rate */
 		/* XXX __DECONST? */
 		(void) m;
 	}
 
 	if (params->ibp_rate1 != 0 &&
 	    (error = ieee80211_validate_rate(ni, params->ibp_rate1)) != 0)
 		return (error);
 
 	if (params->ibp_rate2 != 0 &&
 	    (error = ieee80211_validate_rate(ni, params->ibp_rate2)) != 0)
 		return (error);
 
 	if (params->ibp_rate3 != 0 &&
 	    (error = ieee80211_validate_rate(ni, params->ibp_rate3)) != 0)
 		return (error);
 
 	return (0);
 }
 
 /*
  * 802.11 output routine. This is (currently) used only to
  * connect bpf write calls to the 802.11 layer for injecting
  * raw 802.11 frames.
  *
  * Frames whose destination sockaddr is not AF_IEEE80211 are passed
  * straight to the vap's iv_output method.  For raw frames the 802.11
  * header is validated, a transmit node is looked up from the header
  * addresses, the requested rates are checked and the frame is sent
  * through ieee80211_raw_output() with the TX lock held.
  *
  * Returns the result of iv_output/ieee80211_raw_output, or an errno
  * value when the frame is rejected here.  Every rejection goes through
  * the "bad" label, which frees the mbuf, releases any node reference
  * obtained in this function and bumps IFCOUNTER_OERRORS.
  */
 int
 ieee80211_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 /* Record the error code and jump to the common cleanup path. */
 #define senderr(e) do { error = (e); goto bad;} while (0)
 	const struct ieee80211_bpf_params *params = NULL;
 	struct ieee80211_node *ni = NULL;
 	struct ieee80211vap *vap;
 	struct ieee80211_frame *wh;
 	struct ieee80211com *ic = NULL;
 	int error;
 	int ret;
 
 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE) {
 		/*
 		 * Short-circuit requests if the vap is marked OACTIVE
 		 * as this can happen because a packet came down through
 		 * ieee80211_start before the vap entered RUN state in
 		 * which case it's ok to just drop the frame.  This
 		 * should not be necessary but callers of if_output don't
 		 * check OACTIVE.
 		 */
 		senderr(ENETDOWN);
 	}
 	vap = ifp->if_softc;
 	ic = vap->iv_ic;
 	/*
 	 * Hand to the 802.3 code if not tagged as
 	 * a raw 802.11 frame.
 	 */
 	if (dst->sa_family != AF_IEEE80211)
 		return vap->iv_output(ifp, m, dst, ro);
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m);
 	if (error)
 		senderr(error);
 #endif
 	/* No raw frames on a monitor interface or one that is not up. */
 	if (ifp->if_flags & IFF_MONITOR)
 		senderr(ENETDOWN);
 	if (!IFNET_IS_UP_RUNNING(ifp))
 		senderr(ENETDOWN);
 	/* Raw frames are refused in CAC and SCAN state. */
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_DPRINTF(vap,
 		    IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH,
 		    "block %s frame in CAC state\n", "raw data");
 		vap->iv_stats.is_tx_badstate++;
 		senderr(EIO);		/* XXX */
 	} else if (vap->iv_state == IEEE80211_S_SCAN)
 		senderr(EIO);
 	/* XXX bypass bridge, pfil, carp, etc. */
 
 	/*
 	 * NB: DLT_IEEE802_11_RADIO identifies the parameters are
 	 * present by setting the sa_len field of the sockaddr (yes,
 	 * this is a hack).
 	 * NB: we assume sa_data is suitably aligned to cast.
 	 */
 	if (dst->sa_len != 0)
 		params = (const struct ieee80211_bpf_params *)dst->sa_data;
 
 	/* NB: validation may also set the PROTECTED bit in the header. */
 	error = ieee80211_validate_frame(m, params);
 	if (error != 0)
 		senderr(error);
 
 	wh = mtod(m, struct ieee80211_frame *);
 
 	/*
 	 * locate destination node: the lookup key is addr1 for
 	 * NODS/FROMDS frames and addr3 for TODS/DSTODS frames.
 	 */
 	switch (wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) {
 	case IEEE80211_FC1_DIR_NODS:
 	case IEEE80211_FC1_DIR_FROMDS:
 		ni = ieee80211_find_txnode(vap, wh->i_addr1);
 		break;
 	case IEEE80211_FC1_DIR_TODS:
 	case IEEE80211_FC1_DIR_DSTODS:
 		ni = ieee80211_find_txnode(vap, wh->i_addr3);
 		break;
 	default:
 		/* NOTE(review): presumably unreachable -- confirm. */
 		senderr(EDOOFUS);
 	}
 	if (ni == NULL) {
 		/*
 		 * Permit packets w/ bpf params through regardless
 		 * (see below about sa_len).
 		 */
 		if (dst->sa_len == 0)
 			senderr(EHOSTUNREACH);
 		/* No matching node: send using a reference on the bss node. */
 		ni = ieee80211_ref_node(vap->iv_bss);
 	}
 
 	/*
 	 * Sanitize mbuf for net80211 flags leaked from above.
 	 *
 	 * NB: This must be done before ieee80211_classify as
 	 *     it marks EAPOL in frames with M_EAPOL.
 	 */
 	m->m_flags &= ~M_80211_TX;
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	if (IEEE80211_IS_DATA(wh)) {
 		/* calculate priority so drivers can find the tx queue */
 		if (ieee80211_classify(ni, m))
 			senderr(EIO);		/* XXX */
 
 		/* NB: ieee80211_encap does not include 802.11 header */
 		IEEE80211_NODE_STAT_ADD(ni, tx_bytes,
 		    m->m_pkthdr.len - ieee80211_hdrsize(wh));
 	} else
 		M_WME_SETAC(m, WME_AC_BE);
 
 	error = ieee80211_sanitize_rates(ni, m, params);
 	if (error != 0)
 		senderr(error);
 
 	/* NB: tx_data is bumped here for every frame type, not only data. */
 	IEEE80211_NODE_STAT(ni, tx_data);
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 		IEEE80211_NODE_STAT(ni, tx_mcast);
 		m->m_flags |= M_MCAST;
 	} else
 		IEEE80211_NODE_STAT(ni, tx_ucast);
 
 	/*
 	 * From here on the node reference belongs to the raw output
 	 * path: ieee80211_raw_output() releases it itself when the
 	 * driver returns an error, so no cleanup is done on this path.
 	 */
 	IEEE80211_TX_LOCK(ic);
 	ret = ieee80211_raw_output(vap, ni, m, params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 bad:
 	/* Common failure exit: the frame is consumed and counted as an error. */
 	if (m != NULL)
 		m_freem(m);
 	if (ni != NULL)
 		ieee80211_free_node(ni);
 	if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	return error;
 #undef senderr
 }
 
 /*
  * Set the direction field and address fields of an outgoing
  * frame.  Note this should be called early on in constructing
  * a frame as it sets i_fc[1]; other bits can then be or'd in.
  *
  * ni	  destination node; supplies the vap, the per-TID transmit
  *	  sequence counters and the A-MPDU state
  * m	  mbuf whose data starts with the 802.11 header to fill in
  * type	  frame type and subtype bits; stored in i_fc[0] together
  *	  with protocol version 0
  * tid	  TID selecting the sequence number space, or
  *	  IEEE80211_NONQOS_TID
  * sa, da, bssid
  *	  source, destination and BSSID addresses; which header
  *	  address field each one lands in depends on the frame type
  *	  and the vap operating mode
  *
  * The duration field is zeroed, a sequence number is assigned (or
  * left zero, see below) and M_MCAST is set on the mbuf when addr1 is
  * a multicast address.  The TX lock must be held (asserted).
  */
 void
 ieee80211_send_setup(
 	struct ieee80211_node *ni,
 	struct mbuf *m,
 	int type, int tid,
 	const uint8_t sa[IEEE80211_ADDR_LEN],
 	const uint8_t da[IEEE80211_ADDR_LEN],
 	const uint8_t bssid[IEEE80211_ADDR_LEN])
 {
 /* View the header as a 4-address frame to reach i_addr4. */
 #define	WH4(wh)	((struct ieee80211_frame_addr4 *)wh)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_tx_ampdu *tap;
 	struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
 	ieee80211_seq seqno;
 
 	IEEE80211_TX_LOCK_ASSERT(ni->ni_ic);
 
 	wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | type;
 	if ((type & IEEE80211_FC0_TYPE_MASK) == IEEE80211_FC0_TYPE_DATA) {
 		/* Data frame: direction and address layout follow the opmode. */
 		switch (vap->iv_opmode) {
 		case IEEE80211_M_STA:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_TODS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, bssid);
 			IEEE80211_ADDR_COPY(wh->i_addr2, sa);
 			IEEE80211_ADDR_COPY(wh->i_addr3, da);
 			break;
 		case IEEE80211_M_IBSS:
 		case IEEE80211_M_AHDEMO:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, da);
 			IEEE80211_ADDR_COPY(wh->i_addr2, sa);
 			IEEE80211_ADDR_COPY(wh->i_addr3, bssid);
 			break;
 		case IEEE80211_M_HOSTAP:
 			wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, da);
 			IEEE80211_ADDR_COPY(wh->i_addr2, bssid);
 			IEEE80211_ADDR_COPY(wh->i_addr3, sa);
 			break;
 		case IEEE80211_M_WDS:
 			/* 4-address frame: da is used for both addr1 and addr3. */
 			wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, da);
 			IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(wh->i_addr3, da);
 			IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, sa);
 			break;
 		case IEEE80211_M_MBSS:
 			/*
 			 * NB: without IEEE80211_SUPPORT_MESH nothing is
 			 * written for this mode.
 			 */
 #ifdef IEEE80211_SUPPORT_MESH
 			if (IEEE80211_IS_MULTICAST(da)) {
 				/* NB: i_addr3 is not set on this path. */
 				wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 				/* XXX next hop */
 				IEEE80211_ADDR_COPY(wh->i_addr1, da);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 			} else {
 				wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS;
 				IEEE80211_ADDR_COPY(wh->i_addr1, da);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr3, da);
 				IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, sa);
 			}
 #endif
 			break;
 		case IEEE80211_M_MONITOR:	/* NB: to quiet compiler */
 			break;
 		}
 	} else {
 		/* Non-data frame: always NODS with da in addr1 and sa in addr2. */
 		wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, da);
 		IEEE80211_ADDR_COPY(wh->i_addr2, sa);
 #ifdef IEEE80211_SUPPORT_MESH
 		/* Mesh vaps put sa, not the bssid, in addr3. */
 		if (vap->iv_opmode == IEEE80211_M_MBSS)
 			IEEE80211_ADDR_COPY(wh->i_addr3, sa);
 		else
 #endif
 			IEEE80211_ADDR_COPY(wh->i_addr3, bssid);
 	}
 	/* Clear the duration field. */
 	*(uint16_t *)&wh->i_dur[0] = 0;
 
 	/*
 	 * XXX TODO: this is what the TX lock is for.
 	 * Here we're incrementing sequence numbers, and they
 	 * need to be in lock-step with what the driver is doing
 	 * both in TX ordering and crypto encap (IV increment.)
 	 *
 	 * If the driver does seqno itself, then we can skip
 	 * assigning sequence numbers here, and we can avoid
 	 * requiring the TX lock.
 	 */
 	/* NB: tap is only examined when tid is not IEEE80211_NONQOS_TID. */
 	tap = &ni->ni_tx_ampdu[tid];
 	if (tid != IEEE80211_NONQOS_TID && IEEE80211_AMPDU_RUNNING(tap)) {
 		/* A-MPDU session running: no sequence number is assigned here. */
 		m->m_flags |= M_AMPDU_MPDU;
 
 		/* NB: zero out i_seq field (for s/w encryption etc) */
 		*(uint16_t *)&wh->i_seq[0] = 0;
 	} else {
 		/* Frames for which IEEE80211_HAS_SEQ is false get seqno 0. */
 		if (IEEE80211_HAS_SEQ(type & IEEE80211_FC0_TYPE_MASK,
 				      type & IEEE80211_FC0_SUBTYPE_MASK))
 			/*
 			 * 802.11-2012 9.3.2.10 - QoS multicast frames
 			 * come out of a different seqno space.
 			 */
 			if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 				seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 			} else {
 				seqno = ni->ni_txseqs[tid]++;
 			}
 		else
 			seqno = 0;
 
 		/* Store in the header (little-endian) and in the mbuf. */
 		*(uint16_t *)&wh->i_seq[0] =
 		    htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 		M_SEQNO_SET(m, seqno);
 	}
 
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1))
 		m->m_flags |= M_MCAST;
 #undef WH4
 }
 
 /*
  * Send a management frame to the specified node.  The node pointer
  * must have a reference as the pointer will be passed to the driver
  * and potentially held for a long time.  If the frame is successfully
  * dispatched to the driver, then it is responsible for freeing the
  * reference (and potentially free'ing up any associated storage);
  * otherwise deal with reclaiming any reference (on error).
  *
  * ni	  destination node (referenced by the caller, must not be NULL)
  * m	  frame body; the 802.11 header is prepended here
  * type	  management subtype, or'd with IEEE80211_FC0_TYPE_MGT
  * params transmit parameters; ibp_flags and ibp_pri are read
  *	  unconditionally so this must not be NULL
  *
  * Returns EIO when the vap is in CAC state, ENOMEM when the header
  * cannot be prepended, otherwise the result of ieee80211_raw_output().
  * On the EIO path both the mbuf and the node reference are released
  * here; on the ENOMEM path the node reference is released.
  */
 int
 ieee80211_mgmt_output(struct ieee80211_node *ni, struct mbuf *m, int type,
 	struct ieee80211_bpf_params *params)
 {
 	struct ieee80211vap *vap;
 	struct ieee80211com *ic;
 	struct ieee80211_frame *wh;
 	int ret;
 
 	/*
 	 * Check the node before it is first dereferenced; asserting
 	 * after loading ni_vap/ni_ic could never catch a NULL node.
 	 */
 	KASSERT(ni != NULL, ("null node"));
 	vap = ni->ni_vap;
 	ic = ni->ni_ic;
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH,
 		    ni, "block %s frame in CAC state",
 			ieee80211_mgt_subtype_name(type));
 		vap->iv_stats.is_tx_badstate++;
 		ieee80211_free_node(ni);
 		m_freem(m);
 		return EIO;		/* XXX */
 	}
 
 	/* Make room for the 802.11 header; m is NULL if that fails. */
 	M_PREPEND(m, sizeof(struct ieee80211_frame), IEEE80211_M_NOWAIT);
 	if (m == NULL) {
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	/* Header setup assigns a sequence number, so take the TX lock. */
 	IEEE80211_TX_LOCK(ic);
 
 	wh = mtod(m, struct ieee80211_frame *);
 	ieee80211_send_setup(ni, m,
 	     IEEE80211_FC0_TYPE_MGT | type, IEEE80211_NONQOS_TID,
 	     vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid);
 	if (params->ibp_flags & IEEE80211_BPF_CRYPTO) {
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_AUTH, wh->i_addr1,
 		    "encrypting frame (%s)", __func__);
 		wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 	}
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	KASSERT(type != IEEE80211_FC0_SUBTYPE_PROBE_RESP, ("probe response?"));
 	M_WME_SETAC(m, params->ibp_pri);
 
 #ifdef IEEE80211_DEBUG
 	/* avoid printing too many frames */
 	if ((ieee80211_msg_debug(vap) && doprint(vap, type)) ||
 	    ieee80211_msg_dumppkts(vap)) {
 		ieee80211_note(vap, "[%s] send %s on channel %u\n",
 		    ether_sprintf(wh->i_addr1),
 		    ieee80211_mgt_subtype_name(type),
 		    ieee80211_chan2ieee(ic, ic->ic_curchan));
 	}
 #endif
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	/* NB: raw_output drops the node reference if the driver fails. */
 	ret = ieee80211_raw_output(vap, ni, m, params);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 /*
  * Transmit callback registered by ieee80211_send_nulldata(): issues a
  * wakeup() on the node's vap.  arg and status are not used.
  */
 static void
 ieee80211_nulldata_transmitted(struct ieee80211_node *ni, void *arg,
     int status)
 {
 
 	wakeup(ni->ni_vap);
 }
 
 /*
  * Send a null data frame to the specified node.  If the station
  * is setup for QoS then a QoS Null Data frame is constructed.
  * If this is a WDS station then a 4-address frame is constructed.
  *
  * NB: the caller is assumed to have setup a node reference
  *     for use; this is necessary to deal with a race condition
  *     when probing for inactive stations.  Like ieee80211_mgmt_output
  *     we must cleanup any node reference on error;  however we
  *     can safely just unref it as we know it will never be the
  *     last reference to the node.
  *
  * Returns EIO when the vap is in CAC state, ENOMEM when no mbuf can
  * be obtained, otherwise the result of ieee80211_raw_output().
  */
 int
 ieee80211_send_nulldata(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct mbuf *m;
 	struct ieee80211_frame *wh;
 	int hdrlen;
 	uint8_t *frm;
 	int ret;
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT | IEEE80211_MSG_DOTH,
 		    ni, "block %s frame in CAC state", "null data");
 		ieee80211_unref_node(&ni);
 		vap->iv_stats.is_tx_badstate++;
 		return EIO;		/* XXX */
 	}
 
 	/*
 	 * Work out the header length.
 	 * NOTE(review): the length is QoS-sized for QOS or HT nodes,
 	 * but the QoS header below is only built when
 	 * IEEE80211_NODE_QOS is set -- confirm this is intended for
 	 * HT nodes without the QOS flag.
 	 */
 	if (ni->ni_flags & (IEEE80211_NODE_QOS|IEEE80211_NODE_HT))
 		hdrlen = sizeof(struct ieee80211_qosframe);
 	else
 		hdrlen = sizeof(struct ieee80211_frame);
 	/* NB: only WDS vap's get 4-address frames */
 	if (vap->iv_opmode == IEEE80211_M_WDS)
 		hdrlen += IEEE80211_ADDR_LEN;
 	/* Pad the header to a 32-bit boundary when DATAPAD is set. */
 	if (ic->ic_flags & IEEE80211_F_DATAPAD)
 		hdrlen = roundup(hdrlen, sizeof(uint32_t));
 
 	/* Zero-length payload; only header space is requested. */
 	m = ieee80211_getmgtframe(&frm, ic->ic_headroom + hdrlen, 0);
 	if (m == NULL) {
 		/* XXX debug msg */
 		ieee80211_unref_node(&ni);
 		vap->iv_stats.is_tx_nobuf++;
 		return ENOMEM;
 	}
 	KASSERT(M_LEADINGSPACE(m) >= hdrlen,
 	    ("leading space %zd", M_LEADINGSPACE(m)));
 	M_PREPEND(m, hdrlen, IEEE80211_M_NOWAIT);
 	if (m == NULL) {
 		/* NB: cannot happen */
 		/*
 		 * NOTE(review): this path uses ieee80211_free_node()
 		 * while the other error paths use
 		 * ieee80211_unref_node() -- confirm.
 		 */
 		ieee80211_free_node(ni);
 		return ENOMEM;
 	}
 
 	/* Header setup assigns a sequence number, so take the TX lock. */
 	IEEE80211_TX_LOCK(ic);
 
 	wh = mtod(m, struct ieee80211_frame *);		/* NB: a little lie */
 	if (ni->ni_flags & IEEE80211_NODE_QOS) {
 		const int tid = WME_AC_TO_TID(WME_AC_BE);
 		uint8_t *qos;
 
 		ieee80211_send_setup(ni, m,
 		    IEEE80211_FC0_TYPE_DATA | IEEE80211_FC0_SUBTYPE_QOS_NULL,
 		    tid, vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid);
 
 		/* The QoS control field sits after addr4 in a 4-address frame. */
 		if (vap->iv_opmode == IEEE80211_M_WDS)
 			qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos;
 		else
 			qos = ((struct ieee80211_qosframe *) wh)->i_qos;
 		qos[0] = tid & IEEE80211_QOS_TID;
 		if (ic->ic_wme.wme_wmeChanParams.cap_wmeParams[WME_AC_BE].wmep_noackPolicy)
 			qos[0] |= IEEE80211_QOS_ACKPOLICY_NOACK;
 		qos[1] = 0;
 	} else {
 		ieee80211_send_setup(ni, m,
 		    IEEE80211_FC0_TYPE_DATA | IEEE80211_FC0_SUBTYPE_NODATA,
 		    IEEE80211_NONQOS_TID,
 		    vap->iv_myaddr, ni->ni_macaddr, ni->ni_bssid);
 	}
 	if (vap->iv_opmode != IEEE80211_M_WDS) {
 		/* NB: power management bit is never sent by an AP */
 		if ((ni->ni_flags & IEEE80211_NODE_PWR_MGT) &&
 		    vap->iv_opmode != IEEE80211_M_HOSTAP)
 			wh->i_fc[1] |= IEEE80211_FC1_PWR_MGT;
 	}
 	/*
 	 * While scanning, attach a callback to frames for nodes with
 	 * PWR_MGT set; the callback does a wakeup() on the vap.
 	 */
 	if ((ic->ic_flags & IEEE80211_F_SCAN) &&
 	    (ni->ni_flags & IEEE80211_NODE_PWR_MGT)) {
 		ieee80211_add_callback(m, ieee80211_nulldata_transmitted,
 		    NULL);
 	}
 	/* The frame consists of the header only. */
 	m->m_len = m->m_pkthdr.len = hdrlen;
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	M_WME_SETAC(m, WME_AC_BE);
 
 	IEEE80211_NODE_STAT(ni, tx_data);
 
 	IEEE80211_NOTE(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS, ni,
 	    "send %snull data frame on channel %u, pwr mgt %s",
 	    ni->ni_flags & IEEE80211_NODE_QOS ? "QoS " : "",
 	    ieee80211_chan2ieee(ic, ic->ic_curchan),
 	    wh->i_fc[1] & IEEE80211_FC1_PWR_MGT ? "ena" : "dis");
 
 	/* NB: raw_output drops the node reference if the driver fails. */
 	ret = ieee80211_raw_output(vap, ni, m, NULL);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 /*
  * Assign priority to a frame based on any vlan tag assigned
  * to the station and/or any Diffserv setting in an IP header.
  * Finally, if an ACM policy is setup (in station mode) it's
  * applied.
  *
  * The mbuf may hold either an Ethernet frame or, when M_ENCAP is
  * set, an already encapsulated 802.11 frame.  The chosen access
  * category is stored in the mbuf with M_WME_SETAC and EAPOL frames
  * are additionally marked with M_EAPOL.
  *
  * Returns 0 on success and 1 when the frame cannot be classified
  * (encapsulated frame too short or without an LLC/SNAP header, or a
  * vlan tag that is missing or does not match the node's).
  */
 int
 ieee80211_classify(struct ieee80211_node *ni, struct mbuf *m)
 {
 	const struct ether_header *eh = NULL;
 	uint16_t ether_type;
 	int v_wme_ac, d_wme_ac, ac;
 
 	if (__predict_false(m->m_flags & M_ENCAP)) {
 		/* 802.11 frame: the ethertype is taken from the LLC/SNAP header. */
 		struct ieee80211_frame *wh = mtod(m, struct ieee80211_frame *);
 		struct llc *llc;
 		int hdrlen, subtype;
 
 		subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 		/*
 		 * NOTE(review): this is a bit test, not an equality
 		 * test, so any subtype sharing a bit with NODATA
 		 * takes this path -- confirm intended.
 		 */
 		if (subtype & IEEE80211_FC0_SUBTYPE_NODATA) {
 			ac = WME_AC_BE;
 			goto done;
 		}
 
 		hdrlen = ieee80211_hdrsize(wh);
 		if (m->m_pkthdr.len < hdrlen + sizeof(*llc))
 			return 1;
 
 		/* Anything other than LLC/SNAP with a zero OUI is rejected. */
 		llc = (struct llc *)mtodo(m, hdrlen);
 		if (llc->llc_dsap != LLC_SNAP_LSAP ||
 		    llc->llc_ssap != LLC_SNAP_LSAP ||
 		    llc->llc_control != LLC_UI ||
 		    llc->llc_snap.org_code[0] != 0 ||
 		    llc->llc_snap.org_code[1] != 0 ||
 		    llc->llc_snap.org_code[2] != 0)
 			return 1;
 
 		ether_type = llc->llc_snap.ether_type;
 	} else {
 		eh = mtod(m, struct ether_header *);
 		ether_type = eh->ether_type;
 	}
 
 	/*
 	 * Always promote PAE/EAPOL frames to high priority.
 	 */
 	if (ether_type == htons(ETHERTYPE_PAE)) {
 		/* NB: mark so others don't need to check header */
 		m->m_flags |= M_EAPOL;
 		ac = WME_AC_VO;
 		goto done;
 	}
 	/*
 	 * Non-qos traffic goes to BE.
 	 */
 	if ((ni->ni_flags & IEEE80211_NODE_QOS) == 0) {
 		ac = WME_AC_BE;
 		goto done;
 	}
 
 	/* 
 	 * If node has a vlan tag then all traffic
 	 * to it must have a matching tag.
 	 */
 	v_wme_ac = 0;
 	if (ni->ni_vlan != 0) {
 		 if ((m->m_flags & M_VLANTAG) == 0) {
 			IEEE80211_NODE_STAT(ni, tx_novlantag);
 			return 1;
 		}
 		if (EVL_VLANOFTAG(m->m_pkthdr.ether_vtag) !=
 		    EVL_VLANOFTAG(ni->ni_vlan)) {
 			IEEE80211_NODE_STAT(ni, tx_vlanmismatch);
 			return 1;
 		}
 		/* map vlan priority to AC */
 		v_wme_ac = TID_TO_WME_AC(EVL_PRIOFTAG(ni->ni_vlan));
 	}
 
 	/*
 	 * Encapsulated frames have no Ethernet header to inspect, so
 	 * they skip the IP lookup and get the default (BE) diffserv
 	 * class by jumping to the label inside the switch below.
 	 */
 	if (eh == NULL)
 		goto no_eh;
 
 	/* XXX m_copydata may be too slow for fast path */
 	switch (ntohs(eh->ether_type)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		uint8_t tos;
 		/*
 		 * IP frame, map the DSCP bits from the TOS field.
 		 */
 		/* NB: ip header may not be in first mbuf */
 		m_copydata(m, sizeof(struct ether_header) +
 		    offsetof(struct ip, ip_tos), sizeof(tos), &tos);
 		tos >>= 5;		/* NB: ECN + low 3 bits of DSCP */
 		d_wme_ac = TID_TO_WME_AC(tos);
 		break;
 	}
 #endif
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		uint32_t flow;
 		uint8_t tos;
 		/*
 		 * IPv6 frame, map the DSCP bits from the traffic class field.
 		 */
 		m_copydata(m, sizeof(struct ether_header) +
 		    offsetof(struct ip6_hdr, ip6_flow), sizeof(flow),
 		    (caddr_t) &flow);
 		tos = (uint8_t)(ntohl(flow) >> 20);
 		tos >>= 5;		/* NB: ECN + low 3 bits of DSCP */
 		d_wme_ac = TID_TO_WME_AC(tos);
 		break;
 	}
 #endif
 	default:
 no_eh:
 		d_wme_ac = WME_AC_BE;
 		break;
 	}
 
 	/*
 	 * Use highest priority AC.
 	 */
 	if (v_wme_ac > d_wme_ac)
 		ac = v_wme_ac;
 	else
 		ac = d_wme_ac;
 
 	/*
 	 * Apply ACM policy.
 	 */
 	if (ni->ni_vap->iv_opmode == IEEE80211_M_STA) {
 		/* Next AC to fall back to, indexed by the current AC. */
 		static const int acmap[4] = {
 			WME_AC_BK,	/* WME_AC_BE */
 			WME_AC_BK,	/* WME_AC_BK */
 			WME_AC_BE,	/* WME_AC_VI */
 			WME_AC_VI,	/* WME_AC_VO */
 		};
 		struct ieee80211com *ic = ni->ni_ic;
 
 		/* Step down while the chosen AC has ACM set, stopping at BK. */
 		while (ac != WME_AC_BK &&
 		    ic->ic_wme.wme_wmeBssChanParams.cap_wmeParams[ac].wmep_acm)
 			ac = acmap[ac];
 	}
 done:
 	M_WME_SETAC(m, ac);
 	return 0;
 }
 
 /*
  * Ensure there is sufficient contiguous space to encapsulate the
  * 802.11 data frame.  If room isn't already there, arrange for it.
  * Drivers and cipher modules assume we have done the necessary work
  * and fail rudely if they don't find the space they need.
  *
  * vap	   supplies the driver headroom (ic_headroom) and statistics
  * hdrsize size of the 802.11 header that will be prepended
  * key	   transmit key or NULL; its cipher header size is added to
  *	   the space needed and, for software crypto, the chain is
  *	   made writable
  * m	   frame that still starts with its Ethernet header
  *
  * Returns the (possibly new) head of the mbuf chain, or NULL on
  * allocation failure, in which case is_tx_nobuf is incremented.
  */
 struct mbuf *
 ieee80211_mbuf_adjust(struct ieee80211vap *vap, int hdrsize,
 	struct ieee80211_key *key, struct mbuf *m)
 {
 /* Bytes freed when the Ethernet header is replaced by an LLC header. */
 #define	TO_BE_RECLAIMED	(sizeof(struct ether_header) - sizeof(struct llc))
 	int needed_space = vap->iv_ic->ic_headroom + hdrsize;
 
 	if (key != NULL) {
 		/* XXX belongs in crypto code? */
 		needed_space += key->wk_cipher->ic_header;
 		/* XXX frags */
 		/*
 		 * When crypto is being done in the host we must ensure
 		 * the data are writable for the cipher routines; clone
 		 * a writable mbuf chain.
 		 * XXX handle SWMIC specially
 		 */
 		if (key->wk_flags & (IEEE80211_KEY_SWENCRYPT|IEEE80211_KEY_SWENMIC)) {
 			m = m_unshare(m, IEEE80211_M_NOWAIT);
 			if (m == NULL) {
 				/*
 				 * NOTE(review): nothing is freed here;
 				 * presumably m_unshare() disposes of the
 				 * chain on failure -- confirm.
 				 */
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 				    "%s: cannot get writable mbuf\n", __func__);
 				vap->iv_stats.is_tx_nobuf++; /* XXX new stat */
 				return NULL;
 			}
 		}
 	}
 	/*
 	 * We know we are called just before stripping an Ethernet
 	 * header and prepending an LLC header.  This means we know
 	 * there will be
 	 *	sizeof(struct ether_header) - sizeof(struct llc)
 	 * bytes recovered to which we need additional space for the
 	 * 802.11 header and any crypto header.
 	 */
 	/* XXX check trailing space and copy instead? */
 	/*
 	 * NOTE(review): TO_BE_RECLAIMED is built from sizeof, so the
 	 * subtraction below is done in an unsigned type and relies on
 	 * needed_space being at least TO_BE_RECLAIMED.
 	 */
 	if (M_LEADINGSPACE(m) < needed_space - TO_BE_RECLAIMED) {
 		struct mbuf *n = m_gethdr(IEEE80211_M_NOWAIT, m->m_type);
 		if (n == NULL) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_OUTPUT,
 			    "%s: cannot expand storage\n", __func__);
 			vap->iv_stats.is_tx_nobuf++;
 			m_freem(m);
 			return NULL;
 		}
 		KASSERT(needed_space <= MHLEN,
 		    ("not enough room, need %u got %d\n", needed_space, MHLEN));
 		/*
 		 * Setup new mbuf to have leading space to prepend the
 		 * 802.11 header and any crypto header bits that are
 		 * required (the latter are added when the driver calls
 		 * back to ieee80211_crypto_encap to do crypto encapsulation).
 		 */
 		/* NB: must be first 'cuz it clobbers m_data */
 		m_move_pkthdr(n, m);
 		n->m_len = 0;			/* NB: m_gethdr does not set */
 		n->m_data += needed_space;
 		/*
 		 * Pull up Ethernet header to create the expected layout.
 		 * We could use m_pullup but that's overkill (i.e. we don't
 		 * need the actual data) and it cannot fail so do it inline
 		 * for speed.
 		 */
 		/* NB: struct ether_header is known to be contiguous */
 		/*
 		 * Only the lengths and data pointer are adjusted; the
 		 * header bytes themselves are not copied into n.
 		 */
 		n->m_len += sizeof(struct ether_header);
 		m->m_len -= sizeof(struct ether_header);
 		m->m_data += sizeof(struct ether_header);
 		/*
 		 * Replace the head of the chain.
 		 */
 		n->m_next = m;
 		m = n;
 	}
 	return m;
 #undef TO_BE_RECLAIMED
 }
 
 /*
  * Select the key used to transmit a unicast frame to a node.
  * The node's own unicast key wins when one is defined; otherwise the
  * vap's default transmit key is used.  NULL is returned when neither
  * is available.
  */
 static __inline struct ieee80211_key *
 ieee80211_crypto_getucastkey(struct ieee80211vap *vap,
 	struct ieee80211_node *ni)
 {
 	struct ieee80211_key *defkey;
 
 	if (!IEEE80211_KEY_UNDEFINED(&ni->ni_ucastkey))
 		return &ni->ni_ucastkey;
 
 	/* No per-node key: fall back to the default tx key, if any. */
 	if (vap->iv_def_txkey == IEEE80211_KEYIX_NONE)
 		return NULL;
 
 	defkey = &vap->iv_nw_keys[vap->iv_def_txkey];
 	if (IEEE80211_KEY_UNDEFINED(defkey))
 		return NULL;
 
 	return defkey;
 }
 
 /*
  * Select the key used to transmit a multicast frame.
  * Multicast traffic always uses the group key, which is installed
  * as the vap's default tx key; NULL is returned when no default key
  * index is set or that key is undefined.  The node is not consulted.
  */
 static __inline struct ieee80211_key *
 ieee80211_crypto_getmcastkey(struct ieee80211vap *vap,
 	struct ieee80211_node *ni)
 {
 	struct ieee80211_key *grpkey;
 
 	if (vap->iv_def_txkey == IEEE80211_KEYIX_NONE)
 		return NULL;
 
 	grpkey = &vap->iv_nw_keys[vap->iv_def_txkey];
 	if (IEEE80211_KEY_UNDEFINED(grpkey))
 		return NULL;
 
 	return grpkey;
 }
 
 /*
  * Encapsulate an outbound data frame.  The mbuf chain is updated.
  * If an error is encountered NULL is returned.  The caller is required
  * to provide a node reference and pullup the ethernet header in the
  * first mbuf.
  *
  * NB: Packet is assumed to be processed by ieee80211_classify which
  *     marked EAPOL frames w/ M_EAPOL.
  */
 struct mbuf *
 ieee80211_encap(struct ieee80211vap *vap, struct ieee80211_node *ni,
     struct mbuf *m)
 {
 #define	WH4(wh)	((struct ieee80211_frame_addr4 *)(wh))
 #define MC01(mc)	((struct ieee80211_meshcntl_ae01 *)mc)
 	struct ieee80211com *ic = ni->ni_ic;
 #ifdef IEEE80211_SUPPORT_MESH
 	struct ieee80211_mesh_state *ms = vap->iv_mesh;
 	struct ieee80211_meshcntl_ae10 *mc;
 	struct ieee80211_mesh_route *rt = NULL;
 	int dir = -1;
 #endif
 	struct ether_header eh;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct llc *llc;
 	int hdrsize, hdrspace, datalen, addqos, txfrag, is4addr, is_mcast;
 	ieee80211_seq seqno;
 	int meshhdrsize, meshae;
 	uint8_t *qos;
 	int is_amsdu = 0;
 
 	IEEE80211_TX_LOCK_ASSERT(ic);
 
 	is_mcast = !! (m->m_flags & (M_MCAST | M_BCAST));
 
 	/*
 	 * Copy existing Ethernet header to a safe place.  The
 	 * rest of the code assumes it's ok to strip it when
 	 * reorganizing state for the final encapsulation.
 	 */
 	KASSERT(m->m_len >= sizeof(eh), ("no ethernet header!"));
 	ETHER_HEADER_COPY(&eh, mtod(m, caddr_t));
 
 	/*
 	 * Insure space for additional headers.  First identify
 	 * transmit key to use in calculating any buffer adjustments
 	 * required.  This is also used below to do privacy
 	 * encapsulation work.  Then calculate the 802.11 header
 	 * size and any padding required by the driver.
 	 *
 	 * Note key may be NULL if we fall back to the default
 	 * transmit key and that is not set.  In that case the
 	 * buffer may not be expanded as needed by the cipher
 	 * routines, but they will/should discard it.
 	 */
 	if (vap->iv_flags & IEEE80211_F_PRIVACY) {
 		if (vap->iv_opmode == IEEE80211_M_STA ||
 		    !IEEE80211_IS_MULTICAST(eh.ether_dhost) ||
 		    (vap->iv_opmode == IEEE80211_M_WDS &&
 		     (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY))) {
 			key = ieee80211_crypto_getucastkey(vap, ni);
 		} else if ((vap->iv_opmode == IEEE80211_M_WDS) &&
 		    (! (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY))) {
 			/*
 			 * Use ucastkey for DWDS transmit nodes, multicast
 			 * or otherwise.
 			 *
 			 * This is required to ensure that multicast frames
 			 * from a DWDS AP to a DWDS STA is encrypted with
 			 * a key that can actually work.
 			 *
 			 * There's no default key for multicast traffic
 			 * on a DWDS WDS VAP node (note NOT the DWDS enabled
 			 * AP VAP, the dynamically created per-STA WDS node)
 			 * so encap fails and transmit fails.
 			 */
 			key = ieee80211_crypto_getucastkey(vap, ni);
 		} else {
 			key = ieee80211_crypto_getmcastkey(vap, ni);
 		}
 		if (key == NULL && (m->m_flags & M_EAPOL) == 0) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_CRYPTO,
 			    eh.ether_dhost,
 			    "no default transmit key (%s) deftxkey %u",
 			    __func__, vap->iv_def_txkey);
 			vap->iv_stats.is_tx_nodefkey++;
 			goto bad;
 		}
 	} else
 		key = NULL;
 	/*
 	 * XXX Some ap's don't handle QoS-encapsulated EAPOL
 	 * frames so suppress use.  This may be an issue if other
 	 * ap's require all data frames to be QoS-encapsulated
 	 * once negotiated in which case we'll need to make this
 	 * configurable.
 	 *
 	 * Don't send multicast QoS frames.
 	 * Technically multicast frames can be QoS if all stations in the
 	 * BSS are also QoS.
 	 *
 	 * NB: mesh data frames are QoS, including multicast frames.
 	 */
 	addqos =
 	    (((is_mcast == 0) && (ni->ni_flags &
 	     (IEEE80211_NODE_QOS|IEEE80211_NODE_HT))) ||
 	    (vap->iv_opmode == IEEE80211_M_MBSS)) &&
 	    (m->m_flags & M_EAPOL) == 0;
 
 	if (addqos)
 		hdrsize = sizeof(struct ieee80211_qosframe);
 	else
 		hdrsize = sizeof(struct ieee80211_frame);
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS) {
 		/*
 		 * Mesh data frames are encapsulated according to the
 		 * rules of Section 11B.8.5 (p.139 of D3.0 spec).
 		 * o Group Addressed data (aka multicast) originating
 		 *   at the local sta are sent w/ 3-address format and
 		 *   address extension mode 00
 		 * o Individually Addressed data (aka unicast) originating
 		 *   at the local sta are sent w/ 4-address format and
 		 *   address extension mode 00
 		 * o Group Addressed data forwarded from a non-mesh sta are
 		 *   sent w/ 3-address format and address extension mode 01
 		 * o Individually Address data from another sta are sent
 		 *   w/ 4-address format and address extension mode 10
 		 */
 		is4addr = 0;		/* NB: don't use, disable */
 		if (!IEEE80211_IS_MULTICAST(eh.ether_dhost)) {
 			rt = ieee80211_mesh_rt_find(vap, eh.ether_dhost);
 			KASSERT(rt != NULL, ("route is NULL"));
 			dir = IEEE80211_FC1_DIR_DSTODS;
 			hdrsize += IEEE80211_ADDR_LEN;
 			if (rt->rt_flags & IEEE80211_MESHRT_FLAGS_PROXY) {
 				if (IEEE80211_ADDR_EQ(rt->rt_mesh_gate,
 				    vap->iv_myaddr)) {
 					IEEE80211_NOTE_MAC(vap,
 					    IEEE80211_MSG_MESH,
 					    eh.ether_dhost,
 					    "%s", "trying to send to ourself");
 					goto bad;
 				}
 				meshae = IEEE80211_MESH_AE_10;
 				meshhdrsize =
 				    sizeof(struct ieee80211_meshcntl_ae10);
 			} else {
 				meshae = IEEE80211_MESH_AE_00;
 				meshhdrsize =
 				    sizeof(struct ieee80211_meshcntl);
 			}
 		} else {
 			dir = IEEE80211_FC1_DIR_FROMDS;
 			if (!IEEE80211_ADDR_EQ(eh.ether_shost, vap->iv_myaddr)) {
 				/* proxy group */
 				meshae = IEEE80211_MESH_AE_01;
 				meshhdrsize =
 				    sizeof(struct ieee80211_meshcntl_ae01);
 			} else {
 				/* group */
 				meshae = IEEE80211_MESH_AE_00;
 				meshhdrsize = sizeof(struct ieee80211_meshcntl);
 			}
 		}
 	} else {
 #endif
 		/*
 		 * 4-address frames need to be generated for:
 		 * o packets sent through a WDS vap (IEEE80211_M_WDS)
 		 * o packets sent through a vap marked for relaying
 		 *   (e.g. a station operating with dynamic WDS)
 		 */
 		is4addr = vap->iv_opmode == IEEE80211_M_WDS ||
 		    ((vap->iv_flags_ext & IEEE80211_FEXT_4ADDR) &&
 		     !IEEE80211_ADDR_EQ(eh.ether_shost, vap->iv_myaddr));
 		if (is4addr)
 			hdrsize += IEEE80211_ADDR_LEN;
 		meshhdrsize = meshae = 0;
 #ifdef IEEE80211_SUPPORT_MESH
 	}
 #endif
 	/*
 	 * Honor driver DATAPAD requirement.
 	 */
 	if (ic->ic_flags & IEEE80211_F_DATAPAD)
 		hdrspace = roundup(hdrsize, sizeof(uint32_t));
 	else
 		hdrspace = hdrsize;
 
 	if (__predict_true((m->m_flags & M_FF) == 0)) {
 		/*
 		 * Normal frame.
 		 */
 		m = ieee80211_mbuf_adjust(vap, hdrspace + meshhdrsize, key, m);
 		if (m == NULL) {
 			/* NB: ieee80211_mbuf_adjust handles msgs+statistics */
 			goto bad;
 		}
 		/* NB: this could be optimized 'cuz of ieee80211_mbuf_adjust */
 		m_adj(m, sizeof(struct ether_header) - sizeof(struct llc));
 		llc = mtod(m, struct llc *);
 		llc->llc_dsap = llc->llc_ssap = LLC_SNAP_LSAP;
 		llc->llc_control = LLC_UI;
 		llc->llc_snap.org_code[0] = 0;
 		llc->llc_snap.org_code[1] = 0;
 		llc->llc_snap.org_code[2] = 0;
 		llc->llc_snap.ether_type = eh.ether_type;
 	} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 		/*
 		 * Aggregated frame.  Check if it's for AMSDU or FF.
 		 *
 		 * XXX TODO: IEEE80211_NODE_AMSDU* isn't implemented
 		 * anywhere for some reason.  But, since 11n requires
 		 * AMSDU RX, we can just assume "11n" == "AMSDU".
 		 */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG, "%s: called; M_FF\n", __func__);
 		if (ieee80211_amsdu_tx_ok(ni)) {
 			m = ieee80211_amsdu_encap(vap, m, hdrspace + meshhdrsize, key);
 			is_amsdu = 1;
 		} else {
 			m = ieee80211_ff_encap(vap, m, hdrspace + meshhdrsize, key);
 		}
 		if (m == NULL)
 #endif
 			goto bad;
 	}
 	datalen = m->m_pkthdr.len;		/* NB: w/o 802.11 header */
 
 	M_PREPEND(m, hdrspace + meshhdrsize, IEEE80211_M_NOWAIT);
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		goto bad;
 	}
 	wh = mtod(m, struct ieee80211_frame *);
 	wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_DATA;
 	*(uint16_t *)wh->i_dur = 0;
 	qos = NULL;	/* NB: quiet compiler */
 	if (is4addr) {
 		wh->i_fc[1] = IEEE80211_FC1_DIR_DSTODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_macaddr);
 		IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 		IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost);
 		IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, eh.ether_shost);
 	} else switch (vap->iv_opmode) {
 	case IEEE80211_M_STA:
 		wh->i_fc[1] = IEEE80211_FC1_DIR_TODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, ni->ni_bssid);
 		IEEE80211_ADDR_COPY(wh->i_addr2, eh.ether_shost);
 		IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_dhost);
 		break;
 	case IEEE80211_M_IBSS:
 	case IEEE80211_M_AHDEMO:
 		wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost);
 		IEEE80211_ADDR_COPY(wh->i_addr2, eh.ether_shost);
 		/*
 		 * NB: always use the bssid from iv_bss as the
 		 *     neighbor's may be stale after an ibss merge
 		 */
 		IEEE80211_ADDR_COPY(wh->i_addr3, vap->iv_bss->ni_bssid);
 		break;
 	case IEEE80211_M_HOSTAP:
 		wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 		IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost);
 		IEEE80211_ADDR_COPY(wh->i_addr2, ni->ni_bssid);
 		IEEE80211_ADDR_COPY(wh->i_addr3, eh.ether_shost);
 		break;
 #ifdef IEEE80211_SUPPORT_MESH
 	case IEEE80211_M_MBSS:
 		/* NB: offset by hdrspace to deal with DATAPAD */
 		mc = (struct ieee80211_meshcntl_ae10 *)
 		     (mtod(m, uint8_t *) + hdrspace);
 		wh->i_fc[1] = dir;
 		switch (meshae) {
 		case IEEE80211_MESH_AE_00:	/* no proxy */
 			mc->mc_flags = 0;
 			if (dir == IEEE80211_FC1_DIR_DSTODS) { /* ucast */
 				IEEE80211_ADDR_COPY(wh->i_addr1,
 				    ni->ni_macaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr3,
 				    eh.ether_dhost);
 				IEEE80211_ADDR_COPY(WH4(wh)->i_addr4,
 				    eh.ether_shost);
 				qos =((struct ieee80211_qosframe_addr4 *)
 				    wh)->i_qos;
 			} else if (dir == IEEE80211_FC1_DIR_FROMDS) {
 				 /* mcast */
 				IEEE80211_ADDR_COPY(wh->i_addr1,
 				    eh.ether_dhost);
 				IEEE80211_ADDR_COPY(wh->i_addr2,
 				    vap->iv_myaddr);
 				IEEE80211_ADDR_COPY(wh->i_addr3,
 				    eh.ether_shost);
 				qos = ((struct ieee80211_qosframe *)
 				    wh)->i_qos;
 			}
 			break;
 		case IEEE80211_MESH_AE_01:	/* mcast, proxy */
 			wh->i_fc[1] = IEEE80211_FC1_DIR_FROMDS;
 			IEEE80211_ADDR_COPY(wh->i_addr1, eh.ether_dhost);
 			IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(wh->i_addr3, vap->iv_myaddr);
 			mc->mc_flags = 1;
 			IEEE80211_ADDR_COPY(MC01(mc)->mc_addr4,
 			    eh.ether_shost);
 			qos = ((struct ieee80211_qosframe *) wh)->i_qos;
 			break;
 		case IEEE80211_MESH_AE_10:	/* ucast, proxy */
 			KASSERT(rt != NULL, ("route is NULL"));
 			IEEE80211_ADDR_COPY(wh->i_addr1, rt->rt_nexthop);
 			IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 			IEEE80211_ADDR_COPY(wh->i_addr3, rt->rt_mesh_gate);
 			IEEE80211_ADDR_COPY(WH4(wh)->i_addr4, vap->iv_myaddr);
 			mc->mc_flags = IEEE80211_MESH_AE_10;
 			IEEE80211_ADDR_COPY(mc->mc_addr5, eh.ether_dhost);
 			IEEE80211_ADDR_COPY(mc->mc_addr6, eh.ether_shost);
 			qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos;
 			break;
 		default:
 			KASSERT(0, ("meshae %d", meshae));
 			break;
 		}
 		mc->mc_ttl = ms->ms_ttl;
 		ms->ms_seq++;
 		le32enc(mc->mc_seq, ms->ms_seq);
 		break;
 #endif
 	case IEEE80211_M_WDS:		/* NB: is4addr should always be true */
 	default:
 		goto bad;
 	}
 	if (m->m_flags & M_MORE_DATA)
 		wh->i_fc[1] |= IEEE80211_FC1_MORE_DATA;
 	if (addqos) {
 		int ac, tid;
 
 		if (is4addr) {
 			qos = ((struct ieee80211_qosframe_addr4 *) wh)->i_qos;
 		/* NB: mesh case handled earlier */
 		} else if (vap->iv_opmode != IEEE80211_M_MBSS)
 			qos = ((struct ieee80211_qosframe *) wh)->i_qos;
 		ac = M_WME_GETAC(m);
 		/* map from access class/queue to 11e header priorty value */
 		tid = WME_AC_TO_TID(ac);
 		qos[0] = tid & IEEE80211_QOS_TID;
 		if (ic->ic_wme.wme_wmeChanParams.cap_wmeParams[ac].wmep_noackPolicy)
 			qos[0] |= IEEE80211_QOS_ACKPOLICY_NOACK;
 #ifdef IEEE80211_SUPPORT_MESH
 		if (vap->iv_opmode == IEEE80211_M_MBSS)
 			qos[1] = IEEE80211_QOS_MC;
 		else
 #endif
 			qos[1] = 0;
 		wh->i_fc[0] |= IEEE80211_FC0_SUBTYPE_QOS_DATA;
 
 		/*
 		 * If this is an A-MSDU then ensure we set the
 		 * relevant field.
 		 */
 		if (is_amsdu)
 			qos[0] |= IEEE80211_QOS_AMSDU;
 
 		/*
 		 * XXX TODO TX lock is needed for atomic updates of sequence
 		 * numbers.  If the driver does it, then don't do it here;
 		 * and we don't need the TX lock held.
 		 */
 		if ((m->m_flags & M_AMPDU_MPDU) == 0) {
 			/*
 			 * 802.11-2012 9.3.2.10 -
 			 *
 			 * If this is a multicast frame then we need
 			 * to ensure that the sequence number comes from
 			 * a separate seqno space and not the TID space.
 			 *
 			 * Otherwise multicast frames may actually cause
 			 * holes in the TX blockack window space and
 			 * upset various things.
 			 */
 			if (IEEE80211_IS_MULTICAST(wh->i_addr1))
 				seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 			else
 				seqno = ni->ni_txseqs[tid]++;
 
 			/*
 			 * NB: don't assign a sequence # to potential
 			 * aggregates; we expect this happens at the
 			 * point the frame comes off any aggregation q
 			 * as otherwise we may introduce holes in the
 			 * BA sequence space and/or make window accouting
 			 * more difficult.
 			 *
 			 * XXX may want to control this with a driver
 			 * capability; this may also change when we pull
 			 * aggregation up into net80211
 			 */
 			*(uint16_t *)wh->i_seq =
 			    htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 			M_SEQNO_SET(m, seqno);
 		} else {
 			/* NB: zero out i_seq field (for s/w encryption etc) */
 			*(uint16_t *)wh->i_seq = 0;
 		}
 	} else {
 		/*
 		 * XXX TODO TX lock is needed for atomic updates of sequence
 		 * numbers.  If the driver does it, then don't do it here;
 		 * and we don't need the TX lock held.
 		 */
 		seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 		*(uint16_t *)wh->i_seq =
 		    htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 		M_SEQNO_SET(m, seqno);
 
 		/*
 		 * XXX TODO: we shouldn't allow EAPOL, etc that would
 		 * be forced to be non-QoS traffic to be A-MSDU encapsulated.
 		 */
 		if (is_amsdu)
 			printf("%s: XXX ERROR: is_amsdu set; not QoS!\n",
 			    __func__);
 	}
 
 	/*
 	 * Check if xmit fragmentation is required.
 	 *
 	 * If the hardware does fragmentation offload, then don't bother
 	 * doing it here.
 	 */
 	if (IEEE80211_CONF_FRAG_OFFLOAD(ic))
 		txfrag = 0;
 	else
 		txfrag = (m->m_pkthdr.len > vap->iv_fragthreshold &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1) &&
 		    (vap->iv_caps & IEEE80211_C_TXFRAG) &&
 		    (m->m_flags & (M_FF | M_AMPDU_MPDU)) == 0);
 
 	if (key != NULL) {
 		/*
 		 * IEEE 802.1X: send EAPOL frames always in the clear.
 		 * WPA/WPA2: encrypt EAPOL keys when pairwise keys are set.
 		 */
 		if ((m->m_flags & M_EAPOL) == 0 ||
 		    ((vap->iv_flags & IEEE80211_F_WPA) &&
 		     (vap->iv_opmode == IEEE80211_M_STA ?
 		      !IEEE80211_KEY_UNDEFINED(key) :
 		      !IEEE80211_KEY_UNDEFINED(&ni->ni_ucastkey)))) {
 			wh->i_fc[1] |= IEEE80211_FC1_PROTECTED;
 			if (!ieee80211_crypto_enmic(vap, key, m, txfrag)) {
 				IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_OUTPUT,
 				    eh.ether_dhost,
 				    "%s", "enmic failed, discard frame");
 				vap->iv_stats.is_crypto_enmicfail++;
 				goto bad;
 			}
 		}
 	}
 	if (txfrag && !ieee80211_fragment(vap, m, hdrsize,
 	    key != NULL ? key->wk_cipher->ic_header : 0, vap->iv_fragthreshold))
 		goto bad;
 
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	IEEE80211_NODE_STAT(ni, tx_data);
 	if (IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 		IEEE80211_NODE_STAT(ni, tx_mcast);
 		m->m_flags |= M_MCAST;
 	} else
 		IEEE80211_NODE_STAT(ni, tx_ucast);
 	IEEE80211_NODE_STAT_ADD(ni, tx_bytes, datalen);
 
 	return m;
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return NULL;
 #undef WH4
 #undef MC01
 }
 
 /*
  * Release every packet on a list linked through m_nextpkt.
  * Each packet is unlinked from its successor before being handed
  * to m_freem().  A NULL list is accepted and ignored.
  */
 void
 ieee80211_free_mbuf(struct mbuf *m)
 {
 	struct mbuf *following;
 
 	for (; m != NULL; m = following) {
 		/* remember the successor before the link is cleared */
 		following = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m_freem(m);
 	}
 }
 
 /*
  * Fragment the frame according to the specified mtu.
  * The size of the 802.11 header (w/o padding) is provided
  * so we don't need to recalculate it.  We create a new
  * mbuf for each fragment and chain it through m_nextpkt;
  * we might be able to optimize this by reusing the original
  * packet's mbufs but that is significantly more complicated.
  *
  * Arguments:
  *	vap		vap the frame is sent on (statistics, opmode, DATAPAD)
  *	m0		fully formed frame; must not already be chained and
  *			must be longer than mtu
  *	hdrsize		size of the 802.11 header, excluding any padding
  *	ciphdrsize	space reserved in front of each fragment for a
  *			cipher header (0 if none)
  *	mtu		maximum size of a fragment
  *
  * Returns 1 on success: m0 has been trimmed to the first fragment and
  * the remaining fragments hang off m0->m_nextpkt.  Returns 0 if an mbuf
  * could not be allocated: any fragments built so far are freed and m0
  * is left for the caller to free.
  */
 static int
 ieee80211_fragment(struct ieee80211vap *vap, struct mbuf *m0,
 	u_int hdrsize, u_int ciphdrsize, u_int mtu)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_frame *wh, *whf;
 	struct mbuf *m, *prev;
 	u_int totalhdrsize, fragno, fragsize, off, remainder, payload;
 	u_int hdrspace;
 
 	KASSERT(m0->m_nextpkt == NULL, ("mbuf already chained?"));
 	KASSERT(m0->m_pkthdr.len > mtu,
 		("pktlen %u mtu %u", m0->m_pkthdr.len, mtu));
 
 	/*
 	 * Honor driver DATAPAD requirement.
 	 */
 	if (ic->ic_flags & IEEE80211_F_DATAPAD)
 		hdrspace = roundup(hdrsize, sizeof(uint32_t));
 	else
 		hdrspace = hdrsize;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	/* NB: mark the first frag; it will be propagated below */
 	wh->i_fc[1] |= IEEE80211_FC1_MORE_FRAG;
 	totalhdrsize = hdrspace + ciphdrsize;
 	fragno = 1;
 	/* the first fragment keeps the leading (mtu - ciphdrsize) bytes of m0 */
 	off = mtu - ciphdrsize;
 	remainder = m0->m_pkthdr.len - off;
 	prev = m0;
 	do {
 		fragsize = MIN(totalhdrsize + remainder, mtu);
 		m = m_get2(fragsize, IEEE80211_M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			goto bad;
 		/* leave room to prepend any cipher header */
 		m_align(m, fragsize - ciphdrsize);
 
 		/*
 		 * Form the header in the fragment.  Note that since
 		 * we mark the first fragment with the MORE_FRAG bit
 		 * it automatically is propagated to each fragment; we
 		 * need only clear it on the last fragment (done below).
 		 * NB: frag 1+ dont have Mesh Control field present.
 		 */
 		whf = mtod(m, struct ieee80211_frame *);
 		memcpy(whf, wh, hdrsize);
 #ifdef IEEE80211_SUPPORT_MESH
 		/*
 		 * Clear Mesh Control Present in the copy only: the
 		 * original header belongs to the first fragment, which
 		 * still carries the Mesh Control field in its body.
 		 */
 		if (vap->iv_opmode == IEEE80211_M_MBSS)
 			ieee80211_getqos(whf)[1] &= ~IEEE80211_QOS_MC;
 #endif
 		*(uint16_t *)&whf->i_seq[0] |= htole16(
 			(fragno & IEEE80211_SEQ_FRAG_MASK) <<
 				IEEE80211_SEQ_FRAG_SHIFT);
 		fragno++;
 
 		payload = fragsize - totalhdrsize;
 		/* NB: destination is known to be contiguous */
 
 		m_copydata(m0, off, payload, mtod(m, uint8_t *) + hdrspace);
 		m->m_len = hdrspace + payload;
 		m->m_pkthdr.len = hdrspace + payload;
 		m->m_flags |= M_FRAG;
 
 		/* chain up the fragment */
 		prev->m_nextpkt = m;
 		prev = m;
 
 		/* deduct fragment just formed */
 		remainder -= payload;
 		off += payload;
 	} while (remainder != 0);
 
 	/* set the last fragment */
 	m->m_flags |= M_LASTFRAG;
 	whf->i_fc[1] &= ~IEEE80211_FC1_MORE_FRAG;
 
 	/* strip first mbuf now that everything has been copied */
 	m_adj(m0, -(m0->m_pkthdr.len - (mtu - ciphdrsize)));
 	m0->m_flags |= M_FIRSTFRAG | M_FRAG;
 
 	vap->iv_stats.is_tx_fragframes++;
 	vap->iv_stats.is_tx_frags += fragno-1;
 
 	return 1;
 bad:
 	/* reclaim fragments but leave original frame for caller to free */
 	ieee80211_free_mbuf(m0->m_nextpkt);
 	m0->m_nextpkt = NULL;
 	return 0;
 }
 
 /*
  * Append a Supported Rates element to the frame at frm.
  * No more than IEEE80211_RATE_SIZE rates from rs are emitted.
  * Returns the position just past the element.
  */
 uint8_t *
 ieee80211_add_rates(uint8_t *frm, const struct ieee80211_rateset *rs)
 {
 	int count;
 
 	frm[0] = IEEE80211_ELEMID_RATES;
 	count = rs->rs_nrates;
 	/* anything beyond the cap is left out of this element */
 	if (count > IEEE80211_RATE_SIZE)
 		count = IEEE80211_RATE_SIZE;
 	frm[1] = count;
 	memcpy(&frm[2], rs->rs_rates, count);
 	return (frm + 2 + count);
 }
 
 /*
  * Append an Extended Supported Rates element to the frame at frm.
  * The element holds the rates in rs beyond the first
  * IEEE80211_RATE_SIZE; when there are none nothing is written and
  * frm is returned unchanged.
  */
 uint8_t *
 ieee80211_add_xrates(uint8_t *frm, const struct ieee80211_rateset *rs)
 {
 	int extra;
 
 	/* everything fits in the basic rates element */
 	if (rs->rs_nrates <= IEEE80211_RATE_SIZE)
 		return (frm);
 
 	extra = rs->rs_nrates - IEEE80211_RATE_SIZE;
 	frm[0] = IEEE80211_ELEMID_XRATES;
 	frm[1] = extra;
 	memcpy(&frm[2], &rs->rs_rates[IEEE80211_RATE_SIZE], extra);
 	return (frm + 2 + extra);
 }
 
 /*
  * Append an SSID element (id, length, len bytes of ssid) to the
  * frame at frm and return the position just past it.
  */
 uint8_t *
 ieee80211_add_ssid(uint8_t *frm, const uint8_t *ssid, u_int len)
 {
 	uint8_t *body = frm + 2;
 
 	frm[0] = IEEE80211_ELEMID_SSID;
 	frm[1] = len;
 	memcpy(body, ssid, len);
 	return (body + len);
 }
 
 /*
  * Append an ERP Information element to the frame at frm and
  * return the position just past it.
  */
 static uint8_t *
 ieee80211_add_erp(uint8_t *frm, struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	uint8_t info = 0;
 
 	/*
 	 * TODO: the bits below are derived from the global (ic) flags.
 	 * The per-VAP flags are fine for per-VAP state but don't take
 	 * into account which VAPs share a channel and which are on
 	 * different channels.  ERP and HT/VHT protection mode is a
 	 * function of how many stations are on a channel, not of the
 	 * VAP or the device as a whole; until that state exists the
 	 * global flags will have to do.
 	 */
 	if (ic->ic_flags_ext & IEEE80211_FEXT_NONERP_PR)
 		info |= IEEE80211_ERP_NON_ERP_PRESENT;
 	if (ic->ic_flags & IEEE80211_F_USEPROT)
 		info |= IEEE80211_ERP_USE_PROTECTION;
 	if (ic->ic_flags & IEEE80211_F_USEBARKER)
 		info |= IEEE80211_ERP_LONG_PREAMBLE;
 
 	frm[0] = IEEE80211_ELEMID_ERP;
 	frm[1] = 1;			/* one byte of ERP info follows */
 	frm[2] = info;
 	return (frm + 3);
 }
 
 /*
  * Append a CF Parameter Set element with fixed contents to the
  * frame at frm and return the position just past it.  The ic
  * argument is not consulted.
  */
 static uint8_t *
 ieee80211_add_cfparms(uint8_t *frm, struct ieee80211com *ic)
 {
 	frm[0] = IEEE80211_ELEMID_CFPARMS;
 	frm[1] = 6;			/* length of the fields below */
 	frm[2] = 0;			/* CFP count */
 	frm[3] = 2;			/* CFP period */
 	le16enc(&frm[4], 0);		/* CFP MaxDuration (TU) */
 	le16enc(&frm[6], 0);		/* CFP CurRemaining (TU) */
 	return (frm + 8);
 }
 
 /*
  * Copy the ie_len bytes of application-supplied IE data in ie to
  * frm; returns the position just past the copied bytes.
  */
 static __inline uint8_t *
 add_appie(uint8_t *frm, const struct ieee80211_appie *ie)
 {
 	uint8_t *end = frm + ie->ie_len;
 
 	memcpy(frm, ie->ie_data, ie->ie_len);
 	return (end);
 }
 
 /*
  * Copy one complete information element to frm.  The element's
  * size is its two-byte header plus the body length stored in ie[1].
  * Returns the position just past the copied element.
  */
 static __inline uint8_t *
 add_ie(uint8_t *frm, const uint8_t *ie)
 {
 	const size_t total = 2 + ie[1];
 
 	memcpy(frm, ie, total);
 	return (frm + total);
 }
 
 #define	WME_OUI_BYTES		0x00, 0x50, 0xf2
 /*
  * Append a WME information element to the frame at frm and return
  * the position just past it.  The trailing QoS info byte is chosen
  * by the operating mode of the node's vap.
  */
 uint8_t *
 ieee80211_add_wme_info(uint8_t *frm, struct ieee80211_wme_state *wme,
     struct ieee80211_node *ni)
 {
 	static const uint8_t oui[4] = { WME_OUI_BYTES, WME_OUI_TYPE };
 	struct ieee80211vap *vap = ni->ni_vap;
 	uint8_t *p = frm;
 
 	/* element header, OUI + type, subtype and version */
 	p[0] = IEEE80211_ELEMID_VENDOR;
 	p[1] = sizeof(struct ieee80211_wme_info) - 2;
 	p += 2;
 	memcpy(p, oui, sizeof(oui));
 	p += sizeof(oui);
 	p[0] = WME_INFO_OUI_SUBTYPE;
 	p[1] = WME_VERSION;
 	p += 2;
 
 	/* QoS info field depends upon operating mode */
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP) {
 		*p = wme->wme_bssChanParams.cap_info;
 		if (vap->iv_flags_ext & IEEE80211_FEXT_UAPSD)
 			*p |= WME_CAPINFO_UAPSD_EN;
 	} else if (vap->iv_opmode == IEEE80211_M_STA) {
 		/*
 		 * NB: UAPSD drivers must set this up in their
 		 * VAP creation method.
 		 */
 		*p = vap->iv_uapsdinfo;
 	} else {
 		*p = 0;
 	}
 
 	return (p + 1);
 }
 
 /*
  * Add a WME parameters element to a frame.
  */
 static uint8_t *
 ieee80211_add_wme_param(uint8_t *frm, struct ieee80211_wme_state *wme,
     int uapsd_enable)
 {
 #define	ADDSHORT(frm, v) do {	\
 	le16enc(frm, v);	\
 	frm += 2;		\
 } while (0)
 	/* NB: this works 'cuz a param has an info at the front */
 	static const struct ieee80211_wme_info param = {
 		.wme_id		= IEEE80211_ELEMID_VENDOR,
 		.wme_len	= sizeof(struct ieee80211_wme_param) - 2,
 		.wme_oui	= { WME_OUI_BYTES },
 		.wme_type	= WME_OUI_TYPE,
 		.wme_subtype	= WME_PARAM_OUI_SUBTYPE,
 		.wme_version	= WME_VERSION,
 	};
 	int i;
 
 	memcpy(frm, &param, sizeof(param));
 	frm += __offsetof(struct ieee80211_wme_info, wme_info);
 	*frm = wme->wme_bssChanParams.cap_info;	/* AC info */
 	if (uapsd_enable)
 		*frm |= WME_CAPINFO_UAPSD_EN;
 	frm++;
 	*frm++ = 0;					/* reserved field */
 	/* XXX TODO - U-APSD bits - SP, flags below */
 	for (i = 0; i < WME_NUM_AC; i++) {
 		const struct wmeParams *ac =
 		       &wme->wme_bssChanParams.cap_wmeParams[i];
 		*frm++ = _IEEE80211_SHIFTMASK(i, WME_PARAM_ACI)
 		       | _IEEE80211_SHIFTMASK(ac->wmep_acm, WME_PARAM_ACM)
 		       | _IEEE80211_SHIFTMASK(ac->wmep_aifsn, WME_PARAM_AIFSN)
 		       ;
 		*frm++ = _IEEE80211_SHIFTMASK(ac->wmep_logcwmax,
 			    WME_PARAM_LOGCWMAX)
 		       | _IEEE80211_SHIFTMASK(ac->wmep_logcwmin,
 			    WME_PARAM_LOGCWMIN)
 		       ;
 		ADDSHORT(frm, ac->wmep_txopLimit);
 	}
 	return frm;
 #undef ADDSHORT
 }
 #undef WME_OUI_BYTES
 
 /*
  * Append an 11h Power Constraint element to the frame at frm and
  * return the position just past it.  The constraint is the amount
  * by which the bss channel's ic_maxregpower exceeds half of
  * ic_txpowlimit, or 0 when it does not exceed it.
  */
 static uint8_t *
 ieee80211_add_powerconstraint(uint8_t *frm, struct ieee80211vap *vap)
 {
 	const struct ieee80211_channel *c = vap->iv_bss->ni_chan;
 	/* XXX per-vap tx power limit? */
 	int8_t limit = vap->iv_ic->ic_txpowlimit / 2;
 
 	*frm++ = IEEE80211_ELEMID_PWRCNSTR;
 	*frm++ = 1;
 	if (c->ic_maxregpower > limit)
 		*frm++ = c->ic_maxregpower - limit;
 	else
 		*frm++ = 0;
 	return (frm);
 }
 
 /*
  * Append an 11h Power Capability element carrying the channel's
  * ic_minpower and ic_maxpower to the frame at frm; returns the
  * position just past it.
  */
 static uint8_t *
 ieee80211_add_powercapability(uint8_t *frm, const struct ieee80211_channel *c)
 {
 	*frm++ = IEEE80211_ELEMID_PWRCAP;
 	*frm++ = 2;
 	*frm++ = c->ic_minpower;
 	*frm++ = c->ic_maxpower;
 	return (frm);
 }
 
 /*
  * Append an 11h Supported Channels element to the frame at frm and
  * return the position just past it.  The body is a fixed 26 bytes
  * copied from ic_chan_avail.
  */
 static uint8_t *
 ieee80211_add_supportedchannels(uint8_t *frm, struct ieee80211com *ic)
 {
 	static const int ielen = 26;
 
 	*frm++ = IEEE80211_ELEMID_SUPPCHAN;
 	*frm++ = ielen;
 	/* XXX not correct */
 	memcpy(frm, ic->ic_chan_avail, ielen);
 	return (frm + ielen);
 }
 
 /*
  * Append an 11h Quiet element to the frame at frm and return the
  * position just past it.  When update is non-zero the vap's quiet
  * countdown is advanced before the element is filled in.
  */
 static uint8_t *
 ieee80211_add_quiet(uint8_t *frm, struct ieee80211vap *vap, int update)
 {
 	struct ieee80211_quiet_ie *ie = (struct ieee80211_quiet_ie *)frm;
 
 	/*
 	 * Only update every beacon interval - otherwise probe responses
 	 * would update the quiet count value.
 	 */
 	if (update) {
 		if (vap->iv_quiet_count_value > 1)
 			vap->iv_quiet_count_value--;
 		else if (vap->iv_quiet_count_value == 1)
 			vap->iv_quiet_count_value = vap->iv_quiet_count;
 	}
 
 	/* value 0 is reserved as per the 802.11h standard */
 	if (vap->iv_quiet_count_value == 0)
 		vap->iv_quiet_count_value = 1;
 
 	ie->quiet_ie = IEEE80211_ELEMID_QUIET;
 	ie->len = 6;
 	ie->tbttcount = vap->iv_quiet_count_value;
 	ie->period = vap->iv_quiet_period;
 	ie->duration = htole16(vap->iv_quiet_duration);
 	ie->offset = htole16(vap->iv_quiet_offset);
 	return ((uint8_t *)(ie + 1));
 }
 
 /*
  * Append an 11h Channel Switch Announcement element to the frame
  * at frm and return the position just past it.  The count is the
  * global CSA count less the per-vap CSA count, so this routine can
  * also be used when forming probe response frames and still
  * report the current count.
  */
 static uint8_t *
 ieee80211_add_csa(uint8_t *frm, struct ieee80211vap *vap)
 {
 	struct ieee80211_csa_ie *ie = (struct ieee80211_csa_ie *)frm;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	ie->csa_ie = IEEE80211_ELEMID_CSA;
 	ie->csa_len = 3;
 	ie->csa_mode = 1;		/* XXX force quiet on channel */
 	ie->csa_newchan = ieee80211_chan2ieee(ic, ic->ic_csa_newchan);
 	ie->csa_count = ic->ic_csa_count - vap->iv_csa_count;
 	return ((uint8_t *)(ie + 1));
 }
 
 /*
  * Append the 11h country information element to the frame at frm.
  * The element is cached in ic_countryie and rebuilt on first use or
  * when the bss channel differs from the one it was built for.  If
  * it cannot be allocated nothing is written and frm is returned.
  */
 static uint8_t *
 ieee80211_add_countryie(uint8_t *frm, struct ieee80211com *ic)
 {
 
 	/* fast path: cached element still matches the bss channel */
 	if (ic->ic_countryie != NULL &&
 	    ic->ic_countryie_chan == ic->ic_bsschan)
 		return (add_appie(frm, ic->ic_countryie));
 
 	/*
 	 * Handle lazy construction of ie.  This is done on
 	 * first use and after a channel change that requires
 	 * re-calculation.
 	 */
 	if (ic->ic_countryie != NULL)
 		IEEE80211_FREE(ic->ic_countryie, M_80211_NODE_IE);
 	ic->ic_countryie = ieee80211_alloc_countryie(ic);
 	if (ic->ic_countryie == NULL)
 		return (frm);
 	ic->ic_countryie_chan = ic->ic_bsschan;
 
 	return (add_appie(frm, ic->ic_countryie));
 }
 
 /*
  * Append the vap's WPA element to the frame at frm when WPA1 is
  * enabled and an element is set; otherwise leave the frame alone.
  * Returns the position to continue writing at.
  */
 uint8_t *
 ieee80211_add_wpa(uint8_t *frm, const struct ieee80211vap *vap)
 {
 	/* XXX complain when there is nothing to add? */
 	if ((vap->iv_flags & IEEE80211_F_WPA1) == 0 ||
 	    vap->iv_wpa_ie == NULL)
 		return (frm);
 
 	return (add_ie(frm, vap->iv_wpa_ie));
 }
 
 /*
  * Append the vap's RSN element to the frame at frm when WPA2 is
  * enabled and an element is set; otherwise leave the frame alone.
  * Returns the position to continue writing at.
  */
 uint8_t *
 ieee80211_add_rsn(uint8_t *frm, const struct ieee80211vap *vap)
 {
 	/* XXX complain when there is nothing to add? */
 	if ((vap->iv_flags & IEEE80211_F_WPA2) == 0 ||
 	    vap->iv_rsn_ie == NULL)
 		return (frm);
 
 	return (add_ie(frm, vap->iv_rsn_ie));
 }
 
 /*
  * Append a QoS element with a zero info byte to the frame at frm
  * when the node has IEEE80211_NODE_QOS set; otherwise write
  * nothing.  Returns the position to continue writing at.
  */
 uint8_t *
 ieee80211_add_qos(uint8_t *frm, const struct ieee80211_node *ni)
 {
 	if ((ni->ni_flags & IEEE80211_NODE_QOS) == 0)
 		return (frm);
 
 	frm[0] = IEEE80211_ELEMID_QOS;
 	frm[1] = 1;
 	frm[2] = 0;
 	return (frm + 3);
 }
 
 /*
  * ieee80211_send_probereq(): send a probe request frame with the specified ssid
  * and any optional information element data;  some helper functions as FW based
  * HW scans need some of that information passed too.
  */
 
 /*
  * Return an upper bound on the number of bytes of information
  * elements a probe request built for vap may need.  The SSID and
  * supported rates elements are counted at their maximum size; the
  * optional elements are counted only when the conditions under
  * which they are emitted hold.
  */
 static uint32_t
 ieee80211_probereq_ie_len(struct ieee80211vap *vap, struct ieee80211com *ic)
 {
 	const struct ieee80211_rateset *rs;
 
 	rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 
 	/*
 	 * prreq frame format
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[tlv] extended supported rates (if needed)
 	 *	[tlv] HT cap (optional)
 	 *	[tlv] VHT cap (optional)
 	 *	[tlv] WPA (optional)
 	 *	[tlv] user-specified ie's
 	 */
 	return ( 2 + IEEE80211_NWID_LEN
 	       + 2 + IEEE80211_RATE_SIZE
 	       + ((rs->rs_nrates > IEEE80211_RATE_SIZE) ?
 	           2 + (rs->rs_nrates - IEEE80211_RATE_SIZE) : 0)
 	       + (((vap->iv_opmode == IEEE80211_M_IBSS) &&
 		    (vap->iv_flags_ht & IEEE80211_FHT_HT)) ?
 	                sizeof(struct ieee80211_ie_htcap) : 0)
 #ifdef notyet
 	       + sizeof(struct ieee80211_ie_htinfo)	/* XXX not needed? */
 	       + sizeof(struct ieee80211_ie_vhtcap)
 #endif
 	       /*
 	        * NB: the WPA ie is copied whole, i.e. its two byte
 	        * id/length header plus the iv_wpa_ie[1] body bytes.
 	        */
 	       + ((vap->iv_flags & IEEE80211_F_WPA1 && vap->iv_wpa_ie != NULL) ?
 	           2 + vap->iv_wpa_ie[1] : 0)
 	       + (vap->iv_appie_probereq != NULL ?
 		   vap->iv_appie_probereq->ie_len : 0)
 	);
 }
 
 /*
  * Build the information elements of a probe request.
  *
  * With alloc set a zeroed buffer is allocated (M_80211_VAP, may sleep);
  * *frmp is set to its start and *frmlen to its size.  Without alloc the
  * elements are written to the caller's buffer at *frmp, which must hold
  * at least ieee80211_probereq_ie_len() bytes as reported in *frmlen; on
  * return *frmp points just past the last element written and *frmlen
  * holds the length estimate used.
  *
  * An ssidlen of (size_t)-1 means no SSID element is emitted (and no
  * space is accounted for it); otherwise ssidlen bytes of ssid are
  * added and ssidlen may not exceed IEEE80211_NWID_LEN.
  *
  * Returns 0 on success, EINVAL on a NULL frmp/frmlen or an oversized
  * ssid, and ENOBUFS when the caller's buffer is too small.
  */
 int
 ieee80211_probereq_ie(struct ieee80211vap *vap, struct ieee80211com *ic,
     uint8_t **frmp, uint32_t *frmlen, const uint8_t *ssid, size_t ssidlen,
     bool alloc)
 {
 	const struct ieee80211_rateset *rs;
 	uint8_t	*frm;
 	uint32_t len;
 
 	/* NB: both are written through on the alloc path too */
 	if (frmp == NULL || frmlen == NULL)
 		return (EINVAL);
 
 	/* Only room for IEEE80211_NWID_LEN bytes of ssid is accounted for. */
 	if (ssidlen != (size_t)-1 && ssidlen > IEEE80211_NWID_LEN)
 		return (EINVAL);
 
 	len = ieee80211_probereq_ie_len(vap, ic);
 	if (!alloc && len > *frmlen)
 		return (ENOBUFS);
 
 	/* For HW scans we usually do not pass in the SSID as IE. */
 	if (ssidlen == (size_t)-1)
 		len -= (2 + IEEE80211_NWID_LEN);
 
 	if (alloc) {
 		frm = IEEE80211_MALLOC(len, M_80211_VAP,
 		    IEEE80211_M_WAITOK | IEEE80211_M_ZERO);
 		*frmp = frm;
 		*frmlen = len;
 	} else
 		frm = *frmp;
 
 	if (ssidlen != (size_t)-1)
 		frm = ieee80211_add_ssid(frm, ssid, ssidlen);
 	rs = ieee80211_get_suprates(ic, ic->ic_curchan);
 	frm = ieee80211_add_rates(frm, rs);
 	frm = ieee80211_add_xrates(frm, rs);
 
 	/*
 	 * Note: we can't use bss; we don't have one yet.
 	 *
 	 * So, we should announce our capabilities
 	 * in this channel mode (2g/5g), not the
 	 * channel details itself.
 	 */
 	if ((vap->iv_opmode == IEEE80211_M_IBSS) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_HT)) {
 		struct ieee80211_channel *c;
 
 		/*
 		 * Get the HT channel that we should try upgrading to.
 		 * If we can do 40MHz then this'll upgrade it appropriately.
 		 */
 		c = ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 		    vap->iv_flags_ht);
 		frm = ieee80211_add_htcap_ch(frm, vap, c);
 	}
 
 	/*
 	 * XXX TODO: need to figure out what/how to update the
 	 * VHT channel.
 	 */
 #ifdef notyet
 	if (vap->iv_flags_vht & IEEE80211_FVHT_VHT) {
 		struct ieee80211_channel *c;
 
 		c = ieee80211_ht_adjust_channel(ic, ic->ic_curchan,
 		    vap->iv_flags_ht);
 		c = ieee80211_vht_adjust_channel(ic, c, vap->iv_flags_vht);
 		frm = ieee80211_add_vhtcap_ch(frm, vap, c);
 	}
 #endif
 
 	frm = ieee80211_add_wpa(frm, vap);
 	if (vap->iv_appie_probereq != NULL)
 		frm = add_appie(frm, vap->iv_appie_probereq);
 
 	/* hand back the end of the elements when using the caller's buffer */
 	if (!alloc) {
 		*frmp = frm;
 		*frmlen = len;
 	}
 
 	return (0);
 }
 
 /*
  * Build and transmit a probe request.
  *
  *	ni		node the frame is sent through
  *	sa, da, bssid	addresses placed in the 802.11 header
  *	ssid, ssidlen	ssid to probe for
  *
  * Returns EIO if the vap is in CAC state, ENOMEM if no mbuf could be
  * obtained, and otherwise the result of ieee80211_raw_output().
  * A reference on vap->iv_bss is held for the duration of the call and
  * dropped on every return path.  The reference taken on ni is dropped
  * here only on the early error paths; once the frame has been handed
  * to ieee80211_raw_output() it is not released by this function.
  */
 int
 ieee80211_send_probereq(struct ieee80211_node *ni,
 	const uint8_t sa[IEEE80211_ADDR_LEN],
 	const uint8_t da[IEEE80211_ADDR_LEN],
 	const uint8_t bssid[IEEE80211_ADDR_LEN],
 	const uint8_t *ssid, size_t ssidlen)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_node *bss;
 	const struct ieee80211_txparam *tp;
 	struct ieee80211_bpf_params params;
 	struct mbuf *m;
 	uint8_t *frm;
 	uint32_t frmlen;
 	int ret;
 
 	bss = ieee80211_ref_node(vap->iv_bss);
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, ni,
 		    "block %s frame in CAC state", "probe request");
 		vap->iv_stats.is_tx_badstate++;
 		ieee80211_free_node(bss);
 		return EIO;		/* XXX */
 	}
 
 	/*
 	 * Hold a reference on the node so it doesn't go away until after
 	 * the xmit is complete all the way in the driver.  On error we
 	 * will remove our reference.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 		"ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 		__func__, __LINE__,
 		ni, ether_sprintf(ni->ni_macaddr),
 		ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	/* See comments above for entire frame format. */
 	frmlen = ieee80211_probereq_ie_len(vap, ic);
 	m = ieee80211_getmgtframe(&frm,
 	    ic->ic_headroom + sizeof(struct ieee80211_frame), frmlen);
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(ni);
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	/* fill in the elements; frm is advanced past the last one written */
 	ret = ieee80211_probereq_ie(vap, ic, &frm, &frmlen, ssid, ssidlen,
 	    false);
 	KASSERT(ret == 0,
 	    ("%s: ieee80211_probereq_ie failed: %d\n", __func__, ret));
 
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 	KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ieee80211_frame),
 	    ("leading space %zd", M_LEADINGSPACE(m)));
 	M_PREPEND(m, sizeof(struct ieee80211_frame), IEEE80211_M_NOWAIT);
 	if (m == NULL) {
 		/* NB: cannot happen */
 		ieee80211_free_node(ni);
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 	ieee80211_send_setup(ni, m,
 	     IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_PROBE_REQ,
 	     IEEE80211_NONQOS_TID, sa, da, bssid);
 	/* XXX power management? */
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	M_WME_SETAC(m, WME_AC_BE);
 
 	IEEE80211_NODE_STAT(ni, tx_probereq);
 	IEEE80211_NODE_STAT(ni, tx_mgmt);
 
 	/* NB: a "%.*s" precision must be an int, ssidlen is a size_t */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS,
 	    "send probe req on channel %u bssid %s sa %6D da %6D ssid \"%.*s\"\n",
 	    ieee80211_chan2ieee(ic, ic->ic_curchan),
 	    ether_sprintf(bssid),
 	    sa, ":",
 	    da, ":",
 	    (int)ssidlen, ssid);
 
 	memset(&params, 0, sizeof(params));
 	params.ibp_pri = M_WME_GETAC(m);
 	tp = &vap->iv_txparms[ieee80211_chan2mode(ic->ic_curchan)];
 	params.ibp_rate0 = tp->mgmtrate;
 	/* group addressed probes are sent once and not acknowledged */
 	if (IEEE80211_IS_MULTICAST(da)) {
 		params.ibp_flags |= IEEE80211_BPF_NOACK;
 		params.ibp_try0 = 1;
 	} else
 		params.ibp_try0 = tp->maxretry;
 	params.ibp_power = ni->ni_txpower;
 	ret = ieee80211_raw_output(vap, ni, m, &params);
 	IEEE80211_TX_UNLOCK(ic);
 	ieee80211_free_node(bss);
 	return (ret);
 }
 
 /*
  * Compute the capability information field for management frames
  * sent by vap on channel chan.  Not for use in station mode.
  */
 uint16_t
 ieee80211_getcapinfo(struct ieee80211vap *vap, struct ieee80211_channel *chan)
 {
 	uint16_t capinfo;
 
 	KASSERT(vap->iv_opmode != IEEE80211_M_STA, ("station mode"));
 
 	/* the operating mode selects the ESS/IBSS bit, if any */
 	switch (vap->iv_opmode) {
 	case IEEE80211_M_HOSTAP:
 		capinfo = IEEE80211_CAPINFO_ESS;
 		break;
 	case IEEE80211_M_IBSS:
 		capinfo = IEEE80211_CAPINFO_IBSS;
 		break;
 	default:
 		capinfo = 0;
 		break;
 	}
 
 	if (vap->iv_flags & IEEE80211_F_PRIVACY)
 		capinfo |= IEEE80211_CAPINFO_PRIVACY;
 	/* short preamble is only advertised on 2GHz channels */
 	if (IEEE80211_IS_CHAN_2GHZ(chan) &&
 	    (vap->iv_flags & IEEE80211_F_SHPREAMBLE))
 		capinfo |= IEEE80211_CAPINFO_SHORT_PREAMBLE;
 	if (vap->iv_flags & IEEE80211_F_SHSLOT)
 		capinfo |= IEEE80211_CAPINFO_SHORT_SLOTTIME;
 	/* spectrum management needs a 5GHz channel and 11h enabled */
 	if ((vap->iv_flags & IEEE80211_F_DOTH) && IEEE80211_IS_CHAN_5GHZ(chan))
 		capinfo |= IEEE80211_CAPINFO_SPECTRUM_MGMT;
 
 	return (capinfo);
 }
 
 /*
  * Send a management frame.  The node is for the destination (or ic_bss
  * when in station mode).  Nodes other than ic_bss have their reference
  * count bumped to reflect our use for an indeterminate time.
  */
 int
 ieee80211_send_mgmt(struct ieee80211_node *ni, int type, int arg)
 {
 #define	HTFLAGS (IEEE80211_NODE_HT | IEEE80211_NODE_HTCOMPAT)
 #define	senderr(_x, _v)	do { vap->iv_stats._v++; ret = _x; goto bad; } while (0)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_node *bss = vap->iv_bss;
 	struct ieee80211_bpf_params params;
 	struct mbuf *m;
 	uint8_t *frm;
 	uint16_t capinfo;
 	int has_challenge, is_shared_key, ret, status;
 
 	KASSERT(ni != NULL, ("null node"));
 
 	/*
 	 * Hold a reference on the node so it doesn't go away until after
 	 * the xmit is complete all the way in the driver.  On error we
 	 * will remove our reference.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 		"ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 		__func__, __LINE__,
 		ni, ether_sprintf(ni->ni_macaddr),
 		ieee80211_node_refcnt(ni)+1);
 	ieee80211_ref_node(ni);
 
 	memset(&params, 0, sizeof(params));
 	switch (type) {
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 		status = arg >> 16;
 		arg &= 0xffff;
 		has_challenge = ((arg == IEEE80211_AUTH_SHARED_CHALLENGE ||
 		    arg == IEEE80211_AUTH_SHARED_RESPONSE) &&
 		    ni->ni_challenge != NULL);
 
 		/*
 		 * Deduce whether we're doing open authentication or
 		 * shared key authentication.  We do the latter if
 		 * we're in the middle of a shared key authentication
 		 * handshake or if we're initiating an authentication
 		 * request and configured to use shared key.
 		 */
 		is_shared_key = has_challenge ||
 		     arg >= IEEE80211_AUTH_SHARED_RESPONSE ||
 		     (arg == IEEE80211_AUTH_SHARED_REQUEST &&
 		      bss->ni_authmode == IEEE80211_AUTH_SHARED);
 
 		m = ieee80211_getmgtframe(&frm,
 			  ic->ic_headroom + sizeof(struct ieee80211_frame),
 			  3 * sizeof(uint16_t)
 			+ (has_challenge && status == IEEE80211_STATUS_SUCCESS ?
 				sizeof(uint16_t)+IEEE80211_CHALLENGE_LEN : 0));
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 
 		((uint16_t *)frm)[0] =
 		    (is_shared_key) ? htole16(IEEE80211_AUTH_ALG_SHARED)
 		                    : htole16(IEEE80211_AUTH_ALG_OPEN);
 		((uint16_t *)frm)[1] = htole16(arg);	/* sequence number */
 		((uint16_t *)frm)[2] = htole16(status);/* status */
 
 		if (has_challenge && status == IEEE80211_STATUS_SUCCESS) {
 			((uint16_t *)frm)[3] =
 			    htole16((IEEE80211_CHALLENGE_LEN << 8) |
 			    IEEE80211_ELEMID_CHALLENGE);
 			memcpy(&((uint16_t *)frm)[4], ni->ni_challenge,
 			    IEEE80211_CHALLENGE_LEN);
 			m->m_pkthdr.len = m->m_len =
 				4 * sizeof(uint16_t) + IEEE80211_CHALLENGE_LEN;
 			if (arg == IEEE80211_AUTH_SHARED_RESPONSE) {
 				IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni,
 				    "request encrypt frame (%s)", __func__);
 				/* mark frame for encryption */
 				params.ibp_flags |= IEEE80211_BPF_CRYPTO;
 			}
 		} else
 			m->m_pkthdr.len = m->m_len = 3 * sizeof(uint16_t);
 
 		/* XXX not right for shared key */
 		if (status == IEEE80211_STATUS_SUCCESS)
 			IEEE80211_NODE_STAT(ni, tx_auth);
 		else
 			IEEE80211_NODE_STAT(ni, tx_auth_fail);
 
 		if (vap->iv_opmode == IEEE80211_M_STA)
 			ieee80211_add_callback(m, ieee80211_tx_mgt_cb,
 				(void *) vap->iv_state);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni,
 		    "send station deauthenticate (reason: %d (%s))", arg,
 		    ieee80211_reason_to_string(arg));
 		m = ieee80211_getmgtframe(&frm,
 			ic->ic_headroom + sizeof(struct ieee80211_frame),
 			sizeof(uint16_t));
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 		*(uint16_t *)frm = htole16(arg);	/* reason */
 		m->m_pkthdr.len = m->m_len = sizeof(uint16_t);
 
 		IEEE80211_NODE_STAT(ni, tx_deauth);
 		IEEE80211_NODE_STAT_SET(ni, tx_deauth_code, arg);
 
 		ieee80211_node_unauthorize(ni);		/* port closed */
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 		/*
 		 * asreq frame format
 		 *	[2] capability information
 		 *	[2] listen interval
 		 *	[6*] current AP address (reassoc only)
 		 *	[tlv] ssid
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 *	[4] power capability (optional)
 		 *	[28] supported channels (optional)
 		 *	[tlv] HT capabilities
 		 *	[tlv] VHT capabilities
 		 *	[tlv] WME (optional)
 		 *	[tlv] Vendor OUI HT capabilities (optional)
 		 *	[tlv] Atheros capabilities (if negotiated)
 		 *	[tlv] AppIE's (optional)
 		 */
 		m = ieee80211_getmgtframe(&frm,
 			 ic->ic_headroom + sizeof(struct ieee80211_frame),
 			 sizeof(uint16_t)
 		       + sizeof(uint16_t)
 		       + IEEE80211_ADDR_LEN
 		       + 2 + IEEE80211_NWID_LEN
 		       + 2 + IEEE80211_RATE_SIZE
 		       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 		       + 4
 		       + 2 + 26
 		       + sizeof(struct ieee80211_wme_info)
 		       + sizeof(struct ieee80211_ie_htcap)
 		       + sizeof(struct ieee80211_ie_vhtcap)
 		       + 4 + sizeof(struct ieee80211_ie_htcap)
 #ifdef IEEE80211_SUPPORT_SUPERG
 		       + sizeof(struct ieee80211_ath_ie)
 #endif
 		       + (vap->iv_appie_wpa != NULL ?
 				vap->iv_appie_wpa->ie_len : 0)
 		       + (vap->iv_appie_assocreq != NULL ?
 				vap->iv_appie_assocreq->ie_len : 0)
 		);
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 
 		KASSERT(vap->iv_opmode == IEEE80211_M_STA,
 		    ("wrong mode %u", vap->iv_opmode));
 		capinfo = IEEE80211_CAPINFO_ESS;
 		if (vap->iv_flags & IEEE80211_F_PRIVACY)
 			capinfo |= IEEE80211_CAPINFO_PRIVACY;
 		/*
 		 * NB: Some 11a AP's reject the request when
 		 *     short preamble is set.
 		 */
 		if ((vap->iv_flags & IEEE80211_F_SHPREAMBLE) &&
 		    IEEE80211_IS_CHAN_2GHZ(ic->ic_curchan))
 			capinfo |= IEEE80211_CAPINFO_SHORT_PREAMBLE;
 		if (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) &&
 		    (ic->ic_caps & IEEE80211_C_SHSLOT))
 			capinfo |= IEEE80211_CAPINFO_SHORT_SLOTTIME;
 		if ((ni->ni_capinfo & IEEE80211_CAPINFO_SPECTRUM_MGMT) &&
 		    (vap->iv_flags & IEEE80211_F_DOTH))
 			capinfo |= IEEE80211_CAPINFO_SPECTRUM_MGMT;
 		*(uint16_t *)frm = htole16(capinfo);
 		frm += 2;
 
 		KASSERT(bss->ni_intval != 0, ("beacon interval is zero!"));
 		*(uint16_t *)frm = htole16(howmany(ic->ic_lintval,
 						    bss->ni_intval));
 		frm += 2;
 
 		if (type == IEEE80211_FC0_SUBTYPE_REASSOC_REQ) {
 			IEEE80211_ADDR_COPY(frm, bss->ni_bssid);
 			frm += IEEE80211_ADDR_LEN;
 		}
 
 		frm = ieee80211_add_ssid(frm, ni->ni_essid, ni->ni_esslen);
 		frm = ieee80211_add_rates(frm, &ni->ni_rates);
 		frm = ieee80211_add_rsn(frm, vap);
 		frm = ieee80211_add_xrates(frm, &ni->ni_rates);
 		if (capinfo & IEEE80211_CAPINFO_SPECTRUM_MGMT) {
 			frm = ieee80211_add_powercapability(frm,
 			    ic->ic_curchan);
 			frm = ieee80211_add_supportedchannels(frm, ic);
 		}
 
 		/*
 		 * Check the channel - we may be using an 11n NIC with an
 		 * 11n capable station, but we're configured to be an 11b
 		 * channel.
 		 */
 		if ((vap->iv_flags_ht & IEEE80211_FHT_HT) &&
 		    IEEE80211_IS_CHAN_HT(ni->ni_chan) &&
 		    ni->ni_ies.htcap_ie != NULL &&
 		    ni->ni_ies.htcap_ie[0] == IEEE80211_ELEMID_HTCAP) {
 			frm = ieee80211_add_htcap(frm, ni);
 		}
 
 		if ((vap->iv_flags_vht & IEEE80211_FVHT_VHT) &&
 		    IEEE80211_IS_CHAN_VHT(ni->ni_chan) &&
 		    ni->ni_ies.vhtcap_ie != NULL &&
 		    ni->ni_ies.vhtcap_ie[0] == IEEE80211_ELEMID_VHT_CAP) {
 			frm = ieee80211_add_vhtcap(frm, ni);
 		}
 
 		frm = ieee80211_add_wpa(frm, vap);
 		if ((vap->iv_flags & IEEE80211_F_WME) &&
 		    ni->ni_ies.wme_ie != NULL)
 			frm = ieee80211_add_wme_info(frm, &ic->ic_wme, ni);
 
 		/*
 		 * Same deal - only send HT info if we're on an 11n
 		 * capable channel.
 		 */
 		if ((vap->iv_flags_ht & IEEE80211_FHT_HT) &&
 		    IEEE80211_IS_CHAN_HT(ni->ni_chan) &&
 		    ni->ni_ies.htcap_ie != NULL &&
 		    ni->ni_ies.htcap_ie[0] == IEEE80211_ELEMID_VENDOR) {
 			frm = ieee80211_add_htcap_vendor(frm, ni);
 		}
 #ifdef IEEE80211_SUPPORT_SUPERG
 		if (IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS)) {
 			frm = ieee80211_add_ath(frm, 
 				IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS),
 				((vap->iv_flags & IEEE80211_F_WPA) == 0 &&
 				 ni->ni_authmode != IEEE80211_AUTH_8021X) ?
 				vap->iv_def_txkey : IEEE80211_KEYIX_NONE);
 		}
 #endif /* IEEE80211_SUPPORT_SUPERG */
 		if (vap->iv_appie_assocreq != NULL)
 			frm = add_appie(frm, vap->iv_appie_assocreq);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 
 		ieee80211_add_callback(m, ieee80211_tx_mgt_cb,
 			(void *) vap->iv_state);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 		/*
 		 * asresp frame format
 		 *	[2] capability information
 		 *	[2] status
 		 *	[2] association ID
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 *	[tlv] HT capabilities (standard, if STA enabled)
 		 *	[tlv] HT information (standard, if STA enabled)
 		 *	[tlv] VHT capabilities (standard, if STA enabled)
 		 *	[tlv] VHT information (standard, if STA enabled)
 		 *	[tlv] WME (if configured and STA enabled)
 		 *	[tlv] HT capabilities (vendor OUI, if STA enabled)
 		 *	[tlv] HT information (vendor OUI, if STA enabled)
 		 *	[tlv] Atheros capabilities (if STA enabled)
 		 *	[tlv] AppIE's (optional)
 		 */
 		m = ieee80211_getmgtframe(&frm,
 			 ic->ic_headroom + sizeof(struct ieee80211_frame),
 			 sizeof(uint16_t)
 		       + sizeof(uint16_t)
 		       + sizeof(uint16_t)
 		       + 2 + IEEE80211_RATE_SIZE
 		       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 		       + sizeof(struct ieee80211_ie_htcap) + 4
 		       + sizeof(struct ieee80211_ie_htinfo) + 4
 		       + sizeof(struct ieee80211_ie_vhtcap)
 		       + sizeof(struct ieee80211_ie_vht_operation)
 		       + sizeof(struct ieee80211_wme_param)
 #ifdef IEEE80211_SUPPORT_SUPERG
 		       + sizeof(struct ieee80211_ath_ie)
 #endif
 		       + (vap->iv_appie_assocresp != NULL ?
 				vap->iv_appie_assocresp->ie_len : 0)
 		);
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 
 		capinfo = ieee80211_getcapinfo(vap, bss->ni_chan);
 		*(uint16_t *)frm = htole16(capinfo);
 		frm += 2;
 
 		*(uint16_t *)frm = htole16(arg);	/* status */
 		frm += 2;
 
 		if (arg == IEEE80211_STATUS_SUCCESS) {
 			*(uint16_t *)frm = htole16(ni->ni_associd);
 			IEEE80211_NODE_STAT(ni, tx_assoc);
 		} else
 			IEEE80211_NODE_STAT(ni, tx_assoc_fail);
 		frm += 2;
 
 		frm = ieee80211_add_rates(frm, &ni->ni_rates);
 		frm = ieee80211_add_xrates(frm, &ni->ni_rates);
 		/* NB: respond according to what we received */
 		if ((ni->ni_flags & HTFLAGS) == IEEE80211_NODE_HT) {
 			frm = ieee80211_add_htcap(frm, ni);
 			frm = ieee80211_add_htinfo(frm, ni);
 		}
 		if ((vap->iv_flags & IEEE80211_F_WME) &&
 		    ni->ni_ies.wme_ie != NULL)
 			frm = ieee80211_add_wme_param(frm, &ic->ic_wme,
 			    !! (vap->iv_flags_ext & IEEE80211_FEXT_UAPSD));
 		if ((ni->ni_flags & HTFLAGS) == HTFLAGS) {
 			frm = ieee80211_add_htcap_vendor(frm, ni);
 			frm = ieee80211_add_htinfo_vendor(frm, ni);
 		}
 		if (ni->ni_flags & IEEE80211_NODE_VHT) {
 			frm = ieee80211_add_vhtcap(frm, ni);
 			frm = ieee80211_add_vhtinfo(frm, ni);
 		}
 #ifdef IEEE80211_SUPPORT_SUPERG
 		if (IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS))
 			frm = ieee80211_add_ath(frm, 
 				IEEE80211_ATH_CAP(vap, ni, IEEE80211_F_ATHEROS),
 				((vap->iv_flags & IEEE80211_F_WPA) == 0 &&
 				 ni->ni_authmode != IEEE80211_AUTH_8021X) ?
 				vap->iv_def_txkey : IEEE80211_KEYIX_NONE);
 #endif /* IEEE80211_SUPPORT_SUPERG */
 		if (vap->iv_appie_assocresp != NULL)
 			frm = add_appie(frm, vap->iv_appie_assocresp);
 		m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ASSOC, ni,
 		    "send station disassociate (reason: %d (%s))", arg,
 		    ieee80211_reason_to_string(arg));
 		m = ieee80211_getmgtframe(&frm,
 			ic->ic_headroom + sizeof(struct ieee80211_frame),
 			sizeof(uint16_t));
 		if (m == NULL)
 			senderr(ENOMEM, is_tx_nobuf);
 		*(uint16_t *)frm = htole16(arg);	/* reason */
 		m->m_pkthdr.len = m->m_len = sizeof(uint16_t);
 
 		IEEE80211_NODE_STAT(ni, tx_disassoc);
 		IEEE80211_NODE_STAT_SET(ni, tx_disassoc_code, arg);
 		break;
 
 	default:
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ANY, ni,
 		    "invalid mgmt frame type %u", type);
 		senderr(EINVAL, is_tx_unknownmgt);
 		/* NOTREACHED */
 	}
 
 	/* NB: force non-ProbeResp frames to the highest queue */
 	params.ibp_pri = WME_AC_VO;
 	params.ibp_rate0 = bss->ni_txparms->mgmtrate;
 	/* NB: we know all frames are unicast */
 	params.ibp_try0 = bss->ni_txparms->maxretry;
 	params.ibp_power = bss->ni_txpower;
 	return ieee80211_mgmt_output(ni, m, type, &params);
 bad:
 	ieee80211_free_node(ni);
 	return ret;
 #undef senderr
 #undef HTFLAGS
 }
 
 /*
  * Return an mbuf with a probe response frame in it.
  * Space is left to prepend an 802.11 header at the
  * front but it's left to the caller to fill in.
  *
  * bss supplies the vap, channel, SSID and beacon interval that are
  * advertised.  legacy is compared against IEEE80211_SEND_LEGACY_11B;
  * when it matches, the HT, VHT, vendor HT and Atheros elements are
  * left out.
  *
  * Returns NULL (after bumping is_tx_nobuf) when no buffer is available.
  */
 struct mbuf *
 ieee80211_alloc_proberesp(struct ieee80211_node *bss, int legacy)
 {
 	struct ieee80211vap *vap = bss->ni_vap;
 	struct ieee80211com *ic = bss->ni_ic;
 	const struct ieee80211_rateset *rs;
 	struct mbuf *m;
 	uint16_t capinfo;
 	uint8_t *frm;
 
 	/*
 	 * probe response frame format
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] capability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[tlv] parameter set (FH/DS)
 	 *	[tlv] parameter set (IBSS)
 	 *	[tlv] country (optional)
 	 *	[3] power control (optional)
 	 *	[5] channel switch announcement (CSA) (optional)
 	 *	[tlv] extended rate phy (ERP)
 	 *	[tlv] extended supported rates
 	 *	[tlv] RSN (optional)
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	[tlv] VHT capabilities
 	 *	[tlv] VHT information
 	 *	[tlv] WPA (optional)
 	 *	[tlv] WME (optional)
 	 *	[tlv] Vendor OUI HT capabilities (optional)
 	 *	[tlv] Vendor OUI HT information (optional)
 	 *	[tlv] Atheros capabilities
 	 *	[tlv] AppIE's (optional)
 	 *	[tlv] Mesh ID (MBSS)
 	 *	[tlv] Mesh Conf (MBSS)
 	 *
 	 * NB: a quiet element may also be added after the CSA element
 	 *     (see below); the size request accounts for it.
 	 */
 	m = ieee80211_getmgtframe(&frm,
 		 ic->ic_headroom + sizeof(struct ieee80211_frame),
 		 8
 	       + sizeof(uint16_t)
 	       + sizeof(uint16_t)
 	       + 2 + IEEE80211_NWID_LEN
 	       + 2 + IEEE80211_RATE_SIZE
 	       + 7	/* max(7,3) */
 	       + IEEE80211_COUNTRY_MAX_SIZE
 	       + 3
 	       + sizeof(struct ieee80211_csa_ie)
 	       + sizeof(struct ieee80211_quiet_ie)
 	       + 3
 	       + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 	       + sizeof(struct ieee80211_ie_wpa)
 	       + sizeof(struct ieee80211_ie_htcap)
 	       + sizeof(struct ieee80211_ie_htinfo)
 	       + sizeof(struct ieee80211_ie_wpa)
 	       + sizeof(struct ieee80211_wme_param)
 	       + 4 + sizeof(struct ieee80211_ie_htcap)
 	       + 4 + sizeof(struct ieee80211_ie_htinfo)
 	       +  sizeof(struct ieee80211_ie_vhtcap)
 	       +  sizeof(struct ieee80211_ie_vht_operation)
 #ifdef IEEE80211_SUPPORT_SUPERG
 	       + sizeof(struct ieee80211_ath_ie)
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 	       + 2 + IEEE80211_MESHID_LEN
 	       + sizeof(struct ieee80211_meshconf_ie)
 #endif
 	       + (vap->iv_appie_proberesp != NULL ?
 			vap->iv_appie_proberesp->ie_len : 0)
 	);
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		return NULL;
 	}
 
 	/* Fixed fields: timestamp, beacon interval, capabilities. */
 	memset(frm, 0, 8);	/* timestamp should be filled later */
 	frm += 8;
 	*(uint16_t *)frm = htole16(bss->ni_intval);
 	frm += 2;
 	capinfo = ieee80211_getcapinfo(vap, bss->ni_chan);
 	*(uint16_t *)frm = htole16(capinfo);
 	frm += 2;
 
 	frm = ieee80211_add_ssid(frm, bss->ni_essid, bss->ni_esslen);
 	/* Rates advertised are the supported set for the bss channel. */
 	rs = ieee80211_get_suprates(ic, bss->ni_chan);
 	frm = ieee80211_add_rates(frm, rs);
 
 	/* Exactly one PHY parameter set: FH (7 bytes) or DS (3 bytes). */
 	if (IEEE80211_IS_CHAN_FHSS(bss->ni_chan)) {
 		*frm++ = IEEE80211_ELEMID_FHPARMS;
 		*frm++ = 5;
 		*frm++ = bss->ni_fhdwell & 0x00ff;
 		*frm++ = (bss->ni_fhdwell >> 8) & 0x00ff;
 		*frm++ = IEEE80211_FH_CHANSET(
 		    ieee80211_chan2ieee(ic, bss->ni_chan));
 		*frm++ = IEEE80211_FH_CHANPAT(
 		    ieee80211_chan2ieee(ic, bss->ni_chan));
 		*frm++ = bss->ni_fhindex;
 	} else {
 		*frm++ = IEEE80211_ELEMID_DSPARMS;
 		*frm++ = 1;
 		*frm++ = ieee80211_chan2ieee(ic, bss->ni_chan);
 	}
 
 	if (vap->iv_opmode == IEEE80211_M_IBSS) {
 		*frm++ = IEEE80211_ELEMID_IBSSPARMS;
 		*frm++ = 2;
 		*frm++ = 0; *frm++ = 0;		/* TODO: ATIM window */
 	}
 	/* Country element goes out when either 11h or 11d is enabled. */
 	if ((vap->iv_flags & IEEE80211_F_DOTH) ||
 	    (vap->iv_flags_ext & IEEE80211_FEXT_DOTD))
 		frm = ieee80211_add_countryie(frm, ic);
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_5GHZ(bss->ni_chan))
 			frm = ieee80211_add_powerconstraint(frm, vap);
 		if (ic->ic_flags & IEEE80211_F_CSAPENDING)
 			frm = ieee80211_add_csa(frm, vap);
 	}
 	/* Second 11h test: quiet element, only on DFS channels with DFS on. */
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS)) {
 			if (vap->iv_quiet)
 				frm = ieee80211_add_quiet(frm, vap, 0);
 		}
 	}
 	if (IEEE80211_IS_CHAN_ANYG(bss->ni_chan))
 		frm = ieee80211_add_erp(frm, vap);
 	frm = ieee80211_add_xrates(frm, rs);
 	frm = ieee80211_add_rsn(frm, vap);
 	/*
 	 * NB: legacy 11b clients do not get certain ie's.
 	 *     The caller identifies such clients by passing
 	 *     a token in legacy to us.  Could expand this to be
 	 *     any legacy client for stuff like HT ie's.
 	 */
 	if (IEEE80211_IS_CHAN_HT(bss->ni_chan) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B) {
 		frm = ieee80211_add_htcap(frm, bss);
 		frm = ieee80211_add_htinfo(frm, bss);
 	}
 	if (IEEE80211_IS_CHAN_VHT(bss->ni_chan) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B) {
 		frm = ieee80211_add_vhtcap(frm, bss);
 		frm = ieee80211_add_vhtinfo(frm, bss);
 	}
 	frm = ieee80211_add_wpa(frm, vap);
 	if (vap->iv_flags & IEEE80211_F_WME)
 		frm = ieee80211_add_wme_param(frm, &ic->ic_wme,
 		    !! (vap->iv_flags_ext & IEEE80211_FEXT_UAPSD));
 	/* Vendor-OUI HT elements additionally require HTCOMPAT on the vap. */
 	if (IEEE80211_IS_CHAN_HT(bss->ni_chan) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B) {
 		frm = ieee80211_add_htcap_vendor(frm, bss);
 		frm = ieee80211_add_htinfo_vendor(frm, bss);
 	}
 #ifdef IEEE80211_SUPPORT_SUPERG
 	if ((vap->iv_flags & IEEE80211_F_ATHEROS) &&
 	    legacy != IEEE80211_SEND_LEGACY_11B)
 		frm = ieee80211_add_athcaps(frm, bss);
 #endif
 	if (vap->iv_appie_proberesp != NULL)
 		frm = add_appie(frm, vap->iv_appie_proberesp);
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS) {
 		frm = ieee80211_add_meshid(frm, vap);
 		frm = ieee80211_add_meshconf(frm, vap);
 	}
 #endif
 	/* Frame length is however far frm advanced from the data start. */
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 
 	return m;
 }
 
 /*
  * Send a probe response frame to the specified mac address.
  * This does not go through the normal mgt frame api so we
  * can specify the destination address and re-use the bss node
  * for the sta reference.
  *
  * legacy is handed to ieee80211_alloc_proberesp() unchanged.
  *
  * Returns EIO when the vap is in CAC state, ENOMEM when the frame (or
  * the room for its 802.11 header) cannot be obtained, and otherwise the
  * result of ieee80211_raw_output().
  */
 int
 ieee80211_send_proberesp(struct ieee80211vap *vap,
 	const uint8_t da[IEEE80211_ADDR_LEN], int legacy)
 {
 	struct ieee80211_node *bss = vap->iv_bss;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct mbuf *m;
 	int ret;
 
 	if (vap->iv_state == IEEE80211_S_CAC) {
 		IEEE80211_NOTE(vap, IEEE80211_MSG_OUTPUT, bss,
 		    "block %s frame in CAC state", "probe response");
 		vap->iv_stats.is_tx_badstate++;
 		return EIO;		/* XXX */
 	}
 
 	/*
 	 * Hold a reference on the node so it doesn't go away until after
 	 * the xmit is complete all the way in the driver.  On error we
 	 * will remove our reference.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_NODE,
 	    "ieee80211_ref_node (%s:%u) %p<%s> refcnt %d\n",
 	    __func__, __LINE__, bss, ether_sprintf(bss->ni_macaddr),
 	    ieee80211_node_refcnt(bss)+1);
 	ieee80211_ref_node(bss);
 
 	m = ieee80211_alloc_proberesp(bss, legacy);
 	if (m == NULL) {
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), IEEE80211_M_NOWAIT);
 	KASSERT(m != NULL, ("no room for header"));
 	/*
 	 * The assertion above is compiled out of kernels built without
 	 * INVARIANTS.  Headroom was requested when the frame was
 	 * allocated so this should not happen, but if the prepend does
 	 * fail M_PREPEND has already freed the chain and left m NULL;
 	 * drop our node reference rather than dereference it below.
 	 */
 	if (m == NULL) {
 		vap->iv_stats.is_tx_nobuf++;
 		ieee80211_free_node(bss);
 		return ENOMEM;
 	}
 
 	IEEE80211_TX_LOCK(ic);
 	ieee80211_send_setup(bss, m,
 	     IEEE80211_FC0_TYPE_MGT | IEEE80211_FC0_SUBTYPE_PROBE_RESP,
 	     IEEE80211_NONQOS_TID, vap->iv_myaddr, da, bss->ni_bssid);
 	/* XXX power management? */
 	m->m_flags |= M_ENCAP;		/* mark encapsulated */
 
 	M_WME_SETAC(m, WME_AC_BE);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_DUMPPKTS,
 	    "send probe resp on channel %u to %s%s\n",
 	    ieee80211_chan2ieee(ic, ic->ic_curchan), ether_sprintf(da),
 	    legacy ? " <legacy>" : "");
 	IEEE80211_NODE_STAT(bss, tx_mgmt);
 
 	ret = ieee80211_raw_output(vap, bss, m, NULL);
 	IEEE80211_TX_UNLOCK(ic);
 	return (ret);
 }
 
 /*
  * Allocate and build a RTS (Request To Send) control frame.
  *
  * ra and ta are copied into the receiver and transmitter address
  * fields and dur into the duration field (little-endian).  Returns
  * the new mbuf, or NULL if none could be allocated.
  */
 struct mbuf *
 ieee80211_alloc_rts(struct ieee80211com *ic,
 	const uint8_t ra[IEEE80211_ADDR_LEN],
 	const uint8_t ta[IEEE80211_ADDR_LEN],
 	uint16_t dur)
 {
 	struct ieee80211_frame_rts *hdr;
 	struct mbuf *mb;
 
 	/* XXX honor ic_headroom */
 	mb = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA);
 	if (mb == NULL)
 		return mb;
 
 	hdr = mtod(mb, struct ieee80211_frame_rts *);
 	hdr->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_CTL |
 	    IEEE80211_FC0_SUBTYPE_RTS;
 	hdr->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 	*(u_int16_t *)hdr->i_dur = htole16(dur);
 	IEEE80211_ADDR_COPY(hdr->i_ra, ra);
 	IEEE80211_ADDR_COPY(hdr->i_ta, ta);
 
 	/* The frame is exactly one RTS header long. */
 	mb->m_len = sizeof(struct ieee80211_frame_rts);
 	mb->m_pkthdr.len = mb->m_len;
 	return mb;
 }
 
 /*
  * Allocate and build a CTS (Clear To Send) control frame.
  *
  * ra is copied into the receiver address field and dur into the
  * duration field (little-endian).  Returns the new mbuf, or NULL if
  * none could be allocated.
  */
 struct mbuf *
 ieee80211_alloc_cts(struct ieee80211com *ic,
 	const uint8_t ra[IEEE80211_ADDR_LEN], uint16_t dur)
 {
 	struct ieee80211_frame_cts *hdr;
 	struct mbuf *mb;
 
 	/* XXX honor ic_headroom */
 	mb = m_gethdr(IEEE80211_M_NOWAIT, MT_DATA);
 	if (mb == NULL)
 		return mb;
 
 	hdr = mtod(mb, struct ieee80211_frame_cts *);
 	hdr->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_CTL |
 	    IEEE80211_FC0_SUBTYPE_CTS;
 	hdr->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 	*(u_int16_t *)hdr->i_dur = htole16(dur);
 	IEEE80211_ADDR_COPY(hdr->i_ra, ra);
 
 	/* The frame is exactly one CTS header long. */
 	mb->m_len = sizeof(struct ieee80211_frame_cts);
 	mb->m_pkthdr.len = mb->m_len;
 	return mb;
 }
 
 /*
  * Wrapper for CTS/RTS frame allocation.
  *
  * Computes the duration needed to cover frame m (plus CRC) and its ACK
  * at the given rate, then builds either an RTS addressed like m or a
  * CTS carrying the vap's own address, as selected by prot.  Returns
  * whatever the underlying allocator returns (NULL on failure).
  */
 struct mbuf *
 ieee80211_alloc_prot(struct ieee80211_node *ni, const struct mbuf *m,
     uint8_t rate, int prot)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	const struct ieee80211_frame *hdr;
 	int framelen, shortpre;
 	uint16_t dur;
 
 	KASSERT(prot == IEEE80211_PROT_RTSCTS ||
 	    prot == IEEE80211_PROT_CTSONLY,
 	    ("wrong protection type %d", prot));
 
 	hdr = mtod(m, const struct ieee80211_frame *);
 	framelen = m->m_pkthdr.len + IEEE80211_CRC_LEN;
 	shortpre = (vap->iv_flags & IEEE80211_F_SHPREAMBLE) != 0;
 
 	/* Time for the protected frame itself plus the ACK that follows. */
 	dur = ieee80211_compute_duration(ic->ic_rt, framelen, rate, shortpre)
 	    + ieee80211_ack_duration(ic->ic_rt, rate, shortpre);
 
 	if (prot != IEEE80211_PROT_RTSCTS)
 		return (ieee80211_alloc_cts(ic, vap->iv_myaddr, dur));
 
 	/* NB: CTS is the same size as an ACK */
 	dur += ieee80211_ack_duration(ic->ic_rt, rate, shortpre);
 	return (ieee80211_alloc_rts(ic, hdr->i_addr1, hdr->i_addr2, dur));
 }
 
 static void
 ieee80211_tx_mgt_timeout(void *arg)
 {
 	struct ieee80211vap *vap = arg;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 	    "vap %p mode %s state %s flags %#x & %#x\n", vap,
 	    ieee80211_opmode_name[vap->iv_opmode],
 	    ieee80211_state_name[vap->iv_state],
 	    vap->iv_ic->ic_flags, IEEE80211_F_SCAN);
 
 	IEEE80211_LOCK(vap->iv_ic);
 	if (vap->iv_state != IEEE80211_S_INIT &&
 	    (vap->iv_ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		/*
 		 * NB: it's safe to specify a timeout as the reason here;
 		 *     it'll only be used in the right state.
 		 */
 		ieee80211_new_state_locked(vap, IEEE80211_S_SCAN,
 			IEEE80211_SCAN_FAIL_TIMEOUT);
 	}
 	IEEE80211_UNLOCK(vap->iv_ic);
 }
 
 /*
  * This is the callback set on net80211-sourced transmitted
  * authentication request frames.
  *
  * This does a couple of things:
  *
  * + If the frame transmitted was a success, it schedules a future
  *   event which will transition the interface to scan.
  *   If a state transition _then_ occurs before that event occurs,
  *   said state transition will cancel this callout.
  *
  * + If the frame transmit was a failure, it immediately schedules
  *   the transition back to scan.
  */
 static void
 ieee80211_tx_mgt_cb(struct ieee80211_node *ni, void *arg, int status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	enum ieee80211_state ostate = (enum ieee80211_state)(uintptr_t)arg;
 
 	/*
 	 * Frame transmit completed; arrange timer callback.  If
 	 * transmit was successfully we wait for response.  Otherwise
 	 * we arrange an immediate callback instead of doing the
 	 * callback directly since we don't know what state the driver
 	 * is in (e.g. what locks it is holding).  This work should
 	 * not be too time-critical and not happen too often so the
 	 * added overhead is acceptable.
 	 *
 	 * XXX what happens if !acked but response shows up before callback?
 	 */
 	if (vap->iv_state == ostate) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 		    "ni %p mode %s state %s arg %p status %d\n", ni,
 		    ieee80211_opmode_name[vap->iv_opmode],
 		    ieee80211_state_name[vap->iv_state], arg, status);
 
 		callout_reset(&vap->iv_mgtsend,
 			status == 0 ? IEEE80211_TRANS_WAIT*hz : 0,
 			ieee80211_tx_mgt_timeout, vap);
 	}
 }
 
 /*
  * Write the body of a beacon frame for ni's vap starting at frm, which
  * points into mbuf m, and reset and refill vap->iv_bcn_off with
  * pointers to selected elements inside the frame.  On return the mbuf
  * lengths are set to the distance from the start of the mbuf data to
  * the end of the last element written.
  *
  * NOTE(review): nothing here checks the space remaining in m; the
  * caller is presumably responsible for sizing the buffer -- confirm.
  */
 static void
 ieee80211_beacon_construct(struct mbuf *m, uint8_t *frm,
 	struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_rateset *rs = &ni->ni_rates;
 	uint16_t capinfo;
 
 	/*
 	 * beacon frame format
 	 *
 	 * TODO: update to 802.11-2012; a lot of stuff has changed;
 	 * vendor extensions should be at the end, etc.
 	 *
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] capability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[3] parameter set (DS)
 	 *	[8] CF parameter set (optional)
 	 *	[tlv] parameter set (IBSS/TIM)
 	 *	[tlv] country (optional)
 	 *	[3] power control (optional)
 	 *	[5] channel switch announcement (CSA) (optional)
 	 * XXX TODO: Quiet
 	 * XXX TODO: IBSS DFS
 	 * XXX TODO: TPC report
 	 *	[tlv] extended rate phy (ERP)
 	 *	[tlv] extended supported rates
 	 *	[tlv] RSN parameters
 	 * XXX TODO: BSSLOAD
 	 * (XXX EDCA parameter set, QoS capability?)
 	 * XXX TODO: AP channel report
 	 *
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	XXX TODO: 20/40 BSS coexistence
 	 * Mesh:
 	 * XXX TODO: Meshid
 	 * XXX TODO: mesh config
 	 * XXX TODO: mesh awake window
 	 * XXX TODO: beacon timing (mesh, etc)
 	 * XXX TODO: MCCAOP Advertisement Overview
 	 * XXX TODO: MCCAOP Advertisement
 	 * XXX TODO: Mesh channel switch parameters
 	 * VHT:
 	 * XXX TODO: VHT capabilities
 	 * XXX TODO: VHT operation
 	 * XXX TODO: VHT transmit power envelope
 	 * XXX TODO: channel switch wrapper element
 	 * XXX TODO: extended BSS load element
 	 *
 	 * XXX Vendor-specific OIDs (e.g. Atheros)
 	 *	[tlv] WPA parameters
 	 *	[tlv] WME parameters
 	 *	[tlv] Vendor OUI HT capabilities (optional)
 	 *	[tlv] Vendor OUI HT information (optional)
 	 *	[tlv] Atheros capabilities (optional)
 	 *	[tlv] TDMA parameters (optional)
 	 *	[tlv] Mesh ID (MBSS)
 	 *	[tlv] Mesh Conf (MBSS)
 	 *	[tlv] application data (optional)
 	 */
 
 	/* Forget all previously recorded element locations. */
 	memset(bo, 0, sizeof(*bo));
 
 	memset(frm, 0, 8);	/* XXX timestamp is set by hardware/driver */
 	frm += 8;
 	*(uint16_t *)frm = htole16(ni->ni_intval);
 	frm += 2;
 	capinfo = ieee80211_getcapinfo(vap, ni->ni_chan);
 	bo->bo_caps = (uint16_t *)frm;
 	*(uint16_t *)frm = htole16(capinfo);
 	frm += 2;
 	/* SSID element; a zero-length one is sent when HIDESSID is set. */
 	*frm++ = IEEE80211_ELEMID_SSID;
 	if ((vap->iv_flags & IEEE80211_F_HIDESSID) == 0) {
 		*frm++ = ni->ni_esslen;
 		memcpy(frm, ni->ni_essid, ni->ni_esslen);
 		frm += ni->ni_esslen;
 	} else
 		*frm++ = 0;
 	frm = ieee80211_add_rates(frm, rs);
 	/* DS parameter set is written for every non-FHSS channel. */
 	if (!IEEE80211_IS_CHAN_FHSS(ni->ni_chan)) {
 		*frm++ = IEEE80211_ELEMID_DSPARMS;
 		*frm++ = 1;
 		*frm++ = ieee80211_chan2ieee(ic, ni->ni_chan);
 	}
 	if (ic->ic_flags & IEEE80211_F_PCF) {
 		bo->bo_cfp = frm;
 		frm = ieee80211_add_cfparms(frm, ic);
 	}
 	/* bo_tim marks the IBSS parameter set or TIM, whichever is written. */
 	bo->bo_tim = frm;
 	if (vap->iv_opmode == IEEE80211_M_IBSS) {
 		*frm++ = IEEE80211_ELEMID_IBSSPARMS;
 		*frm++ = 2;
 		*frm++ = 0; *frm++ = 0;		/* TODO: ATIM window */
 		bo->bo_tim_len = 0;
 	} else if (vap->iv_opmode == IEEE80211_M_HOSTAP ||
 	    vap->iv_opmode == IEEE80211_M_MBSS) {
 		/* TIM IE is the same for Mesh and Hostap */
 		struct ieee80211_tim_ie *tie = (struct ieee80211_tim_ie *) frm;
 
 		tie->tim_ie = IEEE80211_ELEMID_TIM;
 		tie->tim_len = 4;	/* length */
 		tie->tim_count = 0;	/* DTIM count */ 
 		tie->tim_period = vap->iv_dtim_period;	/* DTIM period */
 		tie->tim_bitctl = 0;	/* bitmap control */
 		tie->tim_bitmap[0] = 0;	/* Partial Virtual Bitmap */
 		frm += sizeof(struct ieee80211_tim_ie);
 		/* Initial TIM carries a single bitmap byte. */
 		bo->bo_tim_len = 1;
 	}
 	/* Everything from here to the end of the frame follows the TIM. */
 	bo->bo_tim_trailer = frm;
 	if ((vap->iv_flags & IEEE80211_F_DOTH) ||
 	    (vap->iv_flags_ext & IEEE80211_FEXT_DOTD))
 		frm = ieee80211_add_countryie(frm, ic);
 	/*
 	 * bo_csa is recorded on both paths, whether or not a CSA element
 	 * is actually emitted now.
 	 */
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_5GHZ(ni->ni_chan))
 			frm = ieee80211_add_powerconstraint(frm, vap);
 		bo->bo_csa = frm;
 		if (ic->ic_flags & IEEE80211_F_CSAPENDING)
 			frm = ieee80211_add_csa(frm, vap);	
 	} else
 		bo->bo_csa = frm;
 
 	bo->bo_quiet = NULL;
 	if (vap->iv_flags & IEEE80211_F_DOTH) {
 		if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS) &&
 		    (vap->iv_quiet == 1)) {
 			/*
 			 * We only insert the quiet IE offset if
 			 * the quiet IE is enabled.  Otherwise don't
 			 * put it here or we'll just overwrite
 			 * some other beacon contents.
 			 */
 			/* NB: always true here given iv_quiet == 1 above. */
 			if (vap->iv_quiet) {
 				bo->bo_quiet = frm;
 				frm = ieee80211_add_quiet(frm,vap, 0);
 			}
 		}
 	}
 
 	if (IEEE80211_IS_CHAN_ANYG(ni->ni_chan)) {
 		bo->bo_erp = frm;
 		frm = ieee80211_add_erp(frm, vap);
 	}
 	frm = ieee80211_add_xrates(frm, rs);
 	frm = ieee80211_add_rsn(frm, vap);
 	/* For HT/VHT only the info (operation) element location is kept. */
 	if (IEEE80211_IS_CHAN_HT(ni->ni_chan)) {
 		frm = ieee80211_add_htcap(frm, ni);
 		bo->bo_htinfo = frm;
 		frm = ieee80211_add_htinfo(frm, ni);
 	}
 
 	if (IEEE80211_IS_CHAN_VHT(ni->ni_chan)) {
 		frm = ieee80211_add_vhtcap(frm, ni);
 		bo->bo_vhtinfo = frm;
 		frm = ieee80211_add_vhtinfo(frm, ni);
 		/* Transmit power envelope */
 		/* Channel switch wrapper element */
 		/* Extended bss load element */
 	}
 
 	frm = ieee80211_add_wpa(frm, vap);
 	if (vap->iv_flags & IEEE80211_F_WME) {
 		bo->bo_wme = frm;
 		frm = ieee80211_add_wme_param(frm, &ic->ic_wme,
 		    !! (vap->iv_flags_ext & IEEE80211_FEXT_UAPSD));
 	}
 	if (IEEE80211_IS_CHAN_HT(ni->ni_chan) &&
 	    (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT)) {
 		frm = ieee80211_add_htcap_vendor(frm, ni);
 		frm = ieee80211_add_htinfo_vendor(frm, ni);
 	}
 
 #ifdef IEEE80211_SUPPORT_SUPERG
 	if (vap->iv_flags & IEEE80211_F_ATHEROS) {
 		bo->bo_ath = frm;
 		frm = ieee80211_add_athcaps(frm, ni);
 	}
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 	if (vap->iv_caps & IEEE80211_C_TDMA) {
 		bo->bo_tdma = frm;
 		frm = ieee80211_add_tdma(frm, vap);
 	}
 #endif
 	if (vap->iv_appie_beacon != NULL) {
 		bo->bo_appie = frm;
 		bo->bo_appie_len = vap->iv_appie_beacon->ie_len;
 		frm = add_appie(frm, vap->iv_appie_beacon);
 	}
 
 	/* XXX TODO: move meshid/meshconf up to before vendor extensions? */
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS) {
 		frm = ieee80211_add_meshid(frm, vap);
 		bo->bo_meshconf = frm;
 		frm = ieee80211_add_meshconf(frm, vap);
 	}
 #endif
 	/* Byte counts from the TIM trailer / CSA position to end of frame. */
 	bo->bo_tim_trailer_len = frm - bo->bo_tim_trailer;
 	bo->bo_csa_trailer_len = frm - bo->bo_csa;
 	m->m_pkthdr.len = m->m_len = frm - mtod(m, uint8_t *);
 }
 
 /*
  * Allocate a beacon frame and fill in the appropriate bits.
  */
 struct mbuf *
 ieee80211_beacon_alloc(struct ieee80211_node *ni)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct mbuf *m;
 	int pktlen;
 	uint8_t *frm;
 
 	/*
 	 * Update the "We're putting the quiet IE in the beacon" state.
 	 */
 	if (vap->iv_quiet == 1)
 		vap->iv_flags_ext |= IEEE80211_FEXT_QUIET_IE;
 	else if (vap->iv_quiet == 0)
 		vap->iv_flags_ext &= ~IEEE80211_FEXT_QUIET_IE;
 
 	/*
 	 * beacon frame format
 	 *
 	 * Note: This needs updating for 802.11-2012.
 	 *
 	 *	[8] time stamp
 	 *	[2] beacon interval
 	 *	[2] capability information
 	 *	[tlv] ssid
 	 *	[tlv] supported rates
 	 *	[3] parameter set (DS)
 	 *	[8] CF parameter set (optional)
 	 *	[tlv] parameter set (IBSS/TIM)
 	 *	[tlv] country (optional)
 	 *	[3] power control (optional)
 	 *	[5] channel switch announcement (CSA) (optional)
 	 *	[tlv] extended rate phy (ERP)
 	 *	[tlv] extended supported rates
 	 *	[tlv] RSN parameters
 	 *	[tlv] HT capabilities
 	 *	[tlv] HT information
 	 *	[tlv] VHT capabilities
 	 *	[tlv] VHT operation
 	 *	[tlv] Vendor OUI HT capabilities (optional)
 	 *	[tlv] Vendor OUI HT information (optional)
 	 * XXX Vendor-specific OIDs (e.g. Atheros)
 	 *	[tlv] WPA parameters
 	 *	[tlv] WME parameters
 	 *	[tlv] TDMA parameters (optional)
 	 *	[tlv] Mesh ID (MBSS)
 	 *	[tlv] Mesh Conf (MBSS)
 	 *	[tlv] application data (optional)
 	 * NB: we allocate the max space required for the TIM bitmap.
 	 * XXX how big is this?
 	 */
 	pktlen =   8					/* time stamp */
 		 + sizeof(uint16_t)			/* beacon interval */
 		 + sizeof(uint16_t)			/* capabilities */
 		 + 2 + ni->ni_esslen			/* ssid */
 	         + 2 + IEEE80211_RATE_SIZE		/* supported rates */
 	         + 2 + 1				/* DS parameters */
 		 + 2 + 6				/* CF parameters */
 		 + 2 + 4 + vap->iv_tim_len		/* DTIM/IBSSPARMS */
 		 + IEEE80211_COUNTRY_MAX_SIZE		/* country */
 		 + 2 + 1				/* power control */
 		 + sizeof(struct ieee80211_csa_ie)	/* CSA */
 		 + sizeof(struct ieee80211_quiet_ie)	/* Quiet */
 		 + 2 + 1				/* ERP */
 	         + 2 + (IEEE80211_RATE_MAXSIZE - IEEE80211_RATE_SIZE)
 		 + (vap->iv_caps & IEEE80211_C_WPA ?	/* WPA 1+2 */
 			2*sizeof(struct ieee80211_ie_wpa) : 0)
 		 /* XXX conditional? */
 		 + 4+2*sizeof(struct ieee80211_ie_htcap)/* HT caps */
 		 + 4+2*sizeof(struct ieee80211_ie_htinfo)/* HT info */
 		 + sizeof(struct ieee80211_ie_vhtcap)/* VHT caps */
 		 + sizeof(struct ieee80211_ie_vht_operation)/* VHT info */
 		 + (vap->iv_caps & IEEE80211_C_WME ?	/* WME */
 			sizeof(struct ieee80211_wme_param) : 0)
 #ifdef IEEE80211_SUPPORT_SUPERG
 		 + sizeof(struct ieee80211_ath_ie)	/* ATH */
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 		 + (vap->iv_caps & IEEE80211_C_TDMA ?	/* TDMA */
 			sizeof(struct ieee80211_tdma_param) : 0)
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 		 + 2 + ni->ni_meshidlen
 		 + sizeof(struct ieee80211_meshconf_ie)
 #endif
 		 + IEEE80211_MAX_APPIE
 		 ;
 	m = ieee80211_getmgtframe(&frm,
 		ic->ic_headroom + sizeof(struct ieee80211_frame), pktlen);
 	if (m == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_ANY,
 			"%s: cannot get buf; size %u\n", __func__, pktlen);
 		vap->iv_stats.is_tx_nobuf++;
 		return NULL;
 	}
 	ieee80211_beacon_construct(m, frm, ni);
 
 	M_PREPEND(m, sizeof(struct ieee80211_frame), IEEE80211_M_NOWAIT);
 	KASSERT(m != NULL, ("no space for 802.11 header?"));
 	wh = mtod(m, struct ieee80211_frame *);
 	wh->i_fc[0] = IEEE80211_FC0_VERSION_0 | IEEE80211_FC0_TYPE_MGT |
 	    IEEE80211_FC0_SUBTYPE_BEACON;
 	wh->i_fc[1] = IEEE80211_FC1_DIR_NODS;
 	*(uint16_t *)wh->i_dur = 0;
 	IEEE80211_ADDR_COPY(wh->i_addr1, ifp->if_broadcastaddr);
 	IEEE80211_ADDR_COPY(wh->i_addr2, vap->iv_myaddr);
 	IEEE80211_ADDR_COPY(wh->i_addr3, ni->ni_bssid);
 	*(uint16_t *)wh->i_seq = 0;
 
 	return m;
 }
 
 /*
  * Update the dynamic parts of a beacon frame based on the current state.
  *
  *	ni	node whose vap owns the beacon; ni_vap, ni_ic, ni_chan and
  *		the non-QoS tx sequence counter are used.
  *	m	the beacon mbuf; it is modified in place and is expected
  *		to start with the 802.11 header.
  *	mcast	when non-zero, bit 0 of the TIM bitmap control field is
  *		set on the beacon whose DTIM count is zero.
  *
  * The com lock is taken on entry and dropped before every return.
  *
  * Returns 1 when the whole frame was reconstructed (CSA completion or
  * quiet IE added/removed) or when the TIM or application IE changed
  * size; otherwise returns 0.
  */
 int
 ieee80211_beacon_update(struct ieee80211_node *ni, struct mbuf *m, int mcast)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211_beacon_offsets *bo = &vap->iv_bcn_off;
 	struct ieee80211com *ic = ni->ni_ic;
 	int len_changed = 0;
 	uint16_t capinfo;
 	struct ieee80211_frame *wh;
 	ieee80211_seq seqno;
 
 	IEEE80211_LOCK(ic);
 	/*
 	 * Handle 11h channel change when we've reached the count.
 	 * We must recalculate the beacon frame contents to account
 	 * for the new channel.  Note we do this only for the first
 	 * vap that reaches this point; subsequent vaps just update
 	 * their beacon state to reflect the recalculated channel.
 	 */
 	if (isset(bo->bo_flags, IEEE80211_BEACON_CSA) &&
 	    vap->iv_csa_count == ic->ic_csa_count) {
 		vap->iv_csa_count = 0;
 		/*
 		 * Effect channel change before reconstructing the beacon
 		 * frame contents as many places reference ni_chan.
 		 */
 		if (ic->ic_csa_newchan != NULL)
 			ieee80211_csa_completeswitch(ic);
 		/*
 		 * NB: ieee80211_beacon_construct clears all pending
 		 * updates in bo_flags so we don't need to explicitly
 		 * clear IEEE80211_BEACON_CSA.
 		 */
 		ieee80211_beacon_construct(m,
 		    mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni);
 
 		/* XXX do WME aggressive mode processing? */
 		IEEE80211_UNLOCK(ic);
 		return 1;		/* just assume length changed */
 	}
 
 	/*
 	 * Handle the quiet time element being added and removed.
 	 * Again, for now we just cheat and reconstruct the whole
 	 * beacon - that way the gap is provided as appropriate.
 	 *
 	 * So, track whether we have already added the IE versus
 	 * whether we want to be adding the IE.
 	 */
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE) &&
 	    (vap->iv_quiet == 0)) {
 		/*
 		 * The quiet IE is marked as present in the beacon but
 		 * quiet time is now disabled; clear the marker and
 		 * rebuild the frame.
 		 */
 		vap->iv_flags_ext &= ~IEEE80211_FEXT_QUIET_IE;
 		ieee80211_beacon_construct(m,
 		    mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni);
 		/* XXX do WME aggressive mode processing? */
 		IEEE80211_UNLOCK(ic);
 		return 1;		/* just assume length changed */
 	}
 
 	if (((vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE) == 0) &&
 	    (vap->iv_quiet == 1)) {
 		/*
 		 * The quiet IE is not marked as present in the beacon
 		 * but quiet time is now enabled; set the marker and
 		 * rebuild the frame.
 		 */
 		vap->iv_flags_ext |= IEEE80211_FEXT_QUIET_IE;
 		ieee80211_beacon_construct(m,
 		    mtod(m, uint8_t*) + sizeof(struct ieee80211_frame), ni);
 		/* XXX do WME aggressive mode processing? */
 		IEEE80211_UNLOCK(ic);
 		return 1;		/* just assume length changed */
 	}
 
 	wh = mtod(m, struct ieee80211_frame *);
 
 	/*
 	 * XXX TODO Strictly speaking this should be incremented with the TX
 	 * lock held so as to serialise access to the non-qos TID sequence
 	 * number space.
 	 *
 	 * If the driver identifies it does its own TX seqno management then
 	 * we can skip this (and still not do the TX seqno.)
 	 */
 	seqno = ni->ni_txseqs[IEEE80211_NONQOS_TID]++;
 	*(uint16_t *)&wh->i_seq[0] =
 		htole16(seqno << IEEE80211_SEQ_SEQ_SHIFT);
 	M_SEQNO_SET(m, seqno);
 
 	/* XXX faster to recalculate entirely or just changes? */
 	capinfo = ieee80211_getcapinfo(vap, ni->ni_chan);
 	*bo->bo_caps = htole16(capinfo);
 
 	if (vap->iv_flags & IEEE80211_F_WME) {
 		struct ieee80211_wme_state *wme = &ic->ic_wme;
 
 		/*
 		 * Check for aggressive mode change.  When there is
 		 * significant high priority traffic in the BSS
 		 * throttle back BE traffic by using conservative
 		 * parameters.  Otherwise BE uses aggressive params
 		 * to optimize performance of legacy/non-QoS traffic.
 		 */
 		if (wme->wme_flags & WME_F_AGGRMODE) {
 			if (wme->wme_hipri_traffic >
 			    wme->wme_hipri_switch_thresh) {
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 				    "%s: traffic %u, disable aggressive mode\n",
 				    __func__, wme->wme_hipri_traffic);
 				wme->wme_flags &= ~WME_F_AGGRMODE;
 				ieee80211_wme_updateparams_locked(vap);
 				/* NB: restart the counter at the hysteresis value */
 				wme->wme_hipri_traffic =
 					wme->wme_hipri_switch_hysteresis;
 			} else
 				wme->wme_hipri_traffic = 0;
 		} else {
 			if (wme->wme_hipri_traffic <=
 			    wme->wme_hipri_switch_thresh) {
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 				    "%s: traffic %u, enable aggressive mode\n",
 				    __func__, wme->wme_hipri_traffic);
 				wme->wme_flags |= WME_F_AGGRMODE;
 				ieee80211_wme_updateparams_locked(vap);
 				wme->wme_hipri_traffic = 0;
 			} else
 				wme->wme_hipri_traffic =
 					wme->wme_hipri_switch_hysteresis;
 		}
 		/* rewrite the WME parameter element in place if flagged */
 		if (isset(bo->bo_flags, IEEE80211_BEACON_WME)) {
 			(void) ieee80211_add_wme_param(bo->bo_wme, wme,
 			  vap->iv_flags_ext & IEEE80211_FEXT_UAPSD);
 			clrbit(bo->bo_flags, IEEE80211_BEACON_WME);
 		}
 	}
 
 	if (isset(bo->bo_flags,  IEEE80211_BEACON_HTINFO)) {
 		ieee80211_ht_update_beacon(vap, bo);
 		clrbit(bo->bo_flags, IEEE80211_BEACON_HTINFO);
 	}
 #ifdef IEEE80211_SUPPORT_TDMA
 	if (vap->iv_caps & IEEE80211_C_TDMA) {
 		/*
 		 * NB: the beacon is potentially updated every TBTT.
 		 */
 		ieee80211_tdma_update_beacon(vap, bo);
 	}
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 	if (vap->iv_opmode == IEEE80211_M_MBSS)
 		ieee80211_mesh_update_beacon(vap, bo);
 #endif
 
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP ||
 	    vap->iv_opmode == IEEE80211_M_MBSS) {	/* NB: no IBSS support*/
 		struct ieee80211_tim_ie *tie =
 			(struct ieee80211_tim_ie *) bo->bo_tim;
 		if (isset(bo->bo_flags, IEEE80211_BEACON_TIM)) {
 			u_int timlen, timoff, i;
 			/*
 			 * ATIM/DTIM needs updating.  If it fits in the
 			 * current space allocated then just copy in the
 			 * new bits.  Otherwise we need to move any trailing
 			 * data to make room.  Note that we know there is
 			 * contiguous space because ieee80211_beacon_alloc
 			 * ensures there is space in the mbuf to write a
 			 * maximal-size virtual bitmap (based on iv_max_aid).
 			 */
 			/*
 			 * Calculate the bitmap size and offset, copy any
 			 * trailer out of the way, and then copy in the
 			 * new bitmap and update the information element.
 			 * Note that the tim bitmap must contain at least
 			 * one byte and any offset must be even.
 			 */
 			if (vap->iv_ps_pending != 0) {
 				timoff = 128;		/* impossibly large */
 				/* first non-zero byte, rounded down to even */
 				for (i = 0; i < vap->iv_tim_len; i++)
 					if (vap->iv_tim_bitmap[i]) {
 						timoff = i &~ 1;
 						break;
 					}
 				KASSERT(timoff != 128, ("tim bitmap empty!"));
 				/* last non-zero byte, scanning from the end */
 				for (i = vap->iv_tim_len-1; i >= timoff; i--)
 					if (vap->iv_tim_bitmap[i])
 						break;
 				timlen = 1 + (i - timoff);
 			} else {
 				timoff = 0;
 				timlen = 1;
 			}
 
 			/*
 			 * TODO: validate this!
 			 */
 			if (timlen != bo->bo_tim_len) {
 				/* copy up/down trailer */
 				int adjust = tie->tim_bitmap+timlen
 					   - bo->bo_tim_trailer;
 				ovbcopy(bo->bo_tim_trailer,
 				    bo->bo_tim_trailer+adjust,
 				    bo->bo_tim_trailer_len);
 				/* slide the recorded offsets by the same amount */
 				bo->bo_tim_trailer += adjust;
 				bo->bo_erp += adjust;
 				bo->bo_htinfo += adjust;
 				bo->bo_vhtinfo += adjust;
 #ifdef IEEE80211_SUPPORT_SUPERG
 				bo->bo_ath += adjust;
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 				bo->bo_tdma += adjust;
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 				bo->bo_meshconf += adjust;
 #endif
 				bo->bo_appie += adjust;
 				bo->bo_wme += adjust;
 				bo->bo_csa += adjust;
 				bo->bo_quiet += adjust;
 				bo->bo_tim_len = timlen;
 
 				/* update information element */
 				tie->tim_len = 3 + timlen;
 				tie->tim_bitctl = timoff;
 				len_changed = 1;
 			}
 			memcpy(tie->tim_bitmap, vap->iv_tim_bitmap + timoff,
 				bo->bo_tim_len);
 
 			clrbit(bo->bo_flags, IEEE80211_BEACON_TIM);
 
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_POWER,
 				"%s: TIM updated, pending %u, off %u, len %u\n",
 				__func__, vap->iv_ps_pending, timoff, timlen);
 		}
 		/* count down DTIM period */
 		if (tie->tim_count == 0)
 			tie->tim_count = tie->tim_period - 1;
 		else
 			tie->tim_count--;
 		/* update state for buffered multicast frames on DTIM */
 		if (mcast && tie->tim_count == 0)
 			tie->tim_bitctl |= 1;
 		else
 			tie->tim_bitctl &= ~1;
 		if (isset(bo->bo_flags, IEEE80211_BEACON_CSA)) {
 			struct ieee80211_csa_ie *csa =
 			    (struct ieee80211_csa_ie *) bo->bo_csa;
 
 			/*
 			 * Insert or update CSA ie.  If we're just starting
 			 * to count down to the channel switch then we need
 			 * to insert the CSA ie.  Otherwise we just need to
 			 * drop the count.  The actual change happens above
 			 * when the vap's count reaches the target count.
 			 */
 			if (vap->iv_csa_count == 0) {
 				/* open a gap of one CSA ie at bo_csa */
 				memmove(&csa[1], csa, bo->bo_csa_trailer_len);
 				bo->bo_erp += sizeof(*csa);
 				bo->bo_htinfo += sizeof(*csa);
 				bo->bo_vhtinfo += sizeof(*csa);
 				bo->bo_wme += sizeof(*csa);
 #ifdef IEEE80211_SUPPORT_SUPERG
 				bo->bo_ath += sizeof(*csa);
 #endif
 #ifdef IEEE80211_SUPPORT_TDMA
 				bo->bo_tdma += sizeof(*csa);
 #endif
 #ifdef IEEE80211_SUPPORT_MESH
 				bo->bo_meshconf += sizeof(*csa);
 #endif
 				bo->bo_appie += sizeof(*csa);
 				bo->bo_csa_trailer_len += sizeof(*csa);
 				bo->bo_quiet += sizeof(*csa);
 				bo->bo_tim_trailer_len += sizeof(*csa);
 				m->m_len += sizeof(*csa);
 				m->m_pkthdr.len += sizeof(*csa);
 
 				ieee80211_add_csa(bo->bo_csa, vap);
 			} else
 				csa->csa_count--;
 			vap->iv_csa_count++;
 			/* NB: don't clear IEEE80211_BEACON_CSA */
 		}
 
 		/*
 		 * Only add the quiet time IE if we've enabled it
 		 * as appropriate.
 		 */
 		if (IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS)) {
 			if (vap->iv_quiet &&
 			    (vap->iv_flags_ext & IEEE80211_FEXT_QUIET_IE)) {
 				ieee80211_add_quiet(bo->bo_quiet, vap, 1);
 			}
 		}
 		if (isset(bo->bo_flags, IEEE80211_BEACON_ERP)) {
 			/*
 			 * ERP element needs updating.
 			 */
 			(void) ieee80211_add_erp(bo->bo_erp, vap);
 			clrbit(bo->bo_flags, IEEE80211_BEACON_ERP);
 		}
 #ifdef IEEE80211_SUPPORT_SUPERG
 		if (isset(bo->bo_flags,  IEEE80211_BEACON_ATH)) {
 			ieee80211_add_athcaps(bo->bo_ath, ni);
 			clrbit(bo->bo_flags, IEEE80211_BEACON_ATH);
 		}
 #endif
 	}
 	if (isset(bo->bo_flags, IEEE80211_BEACON_APPIE)) {
 		const struct ieee80211_appie *aie = vap->iv_appie_beacon;
 		int aielen;
 		uint8_t *frm;
 
 		/* NB: a missing application IE is treated as zero length */
 		aielen = 0;
 		if (aie != NULL)
 			aielen += aie->ie_len;
 		if (aielen != bo->bo_appie_len) {
 			/*
 			 * copy up/down trailer
 			 *
 			 * NOTE(review): only bo_tim_trailer and bo_appie are
 			 * adjusted here; the remaining bo_* offsets and the
 			 * mbuf lengths are left untouched - confirm this is
 			 * intended.
 			 */
 			int adjust = aielen - bo->bo_appie_len;
 			ovbcopy(bo->bo_tim_trailer, bo->bo_tim_trailer+adjust,
 				bo->bo_tim_trailer_len);
 			bo->bo_tim_trailer += adjust;
 			bo->bo_appie += adjust;
 			bo->bo_appie_len = aielen;
 
 			len_changed = 1;
 		}
 		frm = bo->bo_appie;
 		if (aie != NULL)
 			frm  = add_appie(frm, aie);
 		clrbit(bo->bo_flags, IEEE80211_BEACON_APPIE);
 	}
 	IEEE80211_UNLOCK(ic);
 
 	return len_changed;
 }
 
 /*
  * Do Ethernet-LLC encapsulation for each payload in a fast frame
  * tunnel encapsulation.  The frame is assumed to have an Ethernet
  * header at the front that must be stripped before prepending the
  * LLC followed by the Ethernet header passed in (with an Ethernet
  * type that specifies the payload size).
  */
 struct mbuf *
 ieee80211_ff_encap1(struct ieee80211vap *vap, struct mbuf *m,
 	const struct ether_header *eh)
 {
 	struct llc *llc;
 	uint16_t payload;
 
 	/* XXX optimize by combining m_adj+M_PREPEND */
 	m_adj(m, sizeof(struct ether_header) - sizeof(struct llc));
 	llc = mtod(m, struct llc *);
 	llc->llc_dsap = llc->llc_ssap = LLC_SNAP_LSAP;
 	llc->llc_control = LLC_UI;
 	llc->llc_snap.org_code[0] = 0;
 	llc->llc_snap.org_code[1] = 0;
 	llc->llc_snap.org_code[2] = 0;
 	llc->llc_snap.ether_type = eh->ether_type;
 	payload = m->m_pkthdr.len;		/* NB: w/o Ethernet header */
 
 	M_PREPEND(m, sizeof(struct ether_header), IEEE80211_M_NOWAIT);
 	if (m == NULL) {		/* XXX cannot happen */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SUPERG,
 			"%s: no space for ether_header\n", __func__);
 		vap->iv_stats.is_tx_nobuf++;
 		return NULL;
 	}
 	ETHER_HEADER_COPY(mtod(m, void *), eh);
 	mtod(m, struct ether_header *)->ether_type = htons(payload);
 	return m;
 }
 
 /*
  * Complete an mbuf transmission.
  *
  * For now, this simply processes a completed frame after the
  * driver has completed it's transmission and/or retransmission.
  * It assumes the frame is an 802.11 encapsulated frame.
  *
  * Later on it will grow to become the exit path for a given frame
  * from the driver and, depending upon how it's been encapsulated
  * and already transmitted, it may end up doing A-MPDU retransmission,
  * power save requeuing, etc.
  *
  * In order for the above to work, the driver entry point to this
  * must not hold any driver locks.  Thus, the driver needs to delay
  * any actual mbuf completion until it can release said locks.
  *
  * This frees the mbuf and if the mbuf has a node reference,
  * the node reference will be freed.
  */
 void
 ieee80211_tx_complete(struct ieee80211_node *ni, struct mbuf *m, int status)
 {
 
 	if (ni != NULL) {
 		struct ifnet *ifp = ni->ni_vap->iv_ifp;
 
 		if (status == 0) {
 			if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			if (m->m_flags & M_MCAST)
 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		} else
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 		if (m->m_flags & M_TXCB) {
 			IEEE80211_DPRINTF(ni->ni_vap, IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 			   "ni %p vap %p mode %s state %s m %p status %d\n", ni, ni->ni_vap,
 			   ieee80211_opmode_name[ni->ni_vap->iv_opmode],
 			   ieee80211_state_name[ni->ni_vap->iv_state], m, status);
 			ieee80211_process_callback(ni, m, status);
 		}
 		ieee80211_free_node(ni);
 	}
 	m_freem(m);
 }
diff --git a/sys/net80211/ieee80211_proto.c b/sys/net80211/ieee80211_proto.c
index 7e76a3ae9226..ec94f2cf7275 100644
--- a/sys/net80211/ieee80211_proto.c
+++ b/sys/net80211/ieee80211_proto.c
@@ -1,2825 +1,2826 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001 Atsushi Onoe
  * Copyright (c) 2002-2008 Sam Leffler, Errno Consulting
  * Copyright (c) 2012 IEEE
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 protocol support.
  */
 
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>		/* XXX for ether_sprintf */
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_adhoc.h>
 #include <net80211/ieee80211_sta.h>
 #include <net80211/ieee80211_hostap.h>
 #include <net80211/ieee80211_wds.h>
 #ifdef IEEE80211_SUPPORT_MESH
 #include <net80211/ieee80211_mesh.h>
 #endif
 #include <net80211/ieee80211_monitor.h>
 #include <net80211/ieee80211_input.h>
 
 /*
  * XXX tunables
  *
  * Defaults for the WME aggressive mode switching logic.
  * AGGRESSIVE_MODE_SWITCH_HYSTERESIS seeds wme_hipri_switch_hysteresis
  * in ieee80211_proto_attach().
  */
 #define	AGGRESSIVE_MODE_SWITCH_HYSTERESIS	3	/* pkts / 100ms */
 #define	HIGH_PRI_SWITCH_THRESH			10	/* pkts / 100ms */
 
 /*
  * Printable names for management frame subtypes (16 entries).
  * NB: presumably indexed by the subtype value shifted down to 0..15;
  * confirm against the users of this table.
  */
 const char *mgt_subtype_name[] = {
 	"assoc_req",	"assoc_resp",	"reassoc_req",	"reassoc_resp",
 	"probe_req",	"probe_resp",	"timing_adv",	"reserved#7",
 	"beacon",	"atim",		"disassoc",	"auth",
 	"deauth",	"action",	"action_noack",	"reserved#15"
 };
 /* Printable names for control frame subtypes (16 entries). */
 const char *ctl_subtype_name[] = {
 	"reserved#0",	"reserved#1",	"reserved#2",	"reserved#3",
 	"reserved#4",	"reserved#5",	"reserved#6",	"control_wrap",
 	"bar",		"ba",		"ps_poll",	"rts",
 	"cts",		"ack",		"cf_end",	"cf_end_ack"
 };
 /* Printable operating mode names; indexed by the vap's iv_opmode. */
 const char *ieee80211_opmode_name[IEEE80211_OPMODE_MAX] = {
 	"IBSS",		/* IEEE80211_M_IBSS */
 	"STA",		/* IEEE80211_M_STA */
 	"WDS",		/* IEEE80211_M_WDS */
 	"AHDEMO",	/* IEEE80211_M_AHDEMO */
 	"HOSTAP",	/* IEEE80211_M_HOSTAP */
 	"MONITOR",	/* IEEE80211_M_MONITOR */
 	"MBSS"		/* IEEE80211_M_MBSS */
 };
 /* Printable state machine state names; indexed by the vap's iv_state. */
 const char *ieee80211_state_name[IEEE80211_S_MAX] = {
 	"INIT",		/* IEEE80211_S_INIT */
 	"SCAN",		/* IEEE80211_S_SCAN */
 	"AUTH",		/* IEEE80211_S_AUTH */
 	"ASSOC",	/* IEEE80211_S_ASSOC */
 	"CAC",		/* IEEE80211_S_CAC */
 	"RUN",		/* IEEE80211_S_RUN */
 	"CSA",		/* IEEE80211_S_CSA */
 	"SLEEP",	/* IEEE80211_S_SLEEP */
 };
 /* Printable names for the WME access categories, plus "WME_UPSD". */
 const char *ieee80211_wme_acnames[] = {
 	"WME_AC_BE",
 	"WME_AC_BK",
 	"WME_AC_VI",
 	"WME_AC_VO",
 	"WME_UPSD",
 };
 
 /*
  * Map an 802.11 reason code to a human readable description.
  *
  * Reason code descriptions were (mostly) obtained from
  * IEEE Std 802.11-2012, pp. 442-445 Table 8-36.
  *
  * The returned string is a constant; codes without an entry yield
  * "reserved/unknown".
  */
 const char *
 ieee80211_reason_to_string(uint16_t reason)
 {
 	const char *desc;
 
 	switch (reason) {
 	case IEEE80211_REASON_UNSPECIFIED:
 		desc = "unspecified";
 		break;
 	case IEEE80211_REASON_AUTH_EXPIRE:
 		desc = "previous authentication is expired";
 		break;
 	case IEEE80211_REASON_AUTH_LEAVE:
 		desc = "sending STA is leaving/has left IBSS or ESS";
 		break;
 	case IEEE80211_REASON_ASSOC_EXPIRE:
 		desc = "disassociated due to inactivity";
 		break;
 	case IEEE80211_REASON_ASSOC_TOOMANY:
 		desc = "too many associated STAs";
 		break;
 	case IEEE80211_REASON_NOT_AUTHED:
 		desc = "class 2 frame received from nonauthenticated STA";
 		break;
 	case IEEE80211_REASON_NOT_ASSOCED:
 		desc = "class 3 frame received from nonassociated STA";
 		break;
 	case IEEE80211_REASON_ASSOC_LEAVE:
 		desc = "sending STA is leaving/has left BSS";
 		break;
 	case IEEE80211_REASON_ASSOC_NOT_AUTHED:
 		desc = "STA requesting (re)association is not authenticated";
 		break;
 	case IEEE80211_REASON_DISASSOC_PWRCAP_BAD:
 		desc = "information in the Power Capability element is "
 		    "unacceptable";
 		break;
 	case IEEE80211_REASON_DISASSOC_SUPCHAN_BAD:
 		desc = "information in the Supported Channels element is "
 		    "unacceptable";
 		break;
 	case IEEE80211_REASON_IE_INVALID:
 		desc = "invalid element";
 		break;
 	case IEEE80211_REASON_MIC_FAILURE:
 		desc = "MIC failure";
 		break;
 	case IEEE80211_REASON_4WAY_HANDSHAKE_TIMEOUT:
 		desc = "4-Way handshake timeout";
 		break;
 	case IEEE80211_REASON_GROUP_KEY_UPDATE_TIMEOUT:
 		desc = "group key update timeout";
 		break;
 	case IEEE80211_REASON_IE_IN_4WAY_DIFFERS:
 		desc = "element in 4-Way handshake different from "
 		    "(re)association request/probe response/beacon frame";
 		break;
 	case IEEE80211_REASON_GROUP_CIPHER_INVALID:
 		desc = "invalid group cipher";
 		break;
 	case IEEE80211_REASON_PAIRWISE_CIPHER_INVALID:
 		desc = "invalid pairwise cipher";
 		break;
 	case IEEE80211_REASON_AKMP_INVALID:
 		desc = "invalid AKMP";
 		break;
 	case IEEE80211_REASON_UNSUPP_RSN_IE_VERSION:
 		desc = "unsupported version in RSN IE";
 		break;
 	case IEEE80211_REASON_INVALID_RSN_IE_CAP:
 		desc = "invalid capabilities in RSN IE";
 		break;
 	case IEEE80211_REASON_802_1X_AUTH_FAILED:
 		desc = "IEEE 802.1X authentication failed";
 		break;
 	case IEEE80211_REASON_CIPHER_SUITE_REJECTED:
 		desc = "cipher suite rejected because of the security "
 		    "policy";
 		break;
 	case IEEE80211_REASON_UNSPECIFIED_QOS:
 		desc = "unspecified (QoS-related)";
 		break;
 	case IEEE80211_REASON_INSUFFICIENT_BW:
 		desc = "QoS AP lacks sufficient bandwidth for this QoS STA";
 		break;
 	case IEEE80211_REASON_TOOMANY_FRAMES:
 		desc = "too many frames need to be acknowledged";
 		break;
 	case IEEE80211_REASON_OUTSIDE_TXOP:
 		desc = "STA is transmitting outside the limits of its TXOPs";
 		break;
 	case IEEE80211_REASON_LEAVING_QBSS:
 		desc = "requested from peer STA (the STA is "
 		    "resetting/leaving the BSS)";
 		break;
 	case IEEE80211_REASON_BAD_MECHANISM:
 		desc = "requested from peer STA (it does not want to use "
 		    "the mechanism)";
 		break;
 	case IEEE80211_REASON_SETUP_NEEDED:
 		desc = "requested from peer STA (setup is required for the "
 		    "used mechanism)";
 		break;
 	case IEEE80211_REASON_TIMEOUT:
 		desc = "requested from peer STA (timeout)";
 		break;
 	case IEEE80211_REASON_PEER_LINK_CANCELED:
 		desc = "SME cancels the mesh peering instance (not related "
 		    "to the maximum number of peer mesh STAs)";
 		break;
 	case IEEE80211_REASON_MESH_MAX_PEERS:
 		desc = "maximum number of peer mesh STAs was reached";
 		break;
 	case IEEE80211_REASON_MESH_CPVIOLATION:
 		desc = "the received information violates the Mesh "
 		    "Configuration policy configured in the mesh STA "
 		    "profile";
 		break;
 	case IEEE80211_REASON_MESH_CLOSE_RCVD:
 		desc = "the mesh STA has received a Mesh Peering Close "
 		    "message requesting to close the mesh peering";
 		break;
 	case IEEE80211_REASON_MESH_MAX_RETRIES:
 		desc = "the mesh STA has resent dot11MeshMaxRetries Mesh "
 		    "Peering Open messages, without receiving a Mesh "
 		    "Peering Confirm message";
 		break;
 	case IEEE80211_REASON_MESH_CONFIRM_TIMEOUT:
 		desc = "the confirmTimer for the mesh peering instance times "
 		    "out";
 		break;
 	case IEEE80211_REASON_MESH_INVALID_GTK:
 		desc = "the mesh STA fails to unwrap the GTK or the values "
 		    "in the wrapped contents do not match";
 		break;
 	case IEEE80211_REASON_MESH_INCONS_PARAMS:
 		desc = "the mesh STA receives inconsistent information about "
 		    "the mesh parameters between Mesh Peering Management "
 		    "frames";
 		break;
 	case IEEE80211_REASON_MESH_INVALID_SECURITY:
 		desc = "the mesh STA fails the authenticated mesh peering "
 		    "exchange because due to failure in selecting "
 		    "pairwise/group ciphersuite";
 		break;
 	case IEEE80211_REASON_MESH_PERR_NO_PROXY:
 		desc = "the mesh STA does not have proxy information for "
 		    "this external destination";
 		break;
 	case IEEE80211_REASON_MESH_PERR_NO_FI:
 		desc = "the mesh STA does not have forwarding information "
 		    "for this destination";
 		break;
 	case IEEE80211_REASON_MESH_PERR_DEST_UNREACH:
 		desc = "the mesh STA determines that the link to the next "
 		    "hop of an active path in its forwarding information "
 		    "is no longer usable";
 		break;
 	case IEEE80211_REASON_MESH_MAC_ALRDY_EXISTS_MBSS:
 		desc = "the MAC address of the STA already exists in the "
 		    "mesh BSS";
 		break;
 	case IEEE80211_REASON_MESH_CHAN_SWITCH_REG:
 		desc = "the mesh STA performs channel switch to meet "
 		    "regulatory requirements";
 		break;
 	case IEEE80211_REASON_MESH_CHAN_SWITCH_UNSPEC:
 		desc = "the mesh STA performs channel switch with "
 		    "unspecified reason";
 		break;
 	default:
 		desc = "reserved/unknown";
 		break;
 	}
 	return (desc);
 }
 
 /*
  * Forward declarations of file-local functions that are referenced
  * below before they are defined: the (void *, int) handlers are
  * installed with TASK_INIT() in ieee80211_proto_attach() and
  * ieee80211_proto_vattach(), and vap_update_bss() becomes the default
  * iv_update_bss method.
  */
 static void beacon_miss(void *, int);
 static void beacon_swmiss(void *, int);
 static void parent_updown(void *, int);
 static void update_mcast(void *, int);
 static void update_promisc(void *, int);
 static void update_channel(void *, int);
 static void update_chw(void *, int);
 static void vap_update_wme(void *, int);
 static void vap_update_slot(void *, int);
 static void restart_vaps(void *, int);
 static void vap_update_erp_protmode(void *, int);
 static void vap_update_preamble(void *, int);
 static void vap_update_ht_protmode(void *, int);
 static void ieee80211_newstate_cb(void *, int);
 static struct ieee80211_node *vap_update_bss(struct ieee80211vap *,
     struct ieee80211_node *);
 
 /*
  * Placeholder ic_raw_xmit method installed by ieee80211_proto_attach().
  * It logs the missing callback, frees the frame and fails the
  * request with ENETDOWN.  The bpf parameters are ignored.
  */
 static int
 null_raw_xmit(struct ieee80211_node *ni, struct mbuf *m,
 	const struct ieee80211_bpf_params *params)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 
 	ic_printf(ic, "missing ic_raw_xmit callback, drop frame\n");
 	m_freem(m);
 	return (ENETDOWN);
 }
 
 /*
  * Device-level protocol attach: grow the link header allowance for
  * 802.11, set up the com's deferred tasks and default methods, and
  * attach each operating mode module.
  */
 void
 ieee80211_proto_attach(struct ieee80211com *ic)
 {
 	uint8_t linkhdr;
 
 	/*
 	 * Override the 802.3 setting: allow for the largest 802.11
 	 * header (QoS, 4-address), the WEP IV/key id/extended IV and
 	 * whatever headroom the driver requested.
 	 */
 	linkhdr = sizeof(struct ieee80211_qosframe_addr4)
 	    + (IEEE80211_WEP_IVLEN + IEEE80211_WEP_KIDLEN +
 	       IEEE80211_WEP_EXTIVLEN)
 	    + ic->ic_headroom;
 	/* XXX no way to recalculate on ifdetach */
 	max_linkhdr_grow(ALIGN(linkhdr));
 	/* NB: ic_protmode is deliberately not set here */
 
 	/* deferred work run on behalf of the com */
 	TASK_INIT(&ic->ic_parent_task, 0, parent_updown, ic);
 	TASK_INIT(&ic->ic_mcast_task, 0, update_mcast, ic);
 	TASK_INIT(&ic->ic_promisc_task, 0, update_promisc, ic);
 	TASK_INIT(&ic->ic_chan_task, 0, update_channel, ic);
 	TASK_INIT(&ic->ic_bmiss_task, 0, beacon_miss, ic);
 	TASK_INIT(&ic->ic_chw_task, 0, update_chw, ic);
 	TASK_INIT(&ic->ic_restart_task, 0, restart_vaps, ic);
 
 	/* initialize management frame handlers */
 	ic->ic_raw_xmit = null_raw_xmit;
 	ic->ic_send_mgmt = ieee80211_send_mgmt;
 
 	ic->ic_wme.wme_hipri_switch_hysteresis =
 	    AGGRESSIVE_MODE_SWITCH_HYSTERESIS;
 
 	/* operating mode support */
 	ieee80211_adhoc_attach(ic);
 	ieee80211_sta_attach(ic);
 	ieee80211_wds_attach(ic);
 	ieee80211_hostap_attach(ic);
 #ifdef IEEE80211_SUPPORT_MESH
 	ieee80211_mesh_attach(ic);
 #endif
 	ieee80211_monitor_attach(ic);
 }
 
 /*
  * Device-level protocol detach: detach each operating mode module
  * that ieee80211_proto_attach() attached.
  */
 void
 ieee80211_proto_detach(struct ieee80211com *ic)
 {
 
 	ieee80211_monitor_detach(ic);
 #ifdef IEEE80211_SUPPORT_MESH
 	ieee80211_mesh_detach(ic);
 #endif
 	ieee80211_hostap_detach(ic);
 	ieee80211_wds_detach(ic);
 	ieee80211_adhoc_detach(ic);
 	ieee80211_sta_detach(ic);
 }
 
 /*
  * Default iv_update_beacon method installed by
  * ieee80211_proto_vattach(); intentionally does nothing.
  */
 static void
 null_update_beacon(struct ieee80211vap *vap, int item)
 {
 	/* nothing to do */
 }
 
 void
 ieee80211_proto_vattach(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	int i;
 
 	/* override the 802.3 setting */
 	ifp->if_hdrlen = ic->ic_headroom
                 + sizeof(struct ieee80211_qosframe_addr4)
                 + IEEE80211_WEP_IVLEN + IEEE80211_WEP_KIDLEN
                 + IEEE80211_WEP_EXTIVLEN;
 
 	vap->iv_rtsthreshold = IEEE80211_RTS_DEFAULT;
 	vap->iv_fragthreshold = IEEE80211_FRAG_DEFAULT;
 	vap->iv_bmiss_max = IEEE80211_BMISS_MAX;
 	callout_init_mtx(&vap->iv_swbmiss, IEEE80211_LOCK_OBJ(ic), 0);
 	callout_init(&vap->iv_mgtsend, 1);
 	TASK_INIT(&vap->iv_nstate_task, 0, ieee80211_newstate_cb, vap);
 	TASK_INIT(&vap->iv_swbmiss_task, 0, beacon_swmiss, vap);
 	TASK_INIT(&vap->iv_wme_task, 0, vap_update_wme, vap);
 	TASK_INIT(&vap->iv_slot_task, 0, vap_update_slot, vap);
 	TASK_INIT(&vap->iv_erp_protmode_task, 0, vap_update_erp_protmode, vap);
 	TASK_INIT(&vap->iv_ht_protmode_task, 0, vap_update_ht_protmode, vap);
 	TASK_INIT(&vap->iv_preamble_task, 0, vap_update_preamble, vap);
 	/*
 	 * Install default tx rate handling: no fixed rate, lowest
 	 * supported rate for mgmt and multicast frames.  Default
 	 * max retry count.  These settings can be changed by the
 	 * driver and/or user applications.
 	 */
 	for (i = IEEE80211_MODE_11A; i < IEEE80211_MODE_MAX; i++) {
 		if (isclr(ic->ic_modecaps, i))
 			continue;
 
 		const struct ieee80211_rateset *rs = &ic->ic_sup_rates[i];
 
 		vap->iv_txparms[i].ucastrate = IEEE80211_FIXED_RATE_NONE;
 
 		/*
 		 * Setting the management rate to MCS 0 assumes that the
 		 * BSS Basic rate set is empty and the BSS Basic MCS set
 		 * is not.
 		 *
 		 * Since we're not checking this, default to the lowest
 		 * defined rate for this mode.
 		 *
 		 * At least one 11n AP (DLINK DIR-825) is reported to drop
 		 * some MCS management traffic (eg BA response frames.)
 		 *
 		 * See also: 9.6.0 of the 802.11n-2009 specification.
 		 */
 #ifdef	NOTYET
 		if (i == IEEE80211_MODE_11NA || i == IEEE80211_MODE_11NG) {
 			vap->iv_txparms[i].mgmtrate = 0 | IEEE80211_RATE_MCS;
 			vap->iv_txparms[i].mcastrate = 0 | IEEE80211_RATE_MCS;
 		} else {
 			vap->iv_txparms[i].mgmtrate =
 			    rs->rs_rates[0] & IEEE80211_RATE_VAL;
 			vap->iv_txparms[i].mcastrate = 
 			    rs->rs_rates[0] & IEEE80211_RATE_VAL;
 		}
 #endif
 		vap->iv_txparms[i].mgmtrate = rs->rs_rates[0] & IEEE80211_RATE_VAL;
 		vap->iv_txparms[i].mcastrate = rs->rs_rates[0] & IEEE80211_RATE_VAL;
 		vap->iv_txparms[i].maxretry = IEEE80211_TXMAX_DEFAULT;
 	}
 	vap->iv_roaming = IEEE80211_ROAMING_AUTO;
 
 	vap->iv_update_beacon = null_update_beacon;
 	vap->iv_deliver_data = ieee80211_deliver_data;
 	vap->iv_protmode = IEEE80211_PROT_CTSONLY;
 	vap->iv_update_bss = vap_update_bss;
 
 	/* attach support for operating mode */
 	ic->ic_vattach[vap->iv_opmode](vap);
 }
 
 /*
  * Per-vap protocol detach: detach the operating mode module, the
  * authenticator and any ACL module, then free the vap's application
  * information elements.
  */
 void
 ieee80211_proto_vdetach(struct ieee80211vap *vap)
 {
 /* Free an application IE if one is present. */
 #define	RELEASE_APPIE(_ie) do {						\
 	if ((_ie) != NULL)						\
 		IEEE80211_FREE((_ie), M_80211_NODE_IE);			\
 } while (0)
 
 	/* Operating mode module first. */
 	if (vap->iv_opdetach != NULL)
 		vap->iv_opdetach(vap);
 
 	/*
 	 * This should not be needed as we detach when resetting
 	 * the state but be conservative here since the
 	 * authenticator may do things like spawn kernel threads.
 	 */
 	if (vap->iv_auth->ia_detach != NULL)
 		vap->iv_auth->ia_detach(vap);
 
 	/* Then any ACL'ator. */
 	if (vap->iv_acl != NULL)
 		vap->iv_acl->iac_detach(vap);
 
 	RELEASE_APPIE(vap->iv_appie_beacon);
 	RELEASE_APPIE(vap->iv_appie_probereq);
 	RELEASE_APPIE(vap->iv_appie_proberesp);
 	RELEASE_APPIE(vap->iv_appie_assocreq);
 	RELEASE_APPIE(vap->iv_appie_assocresp);
 	RELEASE_APPIE(vap->iv_appie_wpa);
 #undef RELEASE_APPIE
 }
 
 /*
  * Simple-minded authenticator module support.
  */
 
 /* Size of the per-authentication-mode tables below (last mode + 1). */
 #define	IEEE80211_AUTH_MAX	(IEEE80211_AUTH_WPA+1)
 /* XXX well-known names */
 /*
  * Name of the module that ieee80211_authenticator_get() asks
  * ieee80211_load_module() to load when no authenticator is registered
  * for a mode; indexed by authentication mode.
  */
 static const char *auth_modnames[IEEE80211_AUTH_MAX] = {
 	"wlan_internal",	/* IEEE80211_AUTH_NONE */
 	"wlan_internal",	/* IEEE80211_AUTH_OPEN */
 	"wlan_internal",	/* IEEE80211_AUTH_SHARED */
 	"wlan_xauth",		/* IEEE80211_AUTH_8021X	 */
 	"wlan_internal",	/* IEEE80211_AUTH_AUTO */
 	"wlan_xauth",		/* IEEE80211_AUTH_WPA */
 };
 /* Currently registered authenticators by mode; NULL when none is registered. */
 static const struct ieee80211_authenticator *authenticators[IEEE80211_AUTH_MAX];
 
 /*
  * Built-in authenticator registered by ieee80211_auth_setup(); it
  * provides a name only and leaves every hook unset.
  */
 static const struct ieee80211_authenticator auth_internal = {
 	.ia_name		= "wlan_internal",
 	.ia_attach		= NULL,
 	.ia_detach		= NULL,
 	.ia_node_join		= NULL,
 	.ia_node_leave		= NULL,
 };
 
 /*
  * Setup internal authenticators once; they are never unregistered.
  */
 static void
 ieee80211_auth_setup(void)
 {
 	/* Modes served by the built-in authenticator. */
 	static const int internal_modes[] = {
 		IEEE80211_AUTH_OPEN,
 		IEEE80211_AUTH_SHARED,
 		IEEE80211_AUTH_AUTO,
 	};
 	size_t n;
 
 	for (n = 0; n < nitems(internal_modes); n++)
 		ieee80211_authenticator_register(internal_modes[n],
 		    &auth_internal);
 }
 SYSINIT(wlan_auth, SI_SUB_DRIVERS, SI_ORDER_FIRST, ieee80211_auth_setup, NULL);
 
 /*
  * Look up the authenticator registered for an authentication mode.
  *
  * If nothing is registered for the mode, the module named in
  * auth_modnames[] is requested via ieee80211_load_module() and the
  * table is consulted again.
  *
  * Returns the registered authenticator, or NULL when the mode is out of
  * range (negative or >= IEEE80211_AUTH_MAX) or still unregistered after
  * the load attempt.
  */
 const struct ieee80211_authenticator *
 ieee80211_authenticator_get(int auth)
 {
 	/* auth is signed: reject negative values as well as too-large ones */
 	if (auth < 0 || auth >= IEEE80211_AUTH_MAX)
 		return NULL;
 	if (authenticators[auth] == NULL)
 		ieee80211_load_module(auth_modnames[auth]);
 	return authenticators[auth];
 }
 
 /*
  * Install auth as the authenticator for authentication mode type,
  * replacing whatever was registered before.  Requests for a mode outside
  * 0..IEEE80211_AUTH_MAX-1 are silently ignored.
  */
 void
 ieee80211_authenticator_register(int type,
 	const struct ieee80211_authenticator *auth)
 {
 	/* type is signed: a negative value must not index the table */
 	if (type < 0 || type >= IEEE80211_AUTH_MAX)
 		return;
 	authenticators[type] = auth;
 }
 
 /*
  * Clear the authenticator registered for authentication mode type.
  * Requests for a mode outside 0..IEEE80211_AUTH_MAX-1 are silently
  * ignored.
  */
 void
 ieee80211_authenticator_unregister(int type)
 {
 
 	/* type is signed: a negative value must not index the table */
 	if (type < 0 || type >= IEEE80211_AUTH_MAX)
 		return;
 	authenticators[type] = NULL;
 }
 
 /*
  * Very simple-minded ACL module support.
  */
 /* XXX just one for now */
 /* The single registered ACL policy module; NULL when none is registered. */
 static	const struct ieee80211_aclator *acl = NULL;
 
 /*
  * Register iac as the ACL policy module.  Only one module is tracked,
  * so any previously registered one is replaced.
  */
 void
 ieee80211_aclator_register(const struct ieee80211_aclator *iac)
 {
 	printf("wlan: %s acl policy registered\n", iac->iac_name);
 	acl = iac;
 }
 
 /*
  * Unregister the ACL policy module iac.  The registration is cleared
  * only if iac is the currently registered module; the message is
  * printed either way.
  */
 void
 ieee80211_aclator_unregister(const struct ieee80211_aclator *iac)
 {
 	if (acl == iac)
 		acl = NULL;
 	printf("wlan: %s acl policy unregistered\n", iac->iac_name);
 }
 
 /*
  * Return the registered ACL policy module if its name matches name.
  * When nothing is registered a load of "wlan_acl" is attempted first.
  * NULL is returned if there is still no module or the names differ.
  */
 const struct ieee80211_aclator *
 ieee80211_aclator_get(const char *name)
 {
 	if (acl == NULL)
 		ieee80211_load_module("wlan_acl");
 	if (acl == NULL)
 		return NULL;
 	if (strcmp(acl->iac_name, name) != 0)
 		return NULL;
 	return acl;
 }
 
 /*
  * Print an ESSID.  At most IEEE80211_NWID_LEN bytes are shown.  If every
  * byte is in the range ' '..0x7e the name is printed as a quoted string,
  * otherwise it is printed as "0x" followed by hex digits.
  */
 void
 ieee80211_print_essid(const uint8_t *essid, int len)
 {
 	int n;
 
 	if (len > IEEE80211_NWID_LEN)
 		len = IEEE80211_NWID_LEN;
 	/* stop at the first byte that is not printable */
 	for (n = 0; n < len; n++)
 		if (essid[n] < ' ' || essid[n] > 0x7e)
 			break;
 	if (n == len) {
 		/* scanned everything: all bytes are printable */
 		printf("\"");
 		for (n = 0; n < len; n++)
 			printf("%c", essid[n]);
 		printf("\"");
 	} else {
 		printf("0x");
 		for (n = 0; n < len; n++)
 			printf("%02x", essid[n]);
 	}
 }
 
 /*
  * Print a one-line summary of an 802.11 frame followed by a hex dump.
  *
  * buf/len	frame contents; buf is interpreted as a struct ieee80211_frame
  *		header.  All len bytes are hex-dumped when len > 0.
  * rate	printed as "rate / 2" followed by "M" when non-negative.
  * rssi	printed with a leading '+' when non-negative.
  *
  * NOTE(review): the header fields are read without checking len against
  * the header size; callers presumably pass complete frames -- confirm.
  */
 void
 ieee80211_dump_pkt(struct ieee80211com *ic,
 	const uint8_t *buf, int len, int rate, int rssi)
 {
 	const struct ieee80211_frame *wh;
 	int i;
 
 	wh = (const struct ieee80211_frame *)buf;
 	/* Addresses, ordered according to the To/From DS direction bits. */
 	switch (wh->i_fc[1] & IEEE80211_FC1_DIR_MASK) {
 	case IEEE80211_FC1_DIR_NODS:
 		printf("NODS %s", ether_sprintf(wh->i_addr2));
 		printf("->%s", ether_sprintf(wh->i_addr1));
 		printf("(%s)", ether_sprintf(wh->i_addr3));
 		break;
 	case IEEE80211_FC1_DIR_TODS:
 		printf("TODS %s", ether_sprintf(wh->i_addr2));
 		printf("->%s", ether_sprintf(wh->i_addr3));
 		printf("(%s)", ether_sprintf(wh->i_addr1));
 		break;
 	case IEEE80211_FC1_DIR_FROMDS:
 		printf("FRDS %s", ether_sprintf(wh->i_addr3));
 		printf("->%s", ether_sprintf(wh->i_addr1));
 		printf("(%s)", ether_sprintf(wh->i_addr2));
 		break;
 	case IEEE80211_FC1_DIR_DSTODS:
 		/* first address printed is taken from the bytes right after *wh */
 		printf("DSDS %s", ether_sprintf((const uint8_t *)&wh[1]));
 		printf("->%s", ether_sprintf(wh->i_addr3));
 		printf("(%s", ether_sprintf(wh->i_addr2));
 		printf("->%s)", ether_sprintf(wh->i_addr1));
 		break;
 	}
 	/* Frame type; management frames are further named by subtype. */
 	switch (wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK) {
 	case IEEE80211_FC0_TYPE_DATA:
 		printf(" data");
 		break;
 	case IEEE80211_FC0_TYPE_MGT:
 		printf(" %s", ieee80211_mgt_subtype_name(wh->i_fc[0]));
 		break;
 	default:
 		printf(" type#%d", wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK);
 		break;
 	}
 	if (IEEE80211_QOS_HAS_SEQ(wh)) {
 		const struct ieee80211_qosframe *qwh = 
 			(const struct ieee80211_qosframe *)buf;
 		/* NB: " ACM" is printed when the ACKPOLICY bits are set */
 		printf(" QoS [TID %u%s]", qwh->i_qos[0] & IEEE80211_QOS_TID,
 			qwh->i_qos[0] & IEEE80211_QOS_ACKPOLICY ? " ACM" : "");
 	}
 	if (IEEE80211_IS_PROTECTED(wh)) {
 		int off;
 
 		/* crypto header starts right after the 802.11 header */
 		off = ieee80211_anyhdrspace(ic, wh);
 		printf(" WEP [IV %.02x %.02x %.02x",
 			buf[off+0], buf[off+1], buf[off+2]);
 		/* extended IV bytes are shown only when the EXTIV bit is set */
 		if (buf[off+IEEE80211_WEP_IVLEN] & IEEE80211_WEP_EXTIV)
 			printf(" %.02x %.02x %.02x",
 				buf[off+4], buf[off+5], buf[off+6]);
 		/* key id is the top two bits of the byte following the IV */
 		printf(" KID %u]", buf[off+IEEE80211_WEP_IVLEN] >> 6);
 	}
 	if (rate >= 0)
 		printf(" %dM", rate / 2);
 	if (rssi >= 0)
 		printf(" +%d", rssi);
 	printf("\n");
 	/* Hex dump of the whole buffer, grouped two bytes at a time. */
 	if (len > 0) {
 		for (i = 0; i < len; i++) {
 			if ((i & 1) == 0)
 				printf(" ");
 			printf("%02x", buf[i]);
 		}
 		printf("\n");
 	}
 }
 
 /*
  * Return the index of the first entry in rs whose rate value (with the
  * non-IEEE80211_RATE_VAL bits masked off) equals r, or -1 if none does.
  */
 static __inline int
 findrix(const struct ieee80211_rateset *rs, int r)
 {
 	int rix = 0;
 
 	while (rix < rs->rs_nrates) {
 		if (r == (rs->rs_rates[rix] & IEEE80211_RATE_VAL))
 			return rix;
 		rix++;
 	}
 	return -1;
 }
 
 /*
  * Check, and optionally rewrite in place, the rate set nrs of node ni
  * against the rates supported on the node's channel.
  *
  * flags selects the work done:
  *   IEEE80211_F_DOSORT	sort nrs by ascending rate value
  *   IEEE80211_F_DONEGO	negotiate: an unsupported basic rate is an
  *				error when IEEE80211_F_JOIN is set; without
  *				JOIN a supported rate is overwritten with our
  *				own value for it
  *   IEEE80211_F_DODEL	delete unsupported rates from nrs
  *   IEEE80211_F_DOFRATE/
  *   IEEE80211_F_DOFMCS	require the configured fixed unicast rate
  *				(legacy or MCS respectively) to be present
  *   IEEE80211_F_DOBRS	compare against the HT rate set instead of
  *				the legacy supported rate set
  *
  * Returns IEEE80211_RV() of the last supported rate seen on success.  On
  * failure (no supported rate, a negotiation error, or a missing fixed
  * rate) the last rate examined is returned with IEEE80211_RATE_BASIC or'd
  * in.
  */
 int
 ieee80211_fix_rate(struct ieee80211_node *ni,
 	struct ieee80211_rateset *nrs, int flags)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	int i, j, rix, error;
 	int okrate, badrate, fixedrate, ucastrate;
 	const struct ieee80211_rateset *srs;
 	uint8_t r;
 
 	error = 0;
 	okrate = badrate = 0;
 	ucastrate = vap->iv_txparms[ieee80211_chan2mode(ni->ni_chan)].ucastrate;
 	if (ucastrate != IEEE80211_FIXED_RATE_NONE) {
 		/*
 		 * Workaround awkwardness with fixed rate.  We are called
 		 * to check both the legacy rate set and the HT rate set
 		 * but we must apply any legacy fixed rate check only to the
 		 * legacy rate set and vice versa.  We cannot tell what type
 		 * of rate set we've been given (legacy or HT) but we can
 		 * distinguish the fixed rate type (MCS have 0x80 set).
 		 * So to deal with this the caller communicates whether to
 		 * check MCS or legacy rate using the flags and we use the
 		 * type of any fixed rate to avoid applying an MCS to a
 		 * legacy rate and vice versa.
 		 */
 		if (ucastrate & 0x80)
 			flags &= ~IEEE80211_F_DOFRATE;
 		else
 			flags &= ~IEEE80211_F_DOFMCS;
 		/* NB: required to make MCS match below work */
 		ucastrate &= IEEE80211_RATE_VAL;
 	}
 	fixedrate = IEEE80211_FIXED_RATE_NONE;
 	/*
 	 * XXX we are called to process both MCS and legacy rates;
 	 * we must use the appropriate basic rate set or chaos will
 	 * ensue; for now callers that want MCS must supply
 	 * IEEE80211_F_DOBRS; at some point we'll need to split this
 	 * function so there are two variants, one for MCS and one
 	 * for legacy rates.
 	 */
 	if (flags & IEEE80211_F_DOBRS)
 		srs = (const struct ieee80211_rateset *)
 		    ieee80211_get_suphtrates(ic, ni->ni_chan);
 	else
 		srs = ieee80211_get_suprates(ic, ni->ni_chan);
 	/* NB: i advances only when the current entry is kept */
 	for (i = 0; i < nrs->rs_nrates; ) {
 		if (flags & IEEE80211_F_DOSORT) {
 			/*
 			 * Sort rates: move the smallest remaining rate
 			 * into slot i.
 			 */
 			for (j = i + 1; j < nrs->rs_nrates; j++) {
 				if (IEEE80211_RV(nrs->rs_rates[i]) >
 				    IEEE80211_RV(nrs->rs_rates[j])) {
 					r = nrs->rs_rates[i];
 					nrs->rs_rates[i] = nrs->rs_rates[j];
 					nrs->rs_rates[j] = r;
 				}
 			}
 		}
 		r = nrs->rs_rates[i] & IEEE80211_RATE_VAL;
 		badrate = r;
 		/*
 		 * Check for fixed rate.
 		 */
 		if (r == ucastrate)
 			fixedrate = r;
 		/*
 		 * Check against supported rates.
 		 */
 		rix = findrix(srs, r);
 		if (flags & IEEE80211_F_DONEGO) {
 			if (rix < 0) {
 				/*
 				 * A rate in the node's rate set is not
 				 * supported.  If this is a basic rate and we
 				 * are operating as a STA then this is an error.
 				 * Otherwise we just discard/ignore the rate.
 				 */
 				if ((flags & IEEE80211_F_JOIN) &&
 				    (nrs->rs_rates[i] & IEEE80211_RATE_BASIC))
 					error++;
 			} else if ((flags & IEEE80211_F_JOIN) == 0) {
 				/*
 				 * Overwrite with the supported rate
 				 * value so any basic rate bit is set.
 				 */
 				nrs->rs_rates[i] = srs->rs_rates[rix];
 			}
 		}
 		if ((flags & IEEE80211_F_DODEL) && rix < 0) {
 			/*
 			 * Delete unacceptable rates: shift the tail down
 			 * one slot and re-examine slot i.
 			 */
 			nrs->rs_nrates--;
 			for (j = i; j < nrs->rs_nrates; j++)
 				nrs->rs_rates[j] = nrs->rs_rates[j + 1];
 			nrs->rs_rates[j] = 0;
 			continue;
 		}
 		if (rix >= 0)
 			okrate = nrs->rs_rates[i];
 		i++;
 	}
 	if (okrate == 0 || error != 0 ||
 	    ((flags & (IEEE80211_F_DOFRATE|IEEE80211_F_DOFMCS)) &&
 	     fixedrate != ucastrate)) {
 		/* NB: arguments must follow the order of the conversions */
 		IEEE80211_NOTE(vap, IEEE80211_MSG_XRATE | IEEE80211_MSG_11N, ni,
 		    "%s: flags 0x%x okrate %d error %d fixedrate 0x%x "
 		    "ucastrate %x\n", __func__, flags, okrate, error,
 		    fixedrate, ucastrate);
 		return badrate | IEEE80211_RATE_BASIC;
 	} else
 		return IEEE80211_RV(okrate);
 }
 
 /*
  * Reset 11g-related state.
  *
  * This is for per-VAP ERP/11g state.
  *
  * Eventually everything in ieee80211_reset_erp() will be
  * per-VAP and in here.
  */
 void
 ieee80211_vap_reset_erp(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int shslot;
 
 	/* Zero the per-VAP longslotsta/nonerpsta fields and drop protection. */
 	vap->iv_longslotsta = 0;
 	vap->iv_nonerpsta = 0;
 	vap->iv_flags &= ~IEEE80211_F_USEPROT;
 
 	/*
 	 * Short preamble is used on 11a channels or when the vap has the
 	 * SHPREAMBLE capability; otherwise fall back to the long (barker)
 	 * preamble.  The two flags are always set opposite to each other.
 	 */
 	if (!IEEE80211_IS_CHAN_A(ic->ic_curchan) &&
 	    (vap->iv_caps & IEEE80211_C_SHPREAMBLE) == 0) {
 		vap->iv_flags |= IEEE80211_F_USEBARKER;
 		vap->iv_flags &= ~IEEE80211_F_SHPREAMBLE;
 	} else {
 		vap->iv_flags &= ~IEEE80211_F_USEBARKER;
 		vap->iv_flags |= IEEE80211_F_SHPREAMBLE;
 	}
 
 	/*
 	 * Short slot time: always on 11a and HT channels; on any 11g
 	 * channel only for a hostap vap whose driver advertises the
 	 * SHSLOT capability.
 	 */
 	shslot = IEEE80211_IS_CHAN_A(ic->ic_curchan) ||
 	    IEEE80211_IS_CHAN_HT(ic->ic_curchan) ||
 	    (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) &&
 	    vap->iv_opmode == IEEE80211_M_HOSTAP &&
 	    (ic->ic_caps & IEEE80211_C_SHSLOT));
 	ieee80211_vap_set_shortslottime(vap, shslot);
 }
 
 /*
  * Reset 11g-related state.
  *
  * Note this resets the global state and a caller should schedule
  * a re-check of all the VAPs after setup to update said state.
  */
 /*
  * NB: currently a no-op.  The former global flag handling below is
  * compiled out (#if 0); the per-VAP equivalent of that logic is in
  * ieee80211_vap_reset_erp().
  */
 void
 ieee80211_reset_erp(struct ieee80211com *ic)
 {
 #if 0
 	ic->ic_flags &= ~IEEE80211_F_USEPROT;
 	/*
 	 * Set short preamble and ERP barker-preamble flags.
 	 */
 	if (IEEE80211_IS_CHAN_A(ic->ic_curchan) ||
 	    (ic->ic_caps & IEEE80211_C_SHPREAMBLE)) {
 		ic->ic_flags |= IEEE80211_F_SHPREAMBLE;
 		ic->ic_flags &= ~IEEE80211_F_USEBARKER;
 	} else {
 		ic->ic_flags &= ~IEEE80211_F_SHPREAMBLE;
 		ic->ic_flags |= IEEE80211_F_USEBARKER;
 	}
 #endif
 	/* XXX TODO: schedule a new per-VAP ERP calculation */
 }
 
 /*
  * Default iv_update_bss method: make ni the vap's bss node and hand the
  * previous bss node back to the caller.
  */
 static struct ieee80211_node *
 vap_update_bss(struct ieee80211vap *vap, struct ieee80211_node *ni)
 {
 	struct ieee80211_node *prev = vap->iv_bss;
 
 	vap->iv_bss = ni;
 	return (prev);
 }
 
 /*
  * Deferred slot time update.
  *
  * For per-VAP slot time configuration, call the VAP
  * method if the VAP requires it.  Otherwise, just call the
  * older global method.
  *
  * If the per-VAP method is called then it's expected that
  * the driver/firmware will take care of turning the per-VAP
  * flags into slot time configuration.
  *
  * If the per-VAP method is not called then the global flags will be
  * flipped into sync with the VAPs; ic_flags IEEE80211_F_SHSLOT will
  * be set only if all of the vaps will have it set.
  *
  * Look at the comments for vap_update_erp_protmode() for more
  * background; this assumes all VAPs are on the same channel.
  */
 static void
 vap_update_slot(void *arg, int npending)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211vap *cur;
 	int nshort, nlong;
 
 	/*
 	 * A vap with its own slot method already has its flags up to
 	 * date; tell the driver and skip the global bookkeeping.
 	 */
 	if (vap->iv_updateslot != NULL) {
 		vap->iv_updateslot(vap);
 		return;
 	}
 
 	/*
 	 * Otherwise derive the global flag from all vaps: count how
 	 * many have short slot on and how many have it off.
 	 */
 	nshort = nlong = 0;
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(cur, &ic->ic_vaps, iv_next) {
 		if ((cur->iv_flags & IEEE80211_F_SHSLOT) != 0)
 			nshort++;
 		else
 			nlong++;
 	}
 
 	/*
 	 * No short slot vaps at all: global short slot goes off.
 	 * Every vap short slot: global short slot goes on.
 	 * A mix of both leaves the global flag as it was.
 	 */
 	if (nshort == 0)
 		ic->ic_flags &= ~IEEE80211_F_SHSLOT;
 	else if (nlong == 0)
 		ic->ic_flags |= IEEE80211_F_SHSLOT;
 	IEEE80211_UNLOCK(ic);
 
 	/* Push the resulting global slot time flags to the driver. */
 	if (ic->ic_updateslot != NULL)
 		ic->ic_updateslot(ic);
 }
 
 /*
  * Deferred ERP protmode update.
  *
  * This currently calculates the global ERP protection mode flag
  * based on each of the VAPs.  Any VAP with it enabled is enough
  * for the global flag to be enabled.  All VAPs with it disabled
  * is enough for it to be disabled.
  *
  * This may make sense right now for the supported hardware where
  * net80211 is controlling the single channel configuration, but
  * offload firmware that's doing channel changes (eg off-channel
  * TDLS, off-channel STA, off-channel P2P STA/AP) may get some
  * silly looking flag updates.
  *
  * Ideally the protection mode calculation is done based on the
  * channel, and all VAPs using that channel will inherit it.
  * But until that's what net80211 does, this will have to do.
  */
 static void
 vap_update_erp_protmode(void *arg, int npending)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211vap *cur;
 	int use_prot, nonerp_seen;
 
 	use_prot = nonerp_seen = 0;
 
 	/*
 	 * Fold the per-VAP state into the global flags: each global
 	 * flag is set if at least one vap has it set.
 	 *
 	 * For now we assume that if a driver can handle this per-VAP
 	 * then it'll ignore the ic->ic_protmode variant and instead
 	 * will look at the vap related flags.
 	 */
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(cur, &ic->ic_vaps, iv_next) {
 		if ((cur->iv_flags & IEEE80211_F_USEPROT) != 0)
 			use_prot = 1;
 		if ((cur->iv_flags_ext & IEEE80211_FEXT_NONERP_PR) != 0)
 			nonerp_seen = 1;
 	}
 
 	if (!use_prot)
 		ic->ic_flags &= ~IEEE80211_F_USEPROT;
 	else
 		ic->ic_flags |= IEEE80211_F_USEPROT;
 
 	if (!nonerp_seen)
 		ic->ic_flags_ext &= ~IEEE80211_FEXT_NONERP_PR;
 	else
 		ic->ic_flags_ext |= IEEE80211_FEXT_NONERP_PR;
 
 	/* Beacon update on all VAPs, done while still holding the lock. */
 	ieee80211_notify_erp_locked(ic);
 
 	IEEE80211_UNLOCK(ic);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	    "%s: called; enable_protmode=%d, non_erp_present=%d\n",
 	    __func__, use_prot, nonerp_seen);
 
 	/*
 	 * With the global flags settled, let the driver know through
 	 * the (optional) per-VAP hook.
 	 *
 	 * The global flags will be used when assembling ERP IEs
 	 * for multi-VAP operation, even if it's on a different
 	 * channel.  Yes, that's going to need fixing in the
 	 * future.
 	 */
 	if (vap->iv_erp_protmode_update != NULL)
 		vap->iv_erp_protmode_update(vap);
 }
 
 /*
  * Deferred ERP short preamble/barker update.
  *
  * All VAPs need to use short preamble for it to be globally
  * enabled or not.
  *
  * Look at the comments for vap_update_erp_protmode() for more
  * background; this assumes all VAPs are on the same channel.
  */
 /*
  * Task handler (arg is the vap, npending is unused): recompute the
  * global short preamble / barker flags from all vaps, trigger a beacon
  * update, and then invoke the vap's optional iv_preamble_update hook.
  */
 static void
 vap_update_preamble(void *arg, int npending)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211vap *iv;
 	int barker_count = 0, short_preamble_count = 0, count = 0;
 
 	/*
 	 * Iterate over all of the VAPs to calculate the overlapping
 	 * short or long preamble configuration.
 	 *
 	 * For now we assume that if a driver can handle this per-VAP
 	 * then it'll ignore the ic->ic_flags variant and instead
 	 * will look at the vap related flags.
 	 */
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(iv, &ic->ic_vaps, iv_next) {
 		if (iv->iv_flags & IEEE80211_F_USEBARKER)
 			barker_count++;
 		if (iv->iv_flags & IEEE80211_F_SHPREAMBLE)
 			short_preamble_count++;
 		count++;
 	}
 
 	/*
 	 * As with vap_update_erp_protmode(), the global flags are
 	 * currently used for beacon IEs.
 	 */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	    "%s: called; barker_count=%d, short_preamble_count=%d\n",
 	    __func__, barker_count, short_preamble_count);
 
 	/*
 	 * Only flip on short preamble if all of the VAPs support
 	 * it.
 	 */
 	if (barker_count == 0 && short_preamble_count == count) {
 		ic->ic_flags |= IEEE80211_F_SHPREAMBLE;
 		ic->ic_flags &= ~IEEE80211_F_USEBARKER;
 	} else {
 		ic->ic_flags &= ~IEEE80211_F_SHPREAMBLE;
 		ic->ic_flags |= IEEE80211_F_USEBARKER;
 	}
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	  "%s: global barker=%d preamble=%d\n",
 	  __func__,
 	  !! (ic->ic_flags & IEEE80211_F_USEBARKER),
 	  !! (ic->ic_flags & IEEE80211_F_SHPREAMBLE));
 
 	/* Beacon update on all VAPs */
 	ieee80211_notify_erp_locked(ic);
 
 	IEEE80211_UNLOCK(ic);
 
 	/*
 	 * Driver notification.  NB: test the hook that is actually
 	 * called, not the unrelated ERP protmode hook.
 	 */
 	if (vap->iv_preamble_update != NULL)
 		vap->iv_preamble_update(vap);
 }
 
 /*
  * Deferred HT protmode update and beacon update.
  *
  * Look at the comments for vap_update_erp_protmode() for more
  * background; this assumes all VAPs are on the same channel.
  */
 /*
  * Task handler (arg is the vap, npending is unused): recompute the
  * global HT protection state (ic_flags_ht NONHT_PR and ic_curhtprotmode)
  * from all vaps, notify every vap so beacons can be updated, and then
  * invoke the vap's optional iv_ht_protmode_update hook.
  */
 static void
 vap_update_ht_protmode(void *arg, int npending)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211vap *iv;
 	struct ieee80211com *ic = vap->iv_ic;
 	int num_vaps = 0, num_pure = 0;
 	int num_optional = 0, num_ht2040 = 0, num_nonht = 0;
 	int num_ht_sta = 0, num_ht40_sta = 0, num_sta = 0;
 	int num_nonhtpr = 0;
 
 	/*
 	 * Iterate over all of the VAPs to calculate everything.
 	 *
 	 * There are a few different flags to calculate:
 	 *
 	 * + whether there's HT only or HT+legacy stations;
 	 * + whether there's HT20, HT40, or HT20+HT40 stations;
 	 * + whether the desired protection mode is mixed, pure or
 	 *   one of the two above.
 	 *
 	 * For now we assume that if a driver can handle this per-VAP
 	 * then it'll ignore the ic->ic_htprotmode / ic->ic_curhtprotmode
 	 * variant and instead will look at the vap related variables.
 	 *
 	 * XXX TODO: non-greenfield STAs present (IEEE80211_HTINFO_NONGF_PRESENT) !
 	 */
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(iv, &ic->ic_vaps, iv_next) {
 		num_vaps++;
 		/* overlapping BSSes advertising non-HT status present */
 		if (iv->iv_flags_ht & IEEE80211_FHT_NONHT_PR)
 			num_nonht++;
 		/* Operating mode flags */
 		if (iv->iv_curhtprotmode & IEEE80211_HTINFO_NONHT_PRESENT)
 			num_nonhtpr++;
 		switch (iv->iv_curhtprotmode & IEEE80211_HTINFO_OPMODE) {
 		case IEEE80211_HTINFO_OPMODE_PURE:
 			num_pure++;
 			break;
 		case IEEE80211_HTINFO_OPMODE_PROTOPT:
 			num_optional++;
 			break;
 		case IEEE80211_HTINFO_OPMODE_HT20PR:
 			num_ht2040++;
 			break;
 		}
 
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_11N,
 		    "%s: vap %s: nonht_pr=%d, curhtprotmode=0x%02x\n",
 		    __func__,
 		    ieee80211_get_vap_ifname(iv),
 		    !! (iv->iv_flags_ht & IEEE80211_FHT_NONHT_PR),
 		    iv->iv_curhtprotmode);
 
 		num_ht_sta += iv->iv_ht_sta_assoc;
 		num_ht40_sta += iv->iv_ht40_sta_assoc;
 		num_sta += iv->iv_sta_assoc;
 	}
 
 	/*
 	 * Step 1 - if any VAPs indicate NONHT_PR set (overlapping BSS
 	 * non-HT present), set it here.  This shouldn't be used by
 	 * anything but the old overlapping BSS logic so if any drivers
 	 * consume it, it's up to date.
 	 */
 	if (num_nonht > 0)
 		ic->ic_flags_ht |= IEEE80211_FHT_NONHT_PR;
 	else
 		ic->ic_flags_ht &= ~IEEE80211_FHT_NONHT_PR;
 
 	/*
 	 * Step 2 - default HT protection mode to MIXED (802.11-2016 10.26.3.1.)
 	 *
 	 * + If all VAPs are PURE, we can stay PURE.
 	 * + If all VAPs are PROTOPT, we can go to PROTOPT.
 	 * + If any VAP has HT20PR then it sees at least a HT40+HT20 station.
 	 *   Note that we may have a VAP with one HT20 and a VAP with one HT40;
 	 *   So we look at the sum ht and sum ht40 sta counts; if we have a
 	 *   HT station and the HT20 != HT40 count, we have to do HT20PR here.
 	 *   Note all stations need to be HT for this to be an option.
 	 * + The fall-through is MIXED, because it means we have some odd
 	 *   non HT40-involved combination of opmode and this is the most
 	 *   sensible default.
 	 */
 	ic->ic_curhtprotmode = IEEE80211_HTINFO_OPMODE_MIXED;
 
 	if (num_pure == num_vaps)
 		ic->ic_curhtprotmode = IEEE80211_HTINFO_OPMODE_PURE;
 
 	if (num_optional == num_vaps)
 		ic->ic_curhtprotmode = IEEE80211_HTINFO_OPMODE_PROTOPT;
 
 	/*
 	 * Note: we need /a/ HT40 station somewhere for this to
 	 * be a possibility.
 	 */
 	if ((num_ht2040 > 0) ||
 	    ((num_ht_sta > 0) && (num_ht40_sta > 0) &&
 	     (num_ht_sta != num_ht40_sta)))
 		ic->ic_curhtprotmode = IEEE80211_HTINFO_OPMODE_HT20PR;
 
 	/*
 	 * Step 3 - if any of the stations across the VAPs are
 	 * non-HT then this needs to be flipped back to MIXED.
 	 */
 	if (num_ht_sta != num_sta)
 		ic->ic_curhtprotmode = IEEE80211_HTINFO_OPMODE_MIXED;
 
 	/*
 	 * Step 4 - If we see any overlapping BSS non-HT stations
 	 * via beacons then flip on NONHT_PRESENT.
 	 */
 	if (num_nonhtpr > 0)
 		ic->ic_curhtprotmode |= IEEE80211_HTINFO_NONHT_PRESENT;
 
 	/* Notify all VAPs to potentially update their beacons */
 	TAILQ_FOREACH(iv, &ic->ic_vaps, iv_next)
 		ieee80211_htinfo_notify(iv);
 
 	IEEE80211_UNLOCK(ic);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_11N,
 	  "%s: global: nonht_pr=%d ht_opmode=0x%02x\n",
 	  __func__,
 	  !! (ic->ic_flags_ht & IEEE80211_FHT_NONHT_PR),
 	  ic->ic_curhtprotmode);
 
 	/*
 	 * Driver update.  NB: test the hook that is actually called,
 	 * not the unrelated ERP protmode hook.
 	 */
 	if (vap->iv_ht_protmode_update != NULL)
 		vap->iv_ht_protmode_update(vap);
 }
 
 /*
  * Set the short slot time state and notify the driver.
  *
  * This is the per-VAP slot time state.
  */
 void
 ieee80211_vap_set_shortslottime(struct ieee80211vap *vap, int onoff)
 {
 	/* XXX lock? */
 
 	/* Only the per-VAP flag is touched here. */
 	if (!onoff)
 		vap->iv_flags &= ~IEEE80211_F_SHSLOT;
 	else
 		vap->iv_flags |= IEEE80211_F_SHSLOT;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	    "%s: called; onoff=%d\n", __func__, onoff);
 	/* the deferred slot task takes care of the rest */
 	ieee80211_runtask(vap->iv_ic, &vap->iv_slot_task);
 }
 
 /*
  * Update the VAP short / long / barker preamble state and
  * update beacon state if needed.
  *
  * For now it simply copies the global flags into the per-vap
  * flags and schedules the callback.  Later this will support
  * both global and per-VAP flags, which is especially useful for
  * STA+STA multi-channel operation (eg p2p).
  */
 void
 ieee80211_vap_update_preamble(struct ieee80211vap *vap)
 {
 	/* XXX lock? */
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	    "%s: called\n", __func__);
 	/* hand the recalculation off to the deferred preamble task */
 	ieee80211_runtask(vap->iv_ic, &vap->iv_preamble_task);
 }
 
 /*
  * Update the VAP 11g protection mode and update beacon state
  * if needed.
  */
 void
 ieee80211_vap_update_erp_protmode(struct ieee80211vap *vap)
 {
 	/* XXX lock? */
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	    "%s: called\n", __func__);
 	/* hand the recalculation off to the deferred ERP protmode task */
 	ieee80211_runtask(vap->iv_ic, &vap->iv_erp_protmode_task);
 }
 
 /*
  * Update the VAP 11n protection mode and update beacon state
  * if needed.
  */
 void
 ieee80211_vap_update_ht_protmode(struct ieee80211vap *vap)
 {
 	/* XXX lock? */
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_DEBUG,
 	    "%s: called\n", __func__);
 	/* hand the recalculation off to the deferred HT protmode task */
 	ieee80211_runtask(vap->iv_ic, &vap->iv_ht_protmode_task);
 }
 
 /*
  * Check if the specified rate set supports ERP.
  * NB: the rate set is assumed to be sorted.
  */
 int
 ieee80211_iserp_rateset(const struct ieee80211_rateset *rs)
 {
 	/* every one of these rate values must appear in rs */
 	static const int rates[] = { 2, 4, 11, 22, 12, 24, 48 };
 	int i, j;
 
 	/* too few rates to possibly hold all the required ones */
 	if (rs->rs_nrates < nitems(rates))
 		return 0;
 	for (i = 0; i < nitems(rates); i++) {
 		int found = 0;
 
 		for (j = 0; j < rs->rs_nrates && !found; j++) {
 			int r = rs->rs_rates[j] & IEEE80211_RATE_VAL;
 
 			if (r == rates[i])
 				found = 1;
 			else if (r > rates[i])
 				return 0;	/* sorted: already past it */
 		}
 		if (!found)
 			return 0;
 	}
 	return 1;
 }
 
 /*
  * Mark the basic rates for the rate table based on the
  * operating mode.  For real 11g we mark all the 11b rates
  * and 6, 12, and 24 OFDM.  For 11b compatibility we mark only
  * 11b rates.  There's also a pseudo 11a-mode used to mark only
  * the basic OFDM rates.
  */
 static void
 setbasicrates(struct ieee80211_rateset *rs,
     enum ieee80211_phymode mode, int add)
 {
 	static const struct ieee80211_rateset basic[IEEE80211_MODE_MAX] = {
 	    [IEEE80211_MODE_11A]	= { 3, { 12, 24, 48 } },
 	    [IEEE80211_MODE_11B]	= { 2, { 2, 4 } },
 					    /* NB: mixed b/g */
 	    [IEEE80211_MODE_11G]	= { 4, { 2, 4, 11, 22 } },
 	    [IEEE80211_MODE_TURBO_A]	= { 3, { 12, 24, 48 } },
 	    [IEEE80211_MODE_TURBO_G]	= { 4, { 2, 4, 11, 22 } },
 	    [IEEE80211_MODE_STURBO_A]	= { 3, { 12, 24, 48 } },
 	    [IEEE80211_MODE_HALF]	= { 3, { 6, 12, 24 } },
 	    [IEEE80211_MODE_QUARTER]	= { 3, { 3, 6, 12 } },
 	    [IEEE80211_MODE_11NA]	= { 3, { 12, 24, 48 } },
 					    /* NB: mixed b/g */
 	    [IEEE80211_MODE_11NG]	= { 4, { 2, 4, 11, 22 } },
 					    /* NB: mixed b/g */
 	    [IEEE80211_MODE_VHT_2GHZ]	= { 4, { 2, 4, 11, 22 } },
 	    [IEEE80211_MODE_VHT_5GHZ]	= { 3, { 12, 24, 48 } },
 	};
 	int i, j;
 
 	for (i = 0; i < rs->rs_nrates; i++) {
 		if (!add)
 			rs->rs_rates[i] &= IEEE80211_RATE_VAL;
 		for (j = 0; j < basic[mode].rs_nrates; j++)
 			if (basic[mode].rs_rates[j] == rs->rs_rates[i]) {
 				rs->rs_rates[i] |= IEEE80211_RATE_BASIC;
 				break;
 			}
 	}
 }
 
 /*
  * Set the basic rates in a rate set.
  */
 void
 ieee80211_setbasicrates(struct ieee80211_rateset *rs,
     enum ieee80211_phymode mode)
 {
 	/* add == 0: existing flag bits are stripped before marking */
 	setbasicrates(rs, mode, 0);
 }
 
 /*
  * Add basic rates to a rate set.
  */
 void
 ieee80211_addbasicrates(struct ieee80211_rateset *rs,
     enum ieee80211_phymode mode)
 {
 	/* add == 1: entries are left as they are before marking */
 	setbasicrates(rs, mode, 1);
 }
 
 /*
  * WME protocol support.
  *
  * The default 11a/b/g/n parameters come from the WiFi Alliance WMM
  * System Interoperability Test Plan (v1.4, Appendix F) and the 802.11n
  * Draft 2.0 Test Plan (Appendix D).
  *
  * Static/Dynamic Turbo mode settings come from Atheros.
  */
 /*
  * One row of the per-phy-mode parameter tables below.  The tables use
  * positional initializers, so the field order here is the column order
  * there.
  */
 typedef struct phyParamType {
 	uint8_t		aifsn;		/* copied to wmep_aifsn */
 	uint8_t		logcwmin;	/* copied to wmep_logcwmin */
 	uint8_t		logcwmax;	/* copied to wmep_logcwmax */
 	uint16_t	txopLimit;	/* copied to wmep_txopLimit */
 	uint8_t 	acm;		/* copied to wmep_acm */
 } paramType;
 
 /* AC_BE defaults by phy mode: { aifsn, logcwmin, logcwmax, txopLimit, acm } */
 static const struct phyParamType phyParamForAC_BE[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_11A]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_11B]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_11G]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_FH]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 2, 3,  5,  0, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 2, 3,  5,  0, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 2, 3,  5,  0, 0 },
 	[IEEE80211_MODE_HALF]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_QUARTER]= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_11NA]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_11NG]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_VHT_2GHZ]	= { 3, 4,  6,  0, 0 },
 	[IEEE80211_MODE_VHT_5GHZ]	= { 3, 4,  6,  0, 0 },
 };
 /* AC_BK defaults by phy mode: { aifsn, logcwmin, logcwmax, txopLimit, acm } */
 static const struct phyParamType phyParamForAC_BK[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11A]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11B]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11G]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_FH]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 7, 3, 10,  0, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 7, 3, 10,  0, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 7, 3, 10,  0, 0 },
 	[IEEE80211_MODE_HALF]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_QUARTER]= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11NA]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11NG]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_VHT_2GHZ]	= { 7, 4, 10,  0, 0 },
 	[IEEE80211_MODE_VHT_5GHZ]	= { 7, 4, 10,  0, 0 },
 };
 /* AC_VI defaults by phy mode: { aifsn, logcwmin, logcwmax, txopLimit, acm } */
 static const struct phyParamType phyParamForAC_VI[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11A]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11B]	= { 1, 3, 4, 188, 0 },
 	[IEEE80211_MODE_11G]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_FH]	= { 1, 3, 4, 188, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 1, 2, 3,  94, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 1, 2, 3,  94, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 1, 2, 3,  94, 0 },
 	[IEEE80211_MODE_HALF]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_QUARTER]= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11NA]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11NG]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_VHT_2GHZ]	= { 1, 3, 4,  94, 0 },
 	[IEEE80211_MODE_VHT_5GHZ]	= { 1, 3, 4,  94, 0 },
 };
 /* AC_VO defaults by phy mode: { aifsn, logcwmin, logcwmax, txopLimit, acm } */
 static const struct phyParamType phyParamForAC_VO[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11A]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11B]	= { 1, 2, 3, 102, 0 },
 	[IEEE80211_MODE_11G]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_FH]	= { 1, 2, 3, 102, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 1, 2, 2,  47, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 1, 2, 2,  47, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 1, 2, 2,  47, 0 },
 	[IEEE80211_MODE_HALF]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_QUARTER]= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11NA]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11NG]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_VHT_2GHZ]	= { 1, 2, 3,  47, 0 },
 	[IEEE80211_MODE_VHT_5GHZ]	= { 1, 2, 3,  47, 0 },
 };
 
 static const struct phyParamType bssPhyParamForAC_BE[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11A]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11B]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11G]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_FH]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 2, 3, 10,  0, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 2, 3, 10,  0, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 2, 3, 10,  0, 0 },
 	[IEEE80211_MODE_HALF]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_QUARTER]= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11NA]	= { 3, 4, 10,  0, 0 },
 	[IEEE80211_MODE_11NG]	= { 3, 4, 10,  0, 0 },
 };
 static const struct phyParamType bssPhyParamForAC_VI[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 2, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11A]	= { 2, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11B]	= { 2, 3, 4, 188, 0 },
 	[IEEE80211_MODE_11G]	= { 2, 3, 4,  94, 0 },
 	[IEEE80211_MODE_FH]	= { 2, 3, 4, 188, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 2, 2, 3,  94, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 2, 2, 3,  94, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 2, 2, 3,  94, 0 },
 	[IEEE80211_MODE_HALF]	= { 2, 3, 4,  94, 0 },
 	[IEEE80211_MODE_QUARTER]= { 2, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11NA]	= { 2, 3, 4,  94, 0 },
 	[IEEE80211_MODE_11NG]	= { 2, 3, 4,  94, 0 },
 };
 static const struct phyParamType bssPhyParamForAC_VO[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_AUTO]	= { 2, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11A]	= { 2, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11B]	= { 2, 2, 3, 102, 0 },
 	[IEEE80211_MODE_11G]	= { 2, 2, 3,  47, 0 },
 	[IEEE80211_MODE_FH]	= { 2, 2, 3, 102, 0 },
 	[IEEE80211_MODE_TURBO_A]= { 1, 2, 2,  47, 0 },
 	[IEEE80211_MODE_TURBO_G]= { 1, 2, 2,  47, 0 },
 	[IEEE80211_MODE_STURBO_A]={ 1, 2, 2,  47, 0 },
 	[IEEE80211_MODE_HALF]	= { 2, 2, 3,  47, 0 },
 	[IEEE80211_MODE_QUARTER]= { 2, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11NA]	= { 2, 2, 3,  47, 0 },
 	[IEEE80211_MODE_11NG]	= { 2, 2, 3,  47, 0 },
 };
 
 /*
  * Copy the aifsn, logcwmin, logcwmax and txopLimit values from phy into
  * wmep.  The acm setting is not touched.
  */
 static void
 _setifsparams(struct wmeParams *wmep, const paramType *phy)
 {
 	wmep->wmep_txopLimit = phy->txopLimit;
 	wmep->wmep_logcwmax = phy->logcwmax;
 	wmep->wmep_logcwmin = phy->logcwmin;
 	wmep->wmep_aifsn = phy->aifsn;
 }
 
 /*
  * Fill one WME parameter record (ACM plus the IFS/contention fields)
  * from a PHY parameter entry and emit a WME debug message naming the
  * access category `ac' and the caller-supplied `type' tag.
  */
 static void
 setwmeparams(struct ieee80211vap *vap, const char *type, int ac,
 	struct wmeParams *wmep, const paramType *phy)
 {
 	_setifsparams(wmep, phy);
 	wmep->wmep_acm = phy->acm;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 	    "set %s (%s) [acm %u aifsn %u logcwmin %u logcwmax %u txop %u]\n",
 	    ieee80211_wme_acnames[ac], type,
 	    wmep->wmep_acm, wmep->wmep_aifsn, wmep->wmep_logcwmin,
 	    wmep->wmep_logcwmax, wmep->wmep_txopLimit);
 }
 
 /*
  * Load the default WME parameters for every access category into the
  * com-wide WME state, keyed by the PHY mode of the BSS channel, and
  * (once a bss node exists) recompute the active parameters.
  * The com lock must be held by the caller.
  */
 static void
 ieee80211_wme_initparams_locked(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_wme_state *wme = &ic->ic_wme;
 	const paramType *pPhyParam, *pBssPhyParam;
 	enum ieee80211_phymode mode;
 	int ac;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/* Skip when WME is unsupported or more than one vap is running. */
 	if ((ic->ic_caps & IEEE80211_C_WME) == 0)
 		return;
 	if (ic->ic_nrunning > 1)
 		return;
 
 	/*
 	 * Reset the wme cap_info field so a qoscount left over from a
 	 * previous vap doesn't confuse later code which only parses the
 	 * beacon field and updates hardware when said field changes.
 	 * Otherwise the hardware is programmed with defaults, not what
 	 * the beacon actually announces.
 	 *
 	 * The sentinel stored here (0xfe) can never be an actual value;
 	 * the only valid values are 0..15.
 	 */
 	wme->wme_wmeChanParams.cap_info = 0xfe;
 
 	/*
 	 * Pick the PHY mode.  We may be called early, before bsschan is
 	 * set, in which case auto mode is used.  We are called again when
 	 * entering the RUN state with bsschan set up properly, so the
 	 * state eventually becomes correct.
 	 */
 	mode = (ic->ic_bsschan != IEEE80211_CHAN_ANYC) ?
 	    ieee80211_chan2mode(ic->ic_bsschan) : IEEE80211_MODE_AUTO;
 
 	for (ac = 0; ac < WME_NUM_AC; ac++) {
 		if (ac == WME_AC_BK) {
 			/* NB: BK uses the same table for both roles */
 			pPhyParam = &phyParamForAC_BK[mode];
 			pBssPhyParam = &phyParamForAC_BK[mode];
 		} else if (ac == WME_AC_VI) {
 			pPhyParam = &phyParamForAC_VI[mode];
 			pBssPhyParam = &bssPhyParamForAC_VI[mode];
 		} else if (ac == WME_AC_VO) {
 			pPhyParam = &phyParamForAC_VO[mode];
 			pBssPhyParam = &bssPhyParamForAC_VO[mode];
 		} else {
 			/* WME_AC_BE and anything unexpected */
 			pPhyParam = &phyParamForAC_BE[mode];
 			pBssPhyParam = &bssPhyParamForAC_BE[mode];
 		}
 
 		/* Our own channel parameters depend on the operating mode. */
 		setwmeparams(vap, "chan", ac,
 		    &wme->wme_wmeChanParams.cap_wmeParams[ac],
 		    ic->ic_opmode == IEEE80211_M_HOSTAP ?
 			pPhyParam : pBssPhyParam);
 		/* The BSS parameters always come from the bss table. */
 		setwmeparams(vap, "bss ", ac,
 		    &wme->wme_wmeBssChanParams.cap_wmeParams[ac],
 		    pBssPhyParam);
 	}
 
 	/* NB: iv_bss may still be NULL on initial attach */
 	if (vap->iv_bss == NULL)
 		return;
 
 	/*
 	 * Derive the aggressive mode switching threshold from the
 	 * beacon interval.  This doesn't need locking since we're
 	 * only called before entering the RUN state at which point
 	 * we start sending beacon frames.
 	 */
 	wme->wme_hipri_switch_thresh =
 		(HIGH_PRI_SWITCH_THRESH * vap->iv_bss->ni_intval) / 100;
 	wme->wme_flags &= ~WME_F_AGGRMODE;
 	ieee80211_wme_updateparams(vap);
 }
 
 /*
  * Locking wrapper: take the com lock around
  * ieee80211_wme_initparams_locked().
  */
 void
 ieee80211_wme_initparams(struct ieee80211vap *vap)
 {
 	struct ieee80211com *const com = vap->iv_ic;
 
 	IEEE80211_LOCK(com);
 	ieee80211_wme_initparams_locked(vap);
 	IEEE80211_UNLOCK(com);
 }
 
 /*
  * Recompute the active WME parameters for ourself and the BSS from
  * the configured ones, apply the aggressive-mode overrides to the
  * best-effort queue and schedule the deferred WME update task.
  */
 void
 ieee80211_wme_updateparams_locked(struct ieee80211vap *vap)
 {
 	static const paramType aggrParam[IEEE80211_MODE_MAX] = {
 	    [IEEE80211_MODE_AUTO]	= { 2, 4, 10, 64, 0 },
 	    [IEEE80211_MODE_11A]	= { 2, 4, 10, 64, 0 },
 	    [IEEE80211_MODE_11B]	= { 2, 5, 10, 64, 0 },
 	    [IEEE80211_MODE_11G]	= { 2, 4, 10, 64, 0 },
 	    [IEEE80211_MODE_FH]		= { 2, 5, 10, 64, 0 },
 	    [IEEE80211_MODE_TURBO_A]	= { 1, 3, 10, 64, 0 },
 	    [IEEE80211_MODE_TURBO_G]	= { 1, 3, 10, 64, 0 },
 	    [IEEE80211_MODE_STURBO_A]	= { 1, 3, 10, 64, 0 },
 	    [IEEE80211_MODE_HALF]	= { 2, 4, 10, 64, 0 },
 	    [IEEE80211_MODE_QUARTER]	= { 2, 4, 10, 64, 0 },
 	    [IEEE80211_MODE_11NA]	= { 2, 4, 10, 64, 0 },	/* XXXcheck*/
 	    [IEEE80211_MODE_11NG]	= { 2, 4, 10, 64, 0 },	/* XXXcheck*/
 	    [IEEE80211_MODE_VHT_2GHZ]	= { 2, 4, 10, 64, 0 },	/* XXXcheck*/
 	    [IEEE80211_MODE_VHT_5GHZ]	= { 2, 4, 10, 64, 0 },	/* XXXcheck*/
 	};
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_wme_state *wme = &ic->ic_wme;
 	struct wmeParams *be_chan, *be_bss;
 	enum ieee80211_phymode mode;
 	int ac;
 	int do_aggrmode;
 
 	/*
 	 * Seed the active channel access parameters (device and BSS)
 	 * with the configured settings.
 	 */
 	for (ac = 0; ac < WME_NUM_AC; ac++) {
 		const struct wmeParams *src;
 		struct wmeParams *dst;
 
 		src = &wme->wme_wmeChanParams.cap_wmeParams[ac];
 		dst = &wme->wme_chanParams.cap_wmeParams[ac];
 		dst->wmep_aifsn = src->wmep_aifsn;
 		dst->wmep_logcwmin = src->wmep_logcwmin;
 		dst->wmep_logcwmax = src->wmep_logcwmax;
 		dst->wmep_txopLimit = src->wmep_txopLimit;
 
 		src = &wme->wme_wmeBssChanParams.cap_wmeParams[ac];
 		dst = &wme->wme_bssChanParams.cap_wmeParams[ac];
 		dst->wmep_aifsn = src->wmep_aifsn;
 		dst->wmep_logcwmin = src->wmep_logcwmin;
 		dst->wmep_logcwmax = src->wmep_logcwmax;
 		dst->wmep_txopLimit = src->wmep_txopLimit;
 	}
 
 	/*
 	 * Pick the PHY mode.  We may be called early, before bsschan is
 	 * set, in which case auto mode is used.  We are called again when
 	 * entering the RUN state with bsschan set up properly, so the
 	 * state eventually becomes correct.
 	 */
 	mode = (ic->ic_bsschan != IEEE80211_CHAN_ANYC) ?
 	    ieee80211_chan2mode(ic->ic_bsschan) : IEEE80211_MODE_AUTO;
 
 	/*
 	 * This implements aggressive mode as found in certain
 	 * vendors' AP's.  When there is significant high
 	 * priority (VI/VO) traffic in the BSS throttle back BE
 	 * traffic by using conservative parameters.  Otherwise
 	 * BE uses aggressive params to optimize performance of
 	 * legacy/non-QoS traffic.
 	 */
 	do_aggrmode = 0;
 	switch (vap->iv_opmode) {
 	case IEEE80211_M_HOSTAP:
 		/* Hostap: only if aggressive mode is enabled. */
 		if ((wme->wme_flags & WME_F_AGGRMODE) != 0)
 			do_aggrmode = 1;
 		break;
 	case IEEE80211_M_STA:
 		/* Station: only if we're in a non-QoS BSS. */
 		if ((vap->iv_bss->ni_flags & IEEE80211_NODE_QOS) == 0)
 			do_aggrmode = 1;
 		break;
 	case IEEE80211_M_IBSS:
 		/* IBSS: only if we have WME enabled. */
 		if (vap->iv_flags & IEEE80211_F_WME)
 			do_aggrmode = 1;
 		break;
 	default:
 		/* XXX WDS? */
 		/* XXX MBSS? */
 		break;
 	}
 
 	/*
 	 * With WME disabled on this VAP aggressive mode is used
 	 * regardless of the configuration.
 	 */
 	if ((vap->iv_flags & IEEE80211_F_WME) == 0)
 		do_aggrmode = 1;
 
 	/* Both overrides below only ever touch the best-effort queue. */
 	be_chan = &wme->wme_chanParams.cap_wmeParams[WME_AC_BE];
 	be_bss = &wme->wme_bssChanParams.cap_wmeParams[WME_AC_BE];
 
 	if (do_aggrmode) {
 		be_chan->wmep_aifsn = be_bss->wmep_aifsn =
 		    aggrParam[mode].aifsn;
 		be_chan->wmep_logcwmin = be_bss->wmep_logcwmin =
 		    aggrParam[mode].logcwmin;
 		be_chan->wmep_logcwmax = be_bss->wmep_logcwmax =
 		    aggrParam[mode].logcwmax;
 		/* NB: a TXOP limit is only granted when bursting is on */
 		be_chan->wmep_txopLimit = be_bss->wmep_txopLimit =
 		    (vap->iv_flags & IEEE80211_F_BURST) ?
 			aggrParam[mode].txopLimit : 0;
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 		    "update %s (chan+bss) [acm %u aifsn %u logcwmin %u "
 		    "logcwmax %u txop %u]\n", ieee80211_wme_acnames[WME_AC_BE],
 		    be_chan->wmep_acm, be_chan->wmep_aifsn,
 		    be_chan->wmep_logcwmin, be_chan->wmep_logcwmax,
 		    be_chan->wmep_txopLimit);
 	}
 
 	/*
 	 * Adjust the contention window according to the number of
 	 * associated stations: an AP with fewer than two associated
 	 * stations and aggressive mode enabled lowers the contention
 	 * window even further.
 	 */
 	if (vap->iv_opmode == IEEE80211_M_HOSTAP &&
 	    vap->iv_sta_assoc < 2 && (wme->wme_flags & WME_F_AGGRMODE) != 0) {
 		static const uint8_t logCwMin[IEEE80211_MODE_MAX] = {
 		    [IEEE80211_MODE_AUTO]	= 3,
 		    [IEEE80211_MODE_11A]	= 3,
 		    [IEEE80211_MODE_11B]	= 4,
 		    [IEEE80211_MODE_11G]	= 3,
 		    [IEEE80211_MODE_FH]		= 4,
 		    [IEEE80211_MODE_TURBO_A]	= 3,
 		    [IEEE80211_MODE_TURBO_G]	= 3,
 		    [IEEE80211_MODE_STURBO_A]	= 3,
 		    [IEEE80211_MODE_HALF]	= 3,
 		    [IEEE80211_MODE_QUARTER]	= 3,
 		    [IEEE80211_MODE_11NA]	= 3,
 		    [IEEE80211_MODE_11NG]	= 3,
 		    [IEEE80211_MODE_VHT_2GHZ]	= 3,
 		    [IEEE80211_MODE_VHT_5GHZ]	= 3,
 		};
 
 		be_chan->wmep_logcwmin = be_bss->wmep_logcwmin =
 		    logCwMin[mode];
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 		    "update %s (chan+bss) logcwmin %u\n",
 		    ieee80211_wme_acnames[WME_AC_BE], be_chan->wmep_logcwmin);
 	}
 
 	/* schedule the deferred WME update */
 	ieee80211_runtask(ic, &vap->iv_wme_task);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 	    "%s: WME params updated, cap_info 0x%x\n", __func__,
 	    vap->iv_opmode == IEEE80211_M_STA ?
 		wme->wme_wmeChanParams.cap_info :
 		wme->wme_bssChanParams.cap_info);
 }
 
 /*
  * Locking wrapper around ieee80211_wme_updateparams_locked(); does
  * nothing when the device lacks the IEEE80211_C_WME capability.
  */
 void
 ieee80211_wme_updateparams(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	if ((ic->ic_caps & IEEE80211_C_WME) == 0)
 		return;
 
 	IEEE80211_LOCK(ic);
 	ieee80211_wme_updateparams_locked(vap);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Fetch the WME parameters for the given VAP.
  *
  * When net80211 grows p2p, etc support, this may return different
  * parameters for each VAP.
  */
 void
 ieee80211_wme_vap_getparams(struct ieee80211vap *vap, struct chanAccParams *wp)
 {
 
 	memcpy(wp, &vap->iv_ic->ic_wme.wme_chanParams, sizeof(*wp));
 }
 
 /*
  * For NICs which only support one set of WME parameters (ie, softmac NICs)
  * there may be different VAP WME parameters but only one is "active".
  * This returns the "NIC" WME parameters for the currently active
  * context.
  */
 void
 ieee80211_wme_ic_getparams(struct ieee80211com *ic, struct chanAccParams *wp)
 {
 
 	memcpy(wp, &ic->ic_wme.wme_chanParams, sizeof(*wp));
 }
 
 /*
  * Return whether to use QoS on a given WME queue.
  *
  * This is intended to be called from the transmit path of softmac drivers
  * which are setting NoAck bits in transmit descriptors.
  *
  * Ideally this would be set in some transmit field before the packet is
  * queued to the driver but net80211 isn't quite there yet.
  */
 int
 ieee80211_wme_vap_ac_is_noack(struct ieee80211vap *vap, int ac)
 {
 	/* Bounds/sanity check */
 	if (ac < 0 || ac >= WME_NUM_AC)
 		return (0);
 
 	/* Again, there's only one global context for now */
 	return (!! vap->iv_ic->ic_wme.wme_chanParams.cap_wmeParams[ac].wmep_noackPolicy);
 }
 
 /*
  * Deferred-task style callback: `arg' is the ieee80211com whose
  * ic_parent method gets invoked.  `npending' is unused.
  */
 static void
 parent_updown(void *arg, int npending)
 {
 	struct ieee80211com *com = arg;
 
 	com->ic_parent(com);
 }
 
 /*
  * Deferred-task style callback: `arg' is the ieee80211com whose
  * ic_update_mcast method gets invoked.  `npending' is unused.
  */
 static void
 update_mcast(void *arg, int npending)
 {
 	struct ieee80211com *com = arg;
 
 	com->ic_update_mcast(com);
 }
 
 /*
  * Deferred-task style callback: `arg' is the ieee80211com whose
  * ic_update_promisc method gets invoked.  `npending' is unused.
  */
 static void
 update_promisc(void *arg, int npending)
 {
 	struct ieee80211com *com = arg;
 
 	com->ic_update_promisc(com);
 }
 
 /*
  * Deferred-task style callback: have the driver switch channel via
  * ic_set_channel, then tell the radiotap code about the change.
  * `npending' is unused.
  */
 static void
 update_channel(void *arg, int npending)
 {
 	struct ieee80211com *com = arg;
 
 	com->ic_set_channel(com);
 	ieee80211_radiotap_chan_change(com);
 }
 
 /*
  * Deferred-task style callback: invoke the driver's ic_update_chw
  * method.  `npending' is unused.
  */
 static void
 update_chw(void *arg, int npending)
 {
 	struct ieee80211com *com = arg;
 
 	/*
 	 * XXX should we defer the channel width _config_ update until now?
 	 */
 	com->ic_update_chw(com);
 }
 
 /*
  * Deferred WME parameter and beacon update.
  *
  * In preparation for per-VAP WME configuration the VAP method is used
  * when the VAP provides one; otherwise the older global method is
  * called.  There isn't a per-VAP WME configuration just yet, so the
  * global parameters are what gets handed to either method.
  */
 static void
 vap_update_wme(void *arg, int npending)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_wme_state *wme = &ic->ic_wme;
 
 	/* Driver update; NB: done before the com lock is taken */
 	if (vap->iv_wme_update == NULL)
 		wme->wme_update(ic);
 	else
 		vap->iv_wme_update(vap, wme->wme_chanParams.cap_wmeParams);
 
 	IEEE80211_LOCK(ic);
 	/*
 	 * Arrange for the beacon update.
 	 *
 	 * XXX what about MBSS, WDS?
 	 */
 	switch (vap->iv_opmode) {
 	case IEEE80211_M_HOSTAP:
 	case IEEE80211_M_IBSS:
 		/*
 		 * Bump the parameter set number so associated stations
 		 * load the new values, then request a beacon update.
 		 */
 		wme->wme_bssChanParams.cap_info =
 			(wme->wme_bssChanParams.cap_info+1) & WME_QOSINFO_COUNT;
 		ieee80211_beacon_notify(vap, IEEE80211_BEACON_WME);
 		break;
 	default:
 		break;
 	}
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Deferred-task style callback: suspend every running vap on the
  * device (`arg' is the ieee80211com) and then resume those that were
  * marked for resume.  `npending' is unused.
  */
 static void
 restart_vaps(void *arg, int npending)
 {
 	struct ieee80211com *ic = arg;
 
 	ieee80211_suspend_all(ic);
 	ieee80211_resume_all(ic);
 }
 
 /*
  * Block until the parent is in a known state.  This is
  * used after any operations that dispatch a task (e.g.
  * to auto-configure the parent device up/down).
  *
  * The com taskqueue is blocked while the parent, mcast, promisc,
  * channel, beacon-miss and channel-width tasks are drained, and
  * unblocked again afterwards.
  */
 void
 ieee80211_waitfor_parent(struct ieee80211com *ic)
 {
 	taskqueue_block(ic->ic_tq);
 	ieee80211_draintask(ic, &ic->ic_parent_task);
 	ieee80211_draintask(ic, &ic->ic_mcast_task);
 	ieee80211_draintask(ic, &ic->ic_promisc_task);
 	ieee80211_draintask(ic, &ic->ic_chan_task);
 	ieee80211_draintask(ic, &ic->ic_bmiss_task);
 	ieee80211_draintask(ic, &ic->ic_chw_task);
 	taskqueue_unblock(ic->ic_tq);
 }
 
 /*
  * Decide whether the current channel must be reset before starting.
  *
  * Some devices cope badly with a channel that is invalid for their
  * operating mode (eg wpi(4) will throw a firmware exception.)
  *
  * Returns 1 when curchan is flagged no-adhoc for an IBSS vap or
  * no-hostap for a HOSTAP vap, 0 otherwise.
  *
  * See PR kern/202502.
  */
 static int
 ieee80211_start_check_reset_chan(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	switch (vap->iv_opmode) {
 	case IEEE80211_M_IBSS:
 		if (IEEE80211_IS_CHAN_NOADHOC(ic->ic_curchan))
 			return (1);
 		break;
 	case IEEE80211_M_HOSTAP:
 		if (IEEE80211_IS_CHAN_NOHOSTAP(ic->ic_curchan))
 			return (1);
 		break;
 	default:
 		break;
 	}
 	return (0);
 }
 
 /*
  * Reset the curchan to a known good state: point it at the first
  * entry of the device's channel table.
  */
 static void
 ieee80211_start_reset_chan(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	ic->ic_curchan = &ic->ic_channels[0];
 }
 
 /*
  * Start a vap running.  If this is the first vap to be
  * set running on the underlying device then we
  * automatically bring the device up.
  *
  * The com lock must be held.  When the parent has to be brought up
  * the function returns right after scheduling the parent task; the
  * state machine is only kicked on a later call.
  */
 void
 ieee80211_start_locked(struct ieee80211vap *vap)
 {
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	IEEE80211_DPRINTF(vap,
 		IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 		"start running, %d vaps running\n", ic->ic_nrunning);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		/*
 		 * Mark us running.  Note that it's ok to do this first;
 		 * if we need to bring the parent device up we defer that
 		 * to avoid dropping the com lock.  We expect the device
 		 * to respond to being marked up by calling back into us
 		 * through ieee80211_start_all at which point we'll come
 		 * back in here and complete the work.
 		 */
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		ieee80211_notify_ifnet_change(vap, IFF_DRV_RUNNING);
 
 		/*
 		 * We were not running; if we are the first vap
 		 * to be brought up auto-up the parent if necessary.
 		 */
 		if (ic->ic_nrunning++ == 0) {
 			/* reset the channel to a known good channel */
 			if (ieee80211_start_check_reset_chan(vap))
 				ieee80211_start_reset_chan(vap);
 
 			IEEE80211_DPRINTF(vap,
 			    IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 			    "%s: up parent %s\n", __func__, ic->ic_name);
 			ieee80211_runtask(ic, &ic->ic_parent_task);
 			/* NB: the state machine is kicked on a later call */
 			return;
 		}
 	}
 	/*
 	 * If the parent is up and running, then kick the
 	 * 802.11 state machine as appropriate.
 	 */
 	if (vap->iv_roaming != IEEE80211_ROAMING_MANUAL) {
 		if (vap->iv_opmode == IEEE80211_M_STA) {
 #if 0
 			/* XXX bypasses scan too easily; disable for now */
 			/*
 			 * Try to be intelligent about clocking the state
 			 * machine.  If we're currently in RUN state then
 			 * we should be able to apply any new state/parameters
 			 * simply by re-associating.  Otherwise we need to
 			 * re-scan to select an appropriate ap.
 			 */ 
 			if (vap->iv_state >= IEEE80211_S_RUN)
 				ieee80211_new_state_locked(vap,
 				    IEEE80211_S_ASSOC, 1);
 			else
 #endif
 				/* NB: with the block above disabled this is unconditional */
 				ieee80211_new_state_locked(vap,
 				    IEEE80211_S_SCAN, 0);
 		} else {
 			/*
 			 * For monitor+wds mode there's nothing to do but
 			 * start running.  Otherwise if this is the first
 			 * vap to be brought up, start a scan which may be
 			 * preempted if the station is locked to a particular
 			 * channel.
 			 */
 			vap->iv_flags_ext |= IEEE80211_FEXT_REINIT;
 			if (vap->iv_opmode == IEEE80211_M_MONITOR ||
 			    vap->iv_opmode == IEEE80211_M_WDS)
 				ieee80211_new_state_locked(vap,
 				    IEEE80211_S_RUN, -1);
 			else
 				ieee80211_new_state_locked(vap,
 				    IEEE80211_S_SCAN, 0);
 		}
 	}
 }
 
 /*
  * Start a single vap; `arg' is the vap.  Takes the com lock around
  * ieee80211_start_locked().
  */
 void
 ieee80211_init(void *arg)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 	    "%s\n", __func__);
 
 	IEEE80211_LOCK(ic);
 	ieee80211_start_locked(vap);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Start all runnable vap's on a device: every vap whose ifnet is
  * both up and running is passed to ieee80211_start_locked().
  */
 void
 ieee80211_start_all(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* NB: avoid recursion */
 		if (!IFNET_IS_UP_RUNNING(vap->iv_ifp))
 			continue;
 		ieee80211_start_locked(vap);
 	}
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Stop a vap.  We force it down using the state machine
  * then mark its ifnet not running.  If this is the last
  * vap running on the underlying device then we close it
  * too to ensure it will be properly initialized when the
  * next vap is brought up.  The com lock must be held.
  */
 void
 ieee80211_stop_locked(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 	    "stop running, %d vaps running\n", ic->ic_nrunning);
 
 	ieee80211_new_state_locked(vap, IEEE80211_S_INIT, -1);
 
 	/* Nothing more to do if the ifnet was not marked running. */
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;	/* mark us stopped */
 	ieee80211_notify_ifnet_change(vap, IFF_DRV_RUNNING);
 
 	/* Only the last running vap takes the parent down. */
 	ic->ic_nrunning--;
 	if (ic->ic_nrunning != 0)
 		return;
 
 	IEEE80211_DPRINTF(vap,
 	    IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 	    "down parent %s\n", ic->ic_name);
 	ieee80211_runtask(ic, &ic->ic_parent_task);
 }
 
 /*
  * Locking wrapper: take the com lock around ieee80211_stop_locked().
  */
 void
 ieee80211_stop(struct ieee80211vap *vap)
 {
 	struct ieee80211com *const com = vap->iv_ic;
 
 	IEEE80211_LOCK(com);
 	ieee80211_stop_locked(vap);
 	IEEE80211_UNLOCK(com);
 }
 
 /*
  * Stop all vap's running on a device, then wait for the parent
  * device to settle.
  */
 void
 ieee80211_stop_all(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* NB: avoid recursion */
 		if (!IFNET_IS_UP_RUNNING(vap->iv_ifp))
 			continue;
 		ieee80211_stop_locked(vap);
 	}
 	IEEE80211_UNLOCK(ic);
 
 	/* NB: done without the com lock held */
 	ieee80211_waitfor_parent(ic);
 }
 
 /*
  * Stop all vap's running on a device and flag those that
  * were running (IEEE80211_FEXT_RESUME) so they can be resumed
  * later; then wait for the parent device to settle.
  */
 void
 ieee80211_suspend_all(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* NB: avoid recursion */
 		if (!IFNET_IS_UP_RUNNING(vap->iv_ifp))
 			continue;
 		vap->iv_flags_ext |= IEEE80211_FEXT_RESUME;
 		ieee80211_stop_locked(vap);
 	}
 	IEEE80211_UNLOCK(ic);
 
 	/* NB: done without the com lock held */
 	ieee80211_waitfor_parent(ic);
 }
 
 /*
  * Start all vap's marked for resume: a vap that is not up+running
  * and carries IEEE80211_FEXT_RESUME has the flag cleared and is
  * started again.
  */
 void
 ieee80211_resume_all(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		if (IFNET_IS_UP_RUNNING(vap->iv_ifp))
 			continue;
 		if ((vap->iv_flags_ext & IEEE80211_FEXT_RESUME) == 0)
 			continue;
 		vap->iv_flags_ext &= ~IEEE80211_FEXT_RESUME;
 		ieee80211_start_locked(vap);
 	}
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Restart all vap's running on a device.
  *
  * The work is not done inline: the com's restart task is enqueued on
  * taskqueue_thread.
  */
 void
 ieee80211_restart_all(struct ieee80211com *ic)
 {
 	/*
 	 * NB: do not use ieee80211_runtask here, we will
 	 * block & drain net80211 taskqueue.
 	 */
 	taskqueue_enqueue(taskqueue_thread, &ic->ic_restart_task);
 }
 
 /*
  * Post a beacon miss for the device.  The event is dropped while a
  * scan is in progress (IEEE80211_F_SCAN set); otherwise the beacon
  * miss task is scheduled.
  */
 void
 ieee80211_beacon_miss(struct ieee80211com *ic)
 {
 	IEEE80211_LOCK(ic);
 	/* Process in a taskq, the handler may reenter the driver */
 	if (!(ic->ic_flags & IEEE80211_F_SCAN))
 		ieee80211_runtask(ic, &ic->ic_bmiss_task);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Deferred-task style callback: deliver a beacon miss to the
  * iv_bmiss method of each eligible vap on the device passed in `arg'.
  * `npending' is unused.
  */
 static void
 beacon_miss(void *arg, int npending)
 {
 	struct ieee80211com *ic = arg;
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK(ic);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/*
 		 * Events are only passed through for sta vap's in RUN+
 		 * state; may be too restrictive but for now this saves
 		 * all the handlers duplicating these checks.
 		 */
 		if (vap->iv_opmode != IEEE80211_M_STA)
 			continue;
 		if (vap->iv_state < IEEE80211_S_RUN)
 			continue;
 		if (vap->iv_bmiss == NULL)
 			continue;
 		vap->iv_bmiss(vap);
 	}
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Deferred-task style callback for a software beacon miss: `arg' is
  * the vap.  The vap's iv_bmiss method runs, under the com lock, only
  * if the vap is in RUN state or beyond.  `npending' is unused.
  */
 static void
 beacon_swmiss(void *arg, int npending)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *const com = vap->iv_ic;
 
 	IEEE80211_LOCK(com);
 	/* XXX Call multiple times if npending > zero? */
 	if (vap->iv_state >= IEEE80211_S_RUN)
 		vap->iv_bmiss(vap);
 	IEEE80211_UNLOCK(com);
 }
 
 /*
  * Software beacon miss handling.  Check if any beacons
  * were received in the last period.  If not post a
  * beacon miss; otherwise reset the counter.  The callout is
  * re-armed for the next period on every invocation.
  */
 void
 ieee80211_swbmiss(void *arg)
 {
 	struct ieee80211vap *vap = arg;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	KASSERT(vap->iv_state >= IEEE80211_S_RUN,
 	    ("wrong state %d", vap->iv_state));
 
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0 &&
 	    vap->iv_swbmiss_count == 0) {
 		/* Not scanning and nothing heard: post a beacon miss. */
 		if (vap->iv_bmiss != NULL)
 			ieee80211_runtask(ic, &vap->iv_swbmiss_task);
 	} else {
 		/*
 		 * Either beacons were seen, or we are scanning.  While
 		 * scanning just ignore and reset state.  If we get a
 		 * bmiss after coming out of scan because we haven't had
 		 * time to receive a beacon then we should probe the AP
 		 * before posting a real bmiss (unless iv_bmiss_max has
 		 * been artificially lowered).  A cleaner solution might
 		 * be to disable the timer on scan start/end but to handle
 		 * case of multiple sta vap's we'd need to disable the
 		 * timers of all affected vap's.
 		 */
 		vap->iv_swbmiss_count = 0;
 	}
 	callout_reset(&vap->iv_swbmiss, vap->iv_swbmiss_period,
 		ieee80211_swbmiss, vap);
 }
 
 /*
  * Start an 802.11h channel switch.  We record the parameters,
  * mark the operation pending, notify each vap through the
  * beacon update mechanism so it can update the beacon frame
  * contents, and then switch vap's to CSA state to block outbound
  * traffic.  Devices that handle CSA directly can use the state
  * switch to do the right thing so long as they call
  * ieee80211_csa_completeswitch when it's time to complete the
  * channel change.  Devices that depend on the net80211 layer can
  * use ieee80211_beacon_update to handle the countdown and the
  * channel switch.
  */
 void
 ieee80211_csa_startswitch(struct ieee80211com *ic,
 	struct ieee80211_channel *c, int mode, int count)
 {
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/* Record the request and mark it pending. */
 	ic->ic_csa_newchan = c;
 	ic->ic_csa_mode = mode;
 	ic->ic_csa_count = count;
 	ic->ic_flags |= IEEE80211_F_CSAPENDING;
 
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* Beaconing modes get a beacon update request. */
 		switch (vap->iv_opmode) {
 		case IEEE80211_M_HOSTAP:
 		case IEEE80211_M_IBSS:
 		case IEEE80211_M_MBSS:
 			ieee80211_beacon_notify(vap, IEEE80211_BEACON_CSA);
 			break;
 		default:
 			break;
 		}
 		/* switch to CSA state to block outbound traffic */
 		if (vap->iv_state == IEEE80211_S_RUN)
 			ieee80211_new_state_locked(vap, IEEE80211_S_CSA, 0);
 	}
 	ieee80211_notify_csa(ic, c, mode, count);
 }
 
 /*
  * Complete the channel switch by transitioning all CSA VAPs to RUN.
  * This is called by both the completion and cancellation functions
  * so each VAP is placed back in the RUN state and can thus transmit.
  */
 static void
 csa_completeswitch(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	ic->ic_csa_newchan = NULL;
 	ic->ic_flags &= ~IEEE80211_F_CSAPENDING;
 
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_state == IEEE80211_S_CSA)
 			ieee80211_new_state_locked(vap, IEEE80211_S_RUN, 0);
 }
 
 /*
  * Complete an 802.11h channel switch started by ieee80211_csa_startswitch.
  * We clear state and move all vap's in CSA state to RUN state
  * so they can again transmit.
  *
  * Although this may not be completely correct, update the BSS channel
  * for each VAP to the newly configured channel. The setcurchan sets
  * the current operating channel for the interface (so the radio does
  * switch over) but the VAP BSS isn't updated, leading to incorrectly
  * reported information via ioctl.
  */
 void
 ieee80211_csa_completeswitch(struct ieee80211com *ic)
 {
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	KASSERT(ic->ic_flags & IEEE80211_F_CSAPENDING, ("csa not pending"));
 
 	ieee80211_setcurchan(ic, ic->ic_csa_newchan);
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_state == IEEE80211_S_CSA)
 			vap->iv_bss->ni_chan = ic->ic_curchan;
 
 	csa_completeswitch(ic);
 }
 
 /*
  * Cancel an 802.11h channel switch started by ieee80211_csa_startswitch.
  * We clear state and move all vap's in CSA state to RUN state
  * so they can again transmit.
  *
  * Unlike ieee80211_csa_completeswitch() the current channel is left
  * alone.  The com lock must be held.
  */
 void
 ieee80211_csa_cancelswitch(struct ieee80211com *ic)
 {
 	IEEE80211_LOCK_ASSERT(ic);
 
 	csa_completeswitch(ic);
 }
 
 /*
  * Complete a DFS CAC started by ieee80211_dfs_cac_start.
  * We clear state and move all vap's in CAC state to RUN state.
  */
 void
 ieee80211_cac_completeswitch(struct ieee80211vap *vap0)
 {
 	struct ieee80211com *ic = vap0->iv_ic;
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK(ic);
 	/*
 	 * Complete CAC state change for lead vap first; then
 	 * clock all the other vap's waiting.
 	 */
 	KASSERT(vap0->iv_state == IEEE80211_S_CAC,
 	    ("wrong state %d", vap0->iv_state));
 	ieee80211_new_state_locked(vap0, IEEE80211_S_RUN, 0);
 
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next)
 		if (vap->iv_state == IEEE80211_S_CAC && vap != vap0)
 			ieee80211_new_state_locked(vap, IEEE80211_S_RUN, 0);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Force all vap's other than the specified vap to the INIT state
  * and mark them as waiting for a scan to complete.  These vaps
  * will be brought up when the scan completes and the scanning vap
  * reaches RUN state by wakeupwaiting.
  */
 static void
 markwaiting(struct ieee80211vap *vap0)
 {
 	struct ieee80211com *ic = vap0->iv_ic;
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/*
 	 * A vap list entry can not disappear since we are running on the
 	 * taskqueue and a vap destroy will queue and drain another state
 	 * change task.
 	 */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* Leave the scanning vap and already-idle vaps alone. */
 		if (vap == vap0 || vap->iv_state == IEEE80211_S_INIT)
 			continue;
 
 		/* NB: iv_newstate may drop the lock */
 		vap->iv_newstate(vap, IEEE80211_S_INIT, 0);
 		IEEE80211_LOCK_ASSERT(ic);
 		vap->iv_flags_ext |= IEEE80211_FEXT_SCANWAIT;
 	}
 }
 
 /*
  * Wakeup all vap's waiting for a scan to complete.  This is the
  * companion to markwaiting (above) and is used to coordinate
  * multiple vaps scanning.
  * This is called from the state taskqueue.
  */
 static void
 wakeupwaiting(struct ieee80211vap *vap0)
 {
 	struct ieee80211com *ic = vap0->iv_ic;
 	struct ieee80211vap *vap;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/*
 	 * A vap list entry can not disappear since we are running on the
 	 * taskqueue and a vap destroy will queue and drain another state
 	 * change task.
 	 */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		if (vap == vap0)
 			continue;
 		if (vap->iv_flags_ext & IEEE80211_FEXT_SCANWAIT) {
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANWAIT;
 			/* NB: sta's cannot go INIT->RUN */
 			/* NB: iv_newstate may drop the lock */
 
 			/*
 			 * This is problematic if the interface has OACTIVE
 			 * set.  Only the deferred ieee80211_newstate_cb()
 			 * will end up actually /clearing/ the OACTIVE
 			 * flag on a state transition to RUN from a non-RUN
 			 * state.
 			 *
 			 * But, we're not actually deferring this callback;
 			 * and when the deferred call occurs it shows up as
 			 * a RUN->RUN transition!  So the flag isn't/wasn't
 			 * cleared!
 			 *
 			 * I'm also not sure if it's correct to actually
 			 * do the transitions here fully through the deferred
 			 * paths either as other things can be invoked as
 			 * part of that state machine.
 			 *
 			 * So just keep this in mind when looking at what
 			 * the markwaiting/wakeupwaiting routines are doing
 			 * and how they invoke vap state changes.
 			 */
 
 			vap->iv_newstate(vap,
 			    vap->iv_opmode == IEEE80211_M_STA ?
 			        IEEE80211_S_SCAN : IEEE80211_S_RUN, 0);
 			IEEE80211_LOCK_ASSERT(ic);
 		}
 	}
 }
 
 /*
  * Handle post state change work common to all operating modes.
  *
  * Task callback: `xvap' is the vap whose pending transition
  * (iv_nstate / iv_nstate_arg) is carried out via its iv_newstate
  * method.  The com lock is taken on entry and released at `done'.
  * `npending' is unused.
  */
 static void
 ieee80211_newstate_cb(void *xvap, int npending)
 {
 	struct ieee80211vap *vap = xvap;
 	struct ieee80211com *ic = vap->iv_ic;
 	enum ieee80211_state nstate, ostate;
 	int arg, rc;
 
 	IEEE80211_LOCK(ic);
 	/* Snapshot the requested transition before anything can change it. */
 	nstate = vap->iv_nstate;
 	arg = vap->iv_nstate_arg;
 
 	if (vap->iv_flags_ext & IEEE80211_FEXT_REINIT) {
 		/*
 		 * We have been requested to drop back to the INIT before
 		 * proceeding to the new state.
 		 */
 		/* Deny any state changes while we are here. */
 		vap->iv_nstate = IEEE80211_S_INIT;
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 		    "%s: %s -> %s arg %d\n", __func__,
 		    ieee80211_state_name[vap->iv_state],
 		    ieee80211_state_name[vap->iv_nstate], arg);
 		vap->iv_newstate(vap, vap->iv_nstate, 0);
 		IEEE80211_LOCK_ASSERT(ic);
 		vap->iv_flags_ext &= ~(IEEE80211_FEXT_REINIT |
 		    IEEE80211_FEXT_STATEWAIT);
 		/* enqueue new state transition after cancel_scan() task */
 		ieee80211_new_state_locked(vap, nstate, arg);
 		goto done;
 	}
 
 	ostate = vap->iv_state;
 	if (nstate == IEEE80211_S_SCAN && ostate != IEEE80211_S_INIT) {
 		/*
 		 * SCAN was forced; e.g. on beacon miss.  Force other running
 		 * vap's to INIT state and mark them as waiting for the scan to
 		 * complete.  This insures they don't interfere with our
 		 * scanning.  Since we are single threaded the vaps can not
 		 * transition again while we are executing.
 		 *
 		 * XXX not always right, assumes ap follows sta
 		 */
 		markwaiting(vap);
 	}
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 	    "%s: %s -> %s arg %d\n", __func__,
 	    ieee80211_state_name[ostate], ieee80211_state_name[nstate], arg);
 
 	/* Perform the actual transition through the vap's method. */
 	rc = vap->iv_newstate(vap, nstate, arg);
 	IEEE80211_LOCK_ASSERT(ic);
 	vap->iv_flags_ext &= ~IEEE80211_FEXT_STATEWAIT;
 	if (rc != 0) {
 		/* State transition failed */
 		KASSERT(rc != EINPROGRESS, ("iv_newstate was deferred"));
 		KASSERT(nstate != IEEE80211_S_INIT,
 		    ("INIT state change failed"));
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 		    "%s: %s returned error %d\n", __func__,
 		    ieee80211_state_name[nstate], rc);
 		goto done;
 	}
 
 	/*
 	 * Handle the case of a RUN->RUN transition occurring when STA + AP
 	 * VAPs occur on the same radio.
 	 *
 	 * The mark and wakeup waiting routines call iv_newstate() directly,
 	 * but they do not end up deferring state changes here.
 	 * Thus, although the VAP newstate method sees a transition
 	 * of RUN->INIT->RUN, the deferred path here only sees a RUN->RUN
 	 * transition.  If OACTIVE is set then it is never cleared.
 	 *
 	 * So, if we're here and the state is RUN, just clear OACTIVE.
 	 * At some point if the markwaiting/wakeupwaiting paths end up
 	 * also invoking the deferred state updates then this will
 	 * be no-op code - and also if OACTIVE is finally retired, it'll
 	 * also be no-op code.
 	 */
 	if (nstate == IEEE80211_S_RUN) {
 		/*
 		 * OACTIVE may be set on the vap if the upper layer
 		 * tried to transmit (e.g. IPv6 NDP) before we reach
 		 * RUN state.  Clear it and restart xmit.
 		 *
 		 * Note this can also happen as a result of SLEEP->RUN
 		 * (i.e. coming out of power save mode).
 		 *
 		 * Historically this was done only for a state change
 		 * but is needed earlier; see next comment.  The 2nd half
 		 * of the work is still only done in case of an actual
 		 * state change below.
 		 */
 		/*
 		 * Unblock the VAP queue; a RUN->RUN state can happen
 		 * on a STA+AP setup on the AP vap.  See wakeupwaiting().
 		 */
 		vap->iv_ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 
 		/*
 		 * XXX TODO Kick-start a VAP queue - this should be a method!
 		 */
 	}
 
 	/* No actual transition, skip post processing */
 	if (ostate == nstate)
 		goto done;
 
 	if (nstate == IEEE80211_S_RUN) {
 
 		/* bring up any vaps waiting on us */
 		wakeupwaiting(vap);
 	} else if (nstate == IEEE80211_S_INIT) {
 		/*
 		 * Flush the scan cache if we did the last scan (XXX?)
 		 * and flush any frames on send queues from this vap.
 		 * Note the mgt q is used only for legacy drivers and
 		 * will go away shortly.
 		 */
 		ieee80211_scan_flush(vap);
 
 		/*
 		 * XXX TODO: ic/vap queue flush
 		 */
 	}
 done:
 	/* Single exit point: every path above leaves with the lock held. */
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Public interface for initiating a state machine change.
  * This routine single-threads the request and coordinates
  * the scheduling of multiple vaps for the purpose of selecting
  * an operating channel.  Specifically the following scenarios
  * are handled:
  * o only one vap can be selecting a channel so on transition to
  *   SCAN state if another vap is already scanning then
  *   mark the caller for later processing and return without
  *   doing anything (XXX? expectations by caller of synchronous operation)
  * o only one vap can be doing CAC of a channel so on transition to
  *   CAC state if another vap is already scanning for radar then
  *   mark the caller for later processing and return without
  *   doing anything (XXX? expectations by caller of synchronous operation)
  * o if another vap is already running when a request is made
  *   to SCAN then an operating channel has been chosen; bypass
  *   the scan and just join the channel
  *
  * Note that the state change call is done through the iv_newstate
  * method pointer so any driver routine gets invoked.  The driver
  * will normally call back into operating mode-specific
  * ieee80211_newstate routines (below) unless it needs to completely
  * bypass the state machine (e.g. because the firmware has its
  * own idea how things should work).  Bypassing the net80211 layer
  * is usually a mistake and indicates lack of proper integration
  * with the net80211 layer.
  */
 int
 ieee80211_new_state_locked(struct ieee80211vap *vap,
 	enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211vap *vp;
 	enum ieee80211_state ostate;
 	int nrunning, nscanning;
 
 	/* The com lock must already be held by the caller. */
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/*
 	 * STATEWAIT is set below when a transition is queued, so a set
 	 * flag means an earlier request has not been processed yet.
 	 * Decide whether this request is dropped or replaces it.
 	 */
 	if (vap->iv_flags_ext & IEEE80211_FEXT_STATEWAIT) {
 		if (vap->iv_nstate == IEEE80211_S_INIT ||
 		    ((vap->iv_state == IEEE80211_S_INIT ||
 		    (vap->iv_flags_ext & IEEE80211_FEXT_REINIT)) &&
 		    vap->iv_nstate == IEEE80211_S_SCAN &&
 		    nstate > IEEE80211_S_SCAN)) {
 			/*
 			 * XXX The vap is being stopped/started,
 			 * do not allow any other state changes
 			 * until this is completed.
 			 */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 			    "%s: %s -> %s (%s) transition discarded\n",
 			    __func__,
 			    ieee80211_state_name[vap->iv_state],
 			    ieee80211_state_name[nstate],
 			    ieee80211_state_name[vap->iv_nstate]);
 			/* NB: request dropped; -1 rather than an errno value */
 			return -1;
 		} else if (vap->iv_state != vap->iv_nstate) {
 			/*
 			 * The pending target is overwritten further down;
 			 * only report that it is being lost.
 			 */
 #if 0
 			/* Warn if the previous state hasn't completed. */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 			    "%s: pending %s -> %s transition lost\n", __func__,
 			    ieee80211_state_name[vap->iv_state],
 			    ieee80211_state_name[vap->iv_nstate]);
 #else
 			/* XXX temporarily enable to identify issues */
 			if_printf(vap->iv_ifp,
 			    "%s: pending %s -> %s transition lost\n",
 			    __func__, ieee80211_state_name[vap->iv_state],
 			    ieee80211_state_name[vap->iv_nstate]);
 #endif
 		}
 	}
 
 	/*
 	 * Classify every other vap on this com: state >= RUN counts as
 	 * running, anything else above INIT counts as scanning.
 	 */
 	nrunning = nscanning = 0;
 	/* XXX can track this state instead of calculating */
 	TAILQ_FOREACH(vp, &ic->ic_vaps, iv_next) {
 		if (vp != vap) {
 			if (vp->iv_state >= IEEE80211_S_RUN)
 				nrunning++;
 			/* XXX doesn't handle bg scan */
 			/* NB: CAC+AUTH+ASSOC treated like SCAN */
 			else if (vp->iv_state > IEEE80211_S_INIT)
 				nscanning++;
 		}
 	}
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 	    "%s: %s -> %s (arg %d) (nrunning %d nscanning %d)\n", __func__,
 	    ieee80211_state_name[ostate], ieee80211_state_name[nstate], arg,
 	    nrunning, nscanning);
 	/*
 	 * Per-target adjustments: the request may be deferred (SCANWAIT
 	 * set, return 0, nothing queued) or nstate may be overridden
 	 * before the transition is queued below.
 	 */
 	switch (nstate) {
 	case IEEE80211_S_SCAN:
 		if (ostate == IEEE80211_S_INIT) {
 			/*
 			 * INIT -> SCAN happens on initial bringup.
 			 */
 			KASSERT(!(nscanning && nrunning),
 			    ("%d scanning and %d running", nscanning, nrunning));
 			if (nscanning) {
 				/*
 				 * Someone is scanning, defer our state
 				 * change until the work has completed.
 				 */
 				IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 				    "%s: defer %s -> %s\n",
 				    __func__, ieee80211_state_name[ostate],
 				    ieee80211_state_name[nstate]);
 				vap->iv_flags_ext |= IEEE80211_FEXT_SCANWAIT;
 				return 0;
 			}
 			if (nrunning) {
 				/*
 				 * Someone is operating; just join the channel
 				 * they have chosen.
 				 */
 				/* XXX kill arg? */
 				/* XXX check each opmode, adhoc? */
 				/* NB: only non-STA vaps are overridden to RUN */
 				if (vap->iv_opmode == IEEE80211_M_STA)
 					nstate = IEEE80211_S_SCAN;
 				else
 					nstate = IEEE80211_S_RUN;
 #ifdef IEEE80211_DEBUG
 				if (nstate != IEEE80211_S_SCAN) {
 					IEEE80211_DPRINTF(vap,
 					    IEEE80211_MSG_STATE,
 					    "%s: override, now %s -> %s\n",
 					    __func__,
 					    ieee80211_state_name[ostate],
 					    ieee80211_state_name[nstate]);
 				}
 #endif
 			}
 		}
 		break;
 	case IEEE80211_S_RUN:
 		if (vap->iv_opmode == IEEE80211_M_WDS &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) &&
 		    nscanning) {
 			/*
 			 * Legacy WDS with someone else scanning; don't
 			 * go online until that completes as we should
 			 * follow the other vap to the channel they choose.
 			 */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 			     "%s: defer %s -> %s (legacy WDS)\n", __func__,
 			     ieee80211_state_name[ostate],
 			     ieee80211_state_name[nstate]);
 			vap->iv_flags_ext |= IEEE80211_FEXT_SCANWAIT;
 			return 0;
 		}
 		if (vap->iv_opmode == IEEE80211_M_HOSTAP &&
 		    IEEE80211_IS_CHAN_DFS(ic->ic_bsschan) &&
 		    (vap->iv_flags_ext & IEEE80211_FEXT_DFS) &&
 		    !IEEE80211_IS_CHAN_CACDONE(ic->ic_bsschan)) {
 			/*
 			 * This is a DFS channel, transition to CAC state
 			 * instead of RUN.  This allows us to initiate
 			 * Channel Availability Check (CAC) as specified
 			 * by 11h/DFS.
 			 */
 			nstate = IEEE80211_S_CAC;
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 			     "%s: override %s -> %s (DFS)\n", __func__,
 			     ieee80211_state_name[ostate],
 			     ieee80211_state_name[nstate]);
 		}
 		break;
 	case IEEE80211_S_INIT:
 		/* cancel any scan in progress */
 		ieee80211_cancel_scan(vap);
 		if (ostate == IEEE80211_S_INIT ) {
 			/* XXX don't believe this */
 			/* INIT -> INIT. nothing to do */
 			vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANWAIT;
 		}
 		/* fall thru... */
 	default:
 		break;
 	}
 	/* defer the state change to a thread */
 	/*
 	 * Record the (possibly overridden) target and its argument, mark
 	 * the request as pending and queue the nstate task.
 	 */
 	vap->iv_nstate = nstate;
 	vap->iv_nstate_arg = arg;
 	vap->iv_flags_ext |= IEEE80211_FEXT_STATEWAIT;
 	ieee80211_runtask(ic, &vap->iv_nstate_task);
 	/* NB: EINPROGRESS is the return value for a queued transition */
 	return EINPROGRESS;
 }
 
 int
 ieee80211_new_state(struct ieee80211vap *vap,
 	enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *com;
 	int status;
 
 	/*
 	 * Unlocked entry point: take the com lock, run the locked
 	 * variant and hand its result back unchanged.
 	 */
 	com = vap->iv_ic;
 	IEEE80211_LOCK(com);
 	status = ieee80211_new_state_locked(vap, nstate, arg);
 	IEEE80211_UNLOCK(com);
 
 	return (status);
 }
diff --git a/sys/net80211/ieee80211_scan.c b/sys/net80211/ieee80211_scan.c
index 50b3151063fa..fd387e68e39f 100644
--- a/sys/net80211/ieee80211_scan.c
+++ b/sys/net80211/ieee80211_scan.c
@@ -1,687 +1,688 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2008 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 scanning support.
  */
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/condvar.h>
 
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net80211/ieee80211_var.h>
 
 /* XXX until it's implemented as attach ops */
 #include <net80211/ieee80211_scan_sw.h>
 
 #include <net/bpf.h>
 
 /*
  * Roaming-related defaults.  RSSI thresholds are as returned by the
  * driver (.5dBm).  Transmit rate thresholds are IEEE rate codes (i.e
  * .5M units) or MCS.
  */
 /* rssi thresholds */
 #define	ROAM_RSSI_11A_DEFAULT		14	/* 11a bss */
 #define	ROAM_RSSI_11B_DEFAULT		14	/* 11b bss */
 #define	ROAM_RSSI_11BONLY_DEFAULT	14	/* 11b-only bss */
 /*
  * transmit rate thresholds
  *
  * NB: every expansion is parenthesized so the value survives being
  * used inside a larger expression (e.g. x / ROAM_RATE_11A_DEFAULT).
  */
 #define	ROAM_RATE_11A_DEFAULT		(2*12)	/* 11a bss */
 #define	ROAM_RATE_11B_DEFAULT		(2*5)	/* 11b bss */
 #define	ROAM_RATE_11BONLY_DEFAULT	(2*1)	/* 11b-only bss */
 #define	ROAM_RATE_HALF_DEFAULT		(2*6)	/* half-width 11a/g bss */
 #define	ROAM_RATE_QUARTER_DEFAULT	(2*3)	/* quarter-width 11a/g bss */
 #define	ROAM_MCS_11N_DEFAULT		(1 | IEEE80211_RATE_MCS) /* 11n bss */
 #define	ROAM_MCS_11AC_DEFAULT		(1 | IEEE80211_RATE_MCS) /* 11ac bss; XXX not used yet */
 
 void
 ieee80211_scan_attach(struct ieee80211com *ic)
 {
 	/* A scan method table that is already set gets its own attach. */
 	if (ic->ic_scan_methods != NULL) {
 		ic->ic_scan_methods->sc_attach(ic);
 		return;
 	}
 
 	/* No method pointer: the swscan set is attached as the default. */
 	ieee80211_swscan_attach(ic);
 }
 
 void
 ieee80211_scan_detach(struct ieee80211com *ic)
 {
 	/*
 	 * XXX The ss_ops detach call would ideally live here, but that
 	 * needs sc_detach() split in two first; until then everything
 	 * is delegated to the scan method's detach.
 	 */
 	ic->ic_scan_methods->sc_detach(ic);
 }
 
 /*
  * Default roaming parameters, indexed by IEEE80211_MODE_*.  Slots
  * without an explicit initializer stay zero.
  */
 static const struct ieee80211_roamparam defroam[IEEE80211_MODE_MAX] = {
 	[IEEE80211_MODE_11A] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_RATE_11A_DEFAULT,
 	},
 	[IEEE80211_MODE_11G] = {
 		.rssi = ROAM_RSSI_11B_DEFAULT,
 		.rate = ROAM_RATE_11B_DEFAULT,
 	},
 	[IEEE80211_MODE_11B] = {
 		.rssi = ROAM_RSSI_11BONLY_DEFAULT,
 		.rate = ROAM_RATE_11BONLY_DEFAULT,
 	},
 	[IEEE80211_MODE_TURBO_A] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_RATE_11A_DEFAULT,
 	},
 	[IEEE80211_MODE_TURBO_G] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_RATE_11A_DEFAULT,
 	},
 	[IEEE80211_MODE_STURBO_A] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_RATE_11A_DEFAULT,
 	},
 	[IEEE80211_MODE_HALF] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_RATE_HALF_DEFAULT,
 	},
 	[IEEE80211_MODE_QUARTER] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_RATE_QUARTER_DEFAULT,
 	},
 	[IEEE80211_MODE_11NA] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_MCS_11N_DEFAULT,
 	},
 	[IEEE80211_MODE_11NG] = {
 		.rssi = ROAM_RSSI_11B_DEFAULT,
 		.rate = ROAM_MCS_11N_DEFAULT,
 	},
 	[IEEE80211_MODE_VHT_2GHZ] = {
 		.rssi = ROAM_RSSI_11B_DEFAULT,
 		.rate = ROAM_MCS_11AC_DEFAULT,
 	},
 	[IEEE80211_MODE_VHT_5GHZ] = {
 		.rssi = ROAM_RSSI_11A_DEFAULT,
 		.rate = ROAM_MCS_11AC_DEFAULT,
 	},
 };
 
 void
 ieee80211_scan_vattach(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int mode;
 
 	/* Background scan and scan-cache validity defaults. */
 	vap->iv_bgscanidle = (IEEE80211_BGSCAN_IDLE_DEFAULT*1000)/hz;
 	vap->iv_bgscanintvl = IEEE80211_BGSCAN_INTVAL_DEFAULT*hz;
 	vap->iv_scanvalid = IEEE80211_SCAN_VALID_DEFAULT*hz;
 
 	vap->iv_roaming = IEEE80211_ROAMING_AUTO;
 
 	/*
 	 * Start from all-zero roam parameters, then copy the defaults
 	 * for each mode flagged in ic_modecaps (AUTO itself is skipped).
 	 */
 	memset(vap->iv_roamparms, 0, sizeof(vap->iv_roamparms));
 	mode = IEEE80211_MODE_AUTO + 1;
 	while (mode < IEEE80211_MODE_MAX) {
 		if (!isclr(ic->ic_modecaps, mode)) {
 			memcpy(&vap->iv_roamparms[mode], &defroam[mode],
 			    sizeof(defroam[mode]));
 		}
 		mode++;
 	}
 
 	/* Finish with the scan method's per-vap attach. */
 	ic->ic_scan_methods->sc_vattach(vap);
 }
 
 void
 ieee80211_scan_vdetach(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss;
 
 	IEEE80211_LOCK(ic);
 	/* NB: ic_scan is sampled before the method-level detach runs */
 	ss = ic->ic_scan;
 	ic->ic_scan_methods->sc_vdetach(vap);
 
 	/* Nothing else to undo unless the scan state points at this vap. */
 	if (ss == NULL || ss->ss_vap != vap)
 		goto unlock;
 
 	/* Detach the scanner (if any) and drop the vap reference. */
 	if (ss->ss_ops != NULL) {
 		ss->ss_ops->scan_detach(ss);
 		ss->ss_ops = NULL;
 	}
 	ss->ss_vap = NULL;
 unlock:
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Simple-minded scanner module support.
  */
 /*
  * Name of the module that ieee80211_scanner_get() asks to be loaded
  * when no scanner is registered for an operating mode.  The table is
  * positional and indexed by the opmode value, so the entry order
  * must follow the opmode noted beside each entry.
  */
 static const char *scan_modnames[IEEE80211_OPMODE_MAX] = {
 	"wlan_scan_sta",	/* IEEE80211_M_IBSS */
 	"wlan_scan_sta",	/* IEEE80211_M_STA */
 	"wlan_scan_wds",	/* IEEE80211_M_WDS */
 	"wlan_scan_sta",	/* IEEE80211_M_AHDEMO */
 	"wlan_scan_ap",		/* IEEE80211_M_HOSTAP */
 	"wlan_scan_monitor",	/* IEEE80211_M_MONITOR */
 	"wlan_scan_sta",	/* IEEE80211_M_MBSS */
 };
 /*
  * Scanner registered for each operating mode; a slot is NULL until
  * ieee80211_scanner_register() fills it in.
  */
 static const struct ieee80211_scanner *scanners[IEEE80211_OPMODE_MAX];
 
 const struct ieee80211_scanner *
 ieee80211_scanner_get(enum ieee80211_opmode mode)
 {
 	if (mode < IEEE80211_OPMODE_MAX) {
 		/*
 		 * Nothing registered yet: ask for the matching module
 		 * to be loaded, then re-read the slot (it may still
 		 * be NULL afterwards).
 		 */
 		if (scanners[mode] == NULL)
 			ieee80211_load_module(scan_modnames[mode]);
 		return (scanners[mode]);
 	}
 
 	/* Out-of-range operating mode. */
 	return (NULL);
 }
 
 void
 ieee80211_scanner_register(enum ieee80211_opmode mode,
 	const struct ieee80211_scanner *scan)
 {
 	/* Out-of-range modes are silently ignored. */
 	if (mode < IEEE80211_OPMODE_MAX)
 		scanners[mode] = scan;
 }
 
 void
 ieee80211_scanner_unregister(enum ieee80211_opmode mode,
 	const struct ieee80211_scanner *scan)
 {
 	/*
 	 * Clear the slot only when the mode is in range and the slot
 	 * still holds this scanner.
 	 */
 	if (mode < IEEE80211_OPMODE_MAX && scanners[mode] == scan)
 		scanners[mode] = NULL;
 }
 
 void
 ieee80211_scanner_unregister_all(const struct ieee80211_scanner *scan)
 {
 	int slot = 0;
 
 	/* Clear every operating-mode slot that holds this scanner. */
 	while (slot < IEEE80211_OPMODE_MAX) {
 		if (scanners[slot] == scan)
 			scanners[slot] = NULL;
 		slot++;
 	}
 }
 
 /*
  * Update common scanner state to reflect the current
  * operating mode.  This is called when the state machine
  * is transitioned to RUN state w/o scanning--e.g. when
  * operating in monitor mode.  The purpose of this is to
  * ensure later callbacks find ss_ops set to properly
  * reflect current operating mode.
  */
 void
 ieee80211_scan_update_locked(struct ieee80211vap *vap,
 	const struct ieee80211_scanner *scan)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 #ifdef IEEE80211_DEBUG
 	if (ss->ss_vap != vap || ss->ss_ops != scan) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: current scanner is <%s:%s>, switch to <%s:%s>\n",
 		    __func__,
 		    ss->ss_vap != NULL ?
 			ss->ss_vap->iv_ifp->if_xname : "none",
 		    ss->ss_vap != NULL ?
 			ieee80211_opmode_name[ss->ss_vap->iv_opmode] : "none",
 		    vap->iv_ifp->if_xname,
 		    ieee80211_opmode_name[vap->iv_opmode]);
 	}
 #endif
 	/* The scan state always ends up pointing at the caller's vap. */
 	ss->ss_vap = vap;
 
 	/* Same scanner as before; nothing to switch. */
 	if (ss->ss_ops == scan)
 		return;
 
 	/*
 	 * Switching scanners.  The old one is detached and the new one
 	 * attached, except when both share the same attach method: a
 	 * single scan module may implement several policies through
 	 * different scan ops on a common core, and in that case only
 	 * ss_ops changes and the module's internal state is kept.
 	 */
 	if (scan == NULL || ss->ss_ops == NULL ||
 	    ss->ss_ops->scan_attach != scan->scan_attach) {
 		if (ss->ss_ops != NULL)
 			ss->ss_ops->scan_detach(ss);
 		if (scan != NULL && !scan->scan_attach(ss)) {
 			/* XXX attach failure */
 			/* XXX stat+msg */
 			scan = NULL;
 		}
 	}
 	ss->ss_ops = scan;
 }
 
 void
 ieee80211_scan_dump_channels(const struct ieee80211_scan_state *ss)
 {
 	struct ieee80211com *ic = ss->ss_ic;
 	const struct ieee80211_channel *chan;
 	int idx;
 
 	/*
 	 * Print the channels from ss_next up to (not including) ss_last
 	 * as a comma-separated list; every entry after the first is
 	 * preceded by the separator.
 	 */
 	for (idx = ss->ss_next; idx < ss->ss_last; idx++) {
 		chan = ss->ss_chans[idx];
 		printf("%s%u%c", idx > ss->ss_next ? ", " : "",
 		    ieee80211_chan2ieee(ic, chan),
 		    ieee80211_channel_type_char(chan));
 	}
 }
 
 #ifdef IEEE80211_DEBUG
 void
 ieee80211_scan_dump(struct ieee80211_scan_state *ss)
 {
 	/* One line: "<ifname>: scan set <channels> dwell min/max". */
 	if_printf(ss->ss_vap->iv_ifp, "scan set ");
 	ieee80211_scan_dump_channels(ss);
 	printf(" dwell min %ums max %ums\n",
 	    ticks_to_msecs(ss->ss_mindwell),
 	    ticks_to_msecs(ss->ss_maxdwell));
 }
 #endif /* IEEE80211_DEBUG */
 
 /*
  * Copy a caller-supplied ssid list into the scan state.
  *
  * vap   - vap used for debug messages only
  * ss    - scan state whose ss_ssid/ss_nssid are updated
  * nssid - number of entries in ssids
  * ssids - ssid list to copy
  *
  * A count outside [0, IEEE80211_SCAN_MAX_SSID] leaves the scan state
  * untouched.
  */
 void
 ieee80211_scan_copy_ssid(struct ieee80211vap *vap, struct ieee80211_scan_state *ss,
 	int nssid, const struct ieee80211_scan_ssid ssids[])
 {
 	/*
 	 * NB: nssid is signed; a negative count would pass the upper
 	 * bound check and turn into a huge size_t in the memcpy below.
 	 */
 	if (nssid < 0) {
 		/* XXX printf */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: bogus ssid count %d, ignoring all of them\n",
 		    __func__, nssid);
 		return;
 	}
 	if (nssid > IEEE80211_SCAN_MAX_SSID) {
 		/* XXX printf */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: too many ssid %d, ignoring all of them\n",
 		    __func__, nssid);
 		return;
 	}
 	memcpy(ss->ss_ssid, ssids, nssid * sizeof(ssids[0]));
 	ss->ss_nssid = nssid;
 }
 
 /*
  * Start a scan unless one is already going.
  */
 int
 ieee80211_start_scan(struct ieee80211vap *vap, int flags,
 	u_int duration, u_int mindwell, u_int maxdwell,
 	u_int nssid, const struct ieee80211_scan_ssid ssids[])
 {
 	const struct ieee80211_scanner *scan;
 	struct ieee80211com *ic = vap->iv_ic;
 
 	/* With a scanner for this opmode, the scan method does the work. */
 	scan = ieee80211_scanner_get(vap->iv_opmode);
 	if (scan != NULL) {
 		return ic->ic_scan_methods->sc_start_scan(scan, vap, flags,
 		    duration, mindwell, maxdwell, nssid, ssids);
 	}
 
 	/* No scanner available: log it and return 0. */
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 	    "%s: no scanner support for %s mode\n",
 	    __func__, ieee80211_opmode_name[vap->iv_opmode]);
 	/* XXX stat */
 	return 0;
 }
 
 /*
  * Check the scan cache for an ap/channel to use; if that
  * fails then kick off a new scan.
  */
 int
 ieee80211_check_scan(struct ieee80211vap *vap, int flags,
 	u_int duration, u_int mindwell, u_int maxdwell,
 	u_int nssid, const struct ieee80211_scan_ssid ssids[])
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 	const struct ieee80211_scanner *scan;
 	int result;
 
 	/* Returns 0 when no scanner exists for the vap's opmode. */
 	scan = ieee80211_scanner_get(vap->iv_opmode);
 	if (scan == NULL) {
 		/* NB: %s needs the opmode name, not the enum value */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: no scanner support for %s mode\n",
 		    __func__, ieee80211_opmode_name[vap->iv_opmode]);
 		/* XXX stat */
 		return 0;
 	}
 
 	/*
 	 * Check if there's a list of scan candidates already.
 	 * XXX want more than the ap we're currently associated with
 	 */
 
 	IEEE80211_LOCK(ic);
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 	    "%s: %s scan, %s%s%s%s%s\n"
 	    , __func__
 	    , flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive"
 	    , flags & IEEE80211_SCAN_FLUSH ? "flush" : "append"
 	    , flags & IEEE80211_SCAN_NOPICK ? ", nopick" : ""
 	    , flags & IEEE80211_SCAN_NOJOIN ? ", nojoin" : ""
 	    , flags & IEEE80211_SCAN_PICK1ST ? ", pick1st" : ""
 	    , flags & IEEE80211_SCAN_ONCE ? ", once" : ""
 	);
 
 	/* A change of scanner forces the FLUSH flag on. */
 	if (ss->ss_ops != scan) {
 		/* XXX re-use cache contents? e.g. adhoc<->sta */
 		flags |= IEEE80211_SCAN_FLUSH;
 	}
 
 	/*
 	 * XXX TODO: separate things out a bit better.
 	 */
 	ieee80211_scan_update_locked(vap, scan);
 
 	/* The scan method's result is handed back unchanged. */
 	result = ic->ic_scan_methods->sc_check_scan(scan, vap, flags, duration,
 	    mindwell, maxdwell, nssid, ssids);
 
 	IEEE80211_UNLOCK(ic);
 
 	return (result);
 }
 
 /*
  * Check the scan cache for an ap/channel to use; if that fails
  * then kick off a scan using the current settings.
  */
 int
 ieee80211_check_scan_current(struct ieee80211vap *vap)
 {
 	/*
 	 * Active scan with no duration limit and zero min/max dwell,
 	 * using the vap's desired ssid list.
 	 */
 	return (ieee80211_check_scan(vap, IEEE80211_SCAN_ACTIVE,
 	    IEEE80211_SCAN_FOREVER, 0, 0,
 	    vap->iv_des_nssid, vap->iv_des_ssid));
 }
 
 /*
  * Restart a previous scan.  If the previous scan completed
  * then we start again using the existing channel list.
  */
 int
 ieee80211_bg_scan(struct ieee80211vap *vap, int flags)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	const struct ieee80211_scanner *scan;
 
 	// IEEE80211_UNLOCK_ASSERT(sc);
 
 	scan = ieee80211_scanner_get(vap->iv_opmode);
 	if (scan == NULL) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: no scanner support for %s mode\n",
 		    __func__, vap->iv_opmode);
 		/* XXX stat */
 		return 0;
 	}
 
 	/*
 	 * XXX TODO: pull apart the bgscan logic into whatever
 	 * belongs here and whatever belongs in the software
 	 * scanner.
 	 */
 	return (ic->ic_scan_methods->sc_bg_scan(scan, vap, flags));
 }
 
 /*
  * Cancel any scan currently going on for the specified vap.
  */
 void
 ieee80211_cancel_scan(struct ieee80211vap *vap)
 {
 	/* Straight dispatch to the com's scan method. */
 	vap->iv_ic->ic_scan_methods->sc_cancel_scan(vap);
 }
 
 /*
  * Cancel any scan currently going on.
  *
  * This is called during normal 802.11 data path to cancel
  * a scan so a newly arrived normal data packet can be sent.
  */
 void
 ieee80211_cancel_anyscan(struct ieee80211vap *vap)
 {
 	/* Straight dispatch to the com's scan method. */
 	vap->iv_ic->ic_scan_methods->sc_cancel_anyscan(vap);
 }
 
 /*
  * Manually switch to the next channel in the channel list.
  * Provided for drivers that manage scanning themselves
  * (e.g. for firmware-based devices).
  */
 void
 ieee80211_scan_next(struct ieee80211vap *vap)
 {
 	/* Straight dispatch to the com's scan method. */
 	vap->iv_ic->ic_scan_methods->sc_scan_next(vap);
 }
 
 /*
  * Manually stop a scan that is currently running.
  * Provided for drivers that are not able to scan single channels
  * (e.g. for firmware-based devices).
  */
 void
 ieee80211_scan_done(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: called\n", __func__);
 
 	IEEE80211_LOCK(ic);
 	/* Move ss_next up to ss_last: all channels are complete. */
 	ic->ic_scan->ss_next = ic->ic_scan->ss_last;
 	ic->ic_scan_methods->sc_scan_done(vap);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Probe the current channel, if allowed, while scanning.
  * If the channel is not marked passive-only then send
  * a probe request immediately.  Otherwise mark state and
  * listen for beacons on the channel; if we receive something
  * then we'll transmit a probe request.
  */
 void
 ieee80211_probe_curchan(struct ieee80211vap *vap, int force)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	/*
 	 * Forced, or the current channel is not flagged passive:
 	 * let the scan method probe right away.
 	 */
 	if (force ||
 	    (ic->ic_curchan->ic_flags & IEEE80211_CHAN_PASSIVE) == 0) {
 		ic->ic_scan_methods->sc_scan_probe_curchan(vap, force);
 		return;
 	}
 
 	/* Passive channel and not forced: only record PROBECHAN. */
 	ic->ic_flags_ext |= IEEE80211_FEXT_PROBECHAN;
 }
 
 #ifdef IEEE80211_DEBUG
 static void
 dump_country(const uint8_t *ie)
 {
 	const struct ieee80211_country_ie *cie =
 	   (const struct ieee80211_country_ie *) ie;
 	int idx, nbands, first, count;
 
 	/* Fewer than 3 bytes cannot hold the country code. */
 	if (cie->len < 3) {
 		printf(" <bogus country ie, len %d>", cie->len);
 		return;
 	}
 	printf(" country [%c%c%c", cie->cc[0], cie->cc[1], cie->cc[2]);
 
 	/* Whatever follows the country code is an array of band entries. */
 	nbands = (cie->len - 3) / sizeof(cie->band[0]);
 	for (idx = 0; idx < nbands; idx++) {
 		first = cie->band[idx].schan;
 		count = cie->band[idx].nchan;
 		if (count == 1) {
 			/* single channel: "chan,maxtxpwr" */
 			printf(" %u,%u", first, cie->band[idx].maxtxpwr);
 			continue;
 		}
 		/* channel range: "first-last,maxtxpwr" */
 		printf(" %u-%u,%u", first, first + count-1,
 		    cie->band[idx].maxtxpwr);
 	}
 	printf("]");
 }
 
 void
 ieee80211_scan_dump_probe_beacon(uint8_t subtype, int isnew,
 	const uint8_t mac[IEEE80211_ADDR_LEN],
 	const struct ieee80211_scanparams *sp, int rssi)
 {
 	/* Summary line, printed for every frame. */
 	printf("[%s] %s%s on chan %u (bss chan %u) ",
 	    ether_sprintf(mac), isnew ? "new " : "",
 	    ieee80211_mgt_subtype_name(subtype), sp->chan, sp->bchan);
 	ieee80211_print_essid(sp->ssid + 2, sp->ssid[1]);
 	printf(" rssi %d\n", rssi);
 
 	if (!isnew)
 		return;
 
 	/* New entries get a second line with capabilities and country. */
 	printf("[%s] caps 0x%x bintval %u erp 0x%x", 
 		ether_sprintf(mac), sp->capinfo, sp->bintval, sp->erp);
 	if (sp->country != NULL)
 		dump_country(sp->country);
 	printf("\n");
 }
 #endif /* IEEE80211_DEBUG */
 
 /*
  * Process a beacon or probe response frame.
  */
 void
 ieee80211_add_scan(struct ieee80211vap *vap,
 	struct ieee80211_channel *curchan,
 	const struct ieee80211_scanparams *sp,
 	const struct ieee80211_frame *wh,
 	int subtype, int rssi, int noise)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	/*
 	 * All arguments are passed through to the scan method.
 	 * NB: plain call; a void function may not return an
 	 * expression, even one of type void.
 	 */
 	ic->ic_scan_methods->sc_add_scan(vap, curchan, sp, wh, subtype,
 	    rssi, noise);
 }
 
 /*
  * Timeout/age scan cache entries; called from sta timeout
  * timer (XXX should be self-contained).
  */
 void
 ieee80211_scan_timeout(struct ieee80211com *ic)
 {
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	/* No scanner attached, nothing to age. */
 	if (ss->ss_ops == NULL)
 		return;
 	ss->ss_ops->scan_age(ss);
 }
 
 /*
  * Mark a scan cache entry after a successful associate.
  */
 void
 ieee80211_scan_assoc_success(struct ieee80211vap *vap,
     const uint8_t mac[IEEE80211_ADDR_LEN])
 {
 	struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan;
 
 	/* No scanner attached, nothing to notify. */
 	if (ss->ss_ops == NULL)
 		return;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_SCAN,
 		mac, "%s",  __func__);
 	ss->ss_ops->scan_assoc_success(ss, mac);
 }
 
 /*
  * Demerit a scan cache entry after failing to associate.
  */
 void
 ieee80211_scan_assoc_fail(struct ieee80211vap *vap,
 	const uint8_t mac[IEEE80211_ADDR_LEN], int reason)
 {
 	struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan;
 
 	/* No scanner attached, nothing to notify. */
 	if (ss->ss_ops == NULL)
 		return;
 
 	IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_SCAN, mac,
 		"%s: reason %u", __func__, reason);
 	ss->ss_ops->scan_assoc_fail(ss, mac, reason);
 }
 
 /*
  * Iterate over the contents of the scan cache.
  */
 void
 ieee80211_scan_iterate(struct ieee80211vap *vap,
 	ieee80211_scan_iter_func *f, void *arg)
 {
 	struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan;
 
 	/* No scanner attached, nothing to walk. */
 	if (ss->ss_ops == NULL)
 		return;
 	ss->ss_ops->scan_iterate(ss, f, arg);
 }
 
 /*
  * Flush the contents of the scan cache.
  */
 void
 ieee80211_scan_flush(struct ieee80211vap *vap)
 {
 	struct ieee80211_scan_state *ss = vap->iv_ic->ic_scan;
 
 	/*
 	 * Flush only when a scanner is attached and the scan state
 	 * currently points at this vap.
 	 */
 	if (ss->ss_ops == NULL || ss->ss_vap != vap)
 		return;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s\n",  __func__);
 	ss->ss_ops->scan_flush(ss);
 }
 
 /*
  * Ask the currently attached scan module to pick a channel.
  * Returns NULL when there is no scan state, scanner or scan vap,
  * or when the scan module has no scan_pickchan method.
  */
 struct ieee80211_channel *
 ieee80211_scan_pickchannel(struct ieee80211com *ic, int flags)
 {
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	if (ss == NULL || ss->ss_ops == NULL || ss->ss_vap == NULL) {
 		/* XXX printf? */
 		return NULL;
 	}
 	if (ss->ss_ops->scan_pickchan == NULL) {
 		IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN,
 		    "%s: scan module does not support picking a channel, "
 		    "opmode %s\n", __func__, ss->ss_vap->iv_opmode);
 		return NULL;
 	}
 	return ss->ss_ops->scan_pickchan(ss, flags);
 }
diff --git a/sys/net80211/ieee80211_scan_sw.c b/sys/net80211/ieee80211_scan_sw.c
index 4c184095ad35..1456fafd60a1 100644
--- a/sys/net80211/ieee80211_scan_sw.c
+++ b/sys/net80211/ieee80211_scan_sw.c
@@ -1,1042 +1,1043 @@
 /*-
  * Copyright (c) 2002-2008 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IEEE 802.11 scanning support.
  */
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/proc.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/condvar.h>
 
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net80211/ieee80211_var.h>
 
 #include <net80211/ieee80211_scan_sw.h>
 
 #include <net/bpf.h>
 
 /*
  * Private state of the software scanner.  The public
  * ieee80211_scan_state is the first member, which is what lets
  * SCAN_PRIVATE() cast a pointer to the public part back to the
  * enclosing scan_state.
  */
 struct scan_state {
 	struct ieee80211_scan_state base;	/* public state */
 
 	u_int			ss_iflags;	/* flags used internally */
 #define	ISCAN_MINDWELL 		0x0001		/* min dwell time reached */
 #define	ISCAN_DISCARD		0x0002		/* discard rx'd frames */
 #define ISCAN_INTERRUPT		0x0004		/* interrupt current scan */
 #define	ISCAN_CANCEL		0x0008		/* cancel current scan */
 /* INTERRUPT+CANCEL together mean "pause" rather than a plain cancel */
 #define ISCAN_PAUSE		(ISCAN_INTERRUPT | ISCAN_CANCEL)
 #define	ISCAN_ABORT		0x0010		/* end the scan immediately */
 #define	ISCAN_RUNNING		0x0020		/* scan was started */
 
 	unsigned long		ss_chanmindwell;  /* min dwell on curchan */
 	unsigned long		ss_scanend;	/* time scan must stop */
 	u_int			ss_duration;	/* duration for next scan */
 	struct task		ss_scan_start;	/* scan start */
 	struct timeout_task	ss_scan_curchan;  /* scan execution */
 };
 /* Recover the private scan_state from a pointer to its public base. */
 #define	SCAN_PRIVATE(ss)	((struct scan_state *) ss)
 
 /*
  * Amount of time to go off-channel during a background
  * scan.  This value should be large enough to catch most
  * ap's but short enough that we can return on-channel
  * before our listen interval expires.
  *
  * XXX tunable
  * XXX check against configured listen interval
  */
 #define	IEEE80211_SCAN_OFFCHANNEL	msecs_to_ticks(150)
 
 /* Forward declarations of the file-local scan helpers defined below. */
 static	void scan_curchan(struct ieee80211_scan_state *, unsigned long);
 static	void scan_mindwell(struct ieee80211_scan_state *);
 static	void scan_signal(struct ieee80211_scan_state *, int);
 static	void scan_signal_locked(struct ieee80211_scan_state *, int);
 static	void scan_start(void *, int);
 static	void scan_curchan_task(void *, int);
 static	void scan_end(struct ieee80211_scan_state *, int);
 static	void scan_done(struct ieee80211_scan_state *, int);
 
 /* Malloc type used for the struct scan_state allocated at attach time. */
 MALLOC_DEFINE(M_80211_SCAN, "80211scan", "802.11 scan state");
 
 static void
 ieee80211_swscan_detach(struct ieee80211com *ic)
 {
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	if (ss != NULL) {
 		scan_signal(ss, ISCAN_ABORT);
 		ieee80211_draintask(ic, &SCAN_PRIVATE(ss)->ss_scan_start);
 		taskqueue_drain_timeout(ic->ic_tq,
 		    &SCAN_PRIVATE(ss)->ss_scan_curchan);
 		KASSERT((ic->ic_flags & IEEE80211_F_SCAN) == 0,
 		    ("scan still running"));
 
 		/*
 		 * For now, do the ss_ops detach here rather
 		 * than ieee80211_scan_detach().
 		 *
 		 * I'll figure out how to cleanly split things up
 		 * at a later date.
 		 */
 		if (ss->ss_ops != NULL) {
 			ss->ss_ops->scan_detach(ss);
 			ss->ss_ops = NULL;
 		}
 		ic->ic_scan = NULL;
 		IEEE80211_FREE(SCAN_PRIVATE(ss), M_80211_SCAN);
 	}
 }
 
 /*
  * Per-vap attach hook of the software scanner (installed as sc_vattach
  * in swscan_methods).  Currently a placeholder with an empty body.
  */
 static void
 ieee80211_swscan_vattach(struct ieee80211vap *vap)
 {
 	/* nothing to do for now */
 	/*
 	 * TODO: all of the vap scan calls should be methods!
 	 */
 
 }
 
 /*
  * Per-vap detach hook: if a scan is in progress on behalf of this vap,
  * flag it for abort.  Must be called with the com lock held.
  */
 static void
 ieee80211_swscan_vdetach(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	if (ss == NULL || ss->ss_vap != vap)
 		return;
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0)
 		return;
 
 	scan_signal_locked(ss, ISCAN_ABORT);
 }
 
 /*
  * Record the duration to use for the next scan and arm the discard
  * flag so frames received before the first channel change are
  * dropped.  Must be called with the com lock held.
  */
 static void
 ieee80211_swscan_set_scan_duration(struct ieee80211vap *vap, u_int duration)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct scan_state *ss_priv = SCAN_PRIVATE(ic->ic_scan);
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/* NB: flush frames rx'd before 1st channel change */
 	ss_priv->ss_iflags |= ISCAN_DISCARD;
 	ss_priv->ss_duration = duration;
 }
 
 /*
  * Begin a new scan for the vap unless one is already running or a
  * channel switch announcement is pending.  Must be called with the
  * com lock held.
  *
  * Returns 0 when the request was refused (CSA pending or scan already
  * in progress) and 1 otherwise -- including the case where no scanner
  * module could be bound and nothing was actually queued.
  */
 static int
 ieee80211_swscan_start_scan_locked(const struct ieee80211_scanner *scan,
 	struct ieee80211vap *vap, int flags, u_int duration,
 	u_int mindwell, u_int maxdwell,
 	u_int nssid, const struct ieee80211_scan_ssid ssids[])
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	if (ic->ic_flags & IEEE80211_F_CSAPENDING) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: scan inhibited by pending channel change\n", __func__);
 		return 0;
 	}
 	if (ic->ic_flags & IEEE80211_F_SCAN) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: %s scan already in progress\n", __func__,
 		    ss->ss_flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive");
 		return 0;
 	}
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 	    "%s: %s scan, duration %u mindwell %u maxdwell %u, desired mode %s, %s%s%s%s%s%s\n",
 	    __func__,
 	    flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive",
 	    duration, mindwell, maxdwell,
 	    ieee80211_phymode_name[vap->iv_des_mode],
 	    flags & IEEE80211_SCAN_FLUSH ? "flush" : "append",
 	    flags & IEEE80211_SCAN_NOPICK ? ", nopick" : "",
 	    flags & IEEE80211_SCAN_NOJOIN ? ", nojoin" : "",
 	    flags & IEEE80211_SCAN_NOBCAST ? ", nobcast" : "",
 	    flags & IEEE80211_SCAN_PICK1ST ? ", pick1st" : "",
 	    flags & IEEE80211_SCAN_ONCE ? ", once" : "");
 
 	ieee80211_scan_update_locked(vap, scan);
 	if (ss->ss_ops == NULL)
 		return 1;
 
 	if ((flags & IEEE80211_SCAN_NOSSID) == 0)
 		ieee80211_scan_copy_ssid(vap, ss, nssid, ssids);
 
 	/* NB: top 4 bits for internal use */
 	ss->ss_flags = flags & 0xfff;
 	if (ss->ss_flags & IEEE80211_SCAN_ACTIVE)
 		vap->iv_stats.is_scan_active++;
 	else
 		vap->iv_stats.is_scan_passive++;
 	if (flags & IEEE80211_SCAN_FLUSH)
 		ss->ss_ops->scan_flush(ss);
 	if (flags & IEEE80211_SCAN_BGSCAN)
 		ic->ic_flags_ext |= IEEE80211_FEXT_BGSCAN;
 
 	/* Duration applies to this scan only */
 	ieee80211_swscan_set_scan_duration(vap, duration);
 
 	ss->ss_next = 0;
 	ss->ss_mindwell = mindwell;
 	ss->ss_maxdwell = maxdwell;
 	/* NB: the module's scan_start must run before the scan task */
 	ss->ss_ops->scan_start(ss, vap);
 #ifdef IEEE80211_DEBUG
 	if (ieee80211_msg_scan(vap))
 		ieee80211_scan_dump(ss);
 #endif /* IEEE80211_DEBUG */
 	ic->ic_flags |= IEEE80211_F_SCAN;
 
 	/* Kick the scan start task */
 	ieee80211_runtask(ic, &SCAN_PRIVATE(ss)->ss_scan_start);
 	return 1;
 }
 
 /*
  * Unlocked front end to ieee80211_swscan_start_scan_locked(): the
  * caller must not hold the com lock; it is taken around the locked
  * variant and that variant's return value is passed back.
  */
 static int
 ieee80211_swscan_start_scan(const struct ieee80211_scanner *scan,
     struct ieee80211vap *vap, int flags,
     u_int duration, u_int mindwell, u_int maxdwell,
     u_int nssid, const struct ieee80211_scan_ssid ssids[])
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	int started;
 
 	IEEE80211_UNLOCK_ASSERT(ic);
 
 	IEEE80211_LOCK(ic);
 	started = ieee80211_swscan_start_scan_locked(scan, vap, flags,
 	    duration, mindwell, maxdwell, nssid, ssids);
 	IEEE80211_UNLOCK(ic);
 
 	return started;
 }
 
 /*
  * Consult the scan cache first; fall back to starting a new scan.
  *
  * When no scan is running, no flush was requested and the last scan is
  * recent enough (ic_lastscan + iv_scanvalid), the scanner module's
  * scan_end method is run against the cache.  A non-zero answer ends
  * the request with a scan-done notification and a return of 1.
  * In every other case the result of
  * ieee80211_swscan_start_scan_locked() is returned.
  *
  * Called with the comlock held.
  *
  * XXX TODO: split out!
  */
 static int
 ieee80211_swscan_check_scan(const struct ieee80211_scanner *scan,
     struct ieee80211vap *vap, int flags,
     u_int duration, u_int mindwell, u_int maxdwell,
     u_int nssid, const struct ieee80211_scan_ssid ssids[])
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	if (ss->ss_ops != NULL) {
 		/* XXX verify ss_ops matches vap->iv_opmode */
 		if ((flags & IEEE80211_SCAN_NOSSID) == 0) {
 			/*
 			 * Copy the ssid list now and flag it as done so
 			 * a later start_scan does not repeat the work.
 			 */
 			ieee80211_scan_copy_ssid(vap, ss, nssid, ssids);
 			flags |= IEEE80211_SCAN_NOSSID;
 		}
 		if ((ic->ic_flags & IEEE80211_F_SCAN) == 0 &&
 		    (flags & IEEE80211_SCAN_FLUSH) == 0 &&
 		    ieee80211_time_before(ticks, ic->ic_lastscan + vap->iv_scanvalid)) {
 			int cache_hit;
 
 			/*
 			 * Idle and the cache is still fresh.  Pretend to
 			 * be scanning (F_SCAN) to keep others out while
 			 * the cache is examined, and drop any frames
 			 * that arrive in the meantime.
 			 */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 			    "cache hot; ic_lastscan=%d, scanvalid=%d, ticks=%d\n",
 			    ic->ic_lastscan,
 			    vap->iv_scanvalid,
 			    ticks);
 			ss_priv->ss_iflags |= ISCAN_DISCARD;
 			ic->ic_flags |= IEEE80211_F_SCAN;
 
 			/* NB: need to use supplied flags in check */
 			ss->ss_flags = flags & 0xff;
 			cache_hit = ss->ss_ops->scan_end(ss, vap);
 
 			ic->ic_flags &= ~IEEE80211_F_SCAN;
 			ss_priv->ss_iflags &= ~ISCAN_DISCARD;
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 			    "%s: scan_end returned %d\n", __func__, cache_hit);
 			if (cache_hit) {
 				ieee80211_notify_scan_done(vap);
 				return 1;
 			}
 		}
 	}
 
 	return ieee80211_swscan_start_scan_locked(scan, vap, flags, duration,
 	    mindwell, maxdwell, nssid, ssids);
 }
 
 /*
  * Restart a previous scan.  If the previous scan completed
  * then we start again using the existing channel list.
  */
 static int
 ieee80211_swscan_bg_scan(const struct ieee80211_scanner *scan,
     struct ieee80211vap *vap, int flags)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	/* XXX assert unlocked? */
 	// IEEE80211_UNLOCK_ASSERT(ic);
 
 	IEEE80211_LOCK(ic);
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		u_int duration;
 		/*
 		 * Go off-channel for a fixed interval that is large
 		 * enough to catch most ap's but short enough that
 		 * we can return on-channel before our listen interval
 		 * expires.
 		 */
 		duration = IEEE80211_SCAN_OFFCHANNEL;
 
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: %s scan, ticks %u duration %u\n", __func__,
 		    ss->ss_flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive",
 		    ticks, duration);
 
 		ieee80211_scan_update_locked(vap, scan);
 		if (ss->ss_ops != NULL) {
 			ss->ss_vap = vap;
 			/*
 			 * A background scan does not select a new sta; it
 			 * just refreshes the scan cache.  Also, indicate
 			 * the scan logic should follow the beacon schedule:
 			 * we go off-channel and scan for a while, then
 			 * return to the bss channel to receive a beacon,
 			 * then go off-channel again.  All during this time
 			 * we notify the ap we're in power save mode.  When
 			 * the scan is complete we leave power save mode.
 			 * If any beacon indicates there are frames pending
 			 * for us then we drop out of power save mode
 			 * (and background scan) automatically by way of the
 			 * usual sta power save logic.
 			 */
 			ss->ss_flags |= IEEE80211_SCAN_NOPICK
 				     |  IEEE80211_SCAN_BGSCAN
 				     |  flags
 				     ;
 			/* if previous scan completed, restart */
 			if (ss->ss_next >= ss->ss_last) {
 				if (ss->ss_flags & IEEE80211_SCAN_ACTIVE)
 					vap->iv_stats.is_scan_active++;
 				else
 					vap->iv_stats.is_scan_passive++;
 				/*
 				 * NB: beware of the scan cache being flushed;
 				 *     if the channel list is empty use the
 				 *     scan_start method to populate it.
 				 */
 				ss->ss_next = 0;
 				if (ss->ss_last != 0)
 					ss->ss_ops->scan_restart(ss, vap);
 				else {
 					ss->ss_ops->scan_start(ss, vap);
 #ifdef IEEE80211_DEBUG
 					if (ieee80211_msg_scan(vap))
 						ieee80211_scan_dump(ss);
 #endif /* IEEE80211_DEBUG */
 				}
 			}
 			ieee80211_swscan_set_scan_duration(vap, duration);
 			ss->ss_maxdwell = duration;
 			ic->ic_flags |= IEEE80211_F_SCAN;
 			ic->ic_flags_ext |= IEEE80211_FEXT_BGSCAN;
 			ieee80211_runtask(ic,
 			    &SCAN_PRIVATE(ss)->ss_scan_start);
 		} else {
 			/* XXX msg+stat */
 		}
 	} else {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: %s scan already in progress\n", __func__,
 		    ss->ss_flags & IEEE80211_SCAN_ACTIVE ? "active" : "passive");
 	}
 	IEEE80211_UNLOCK(ic);
 
 	/* NB: racey, does it matter? */
 	return (ic->ic_flags & IEEE80211_F_SCAN);
 }
 
 /*
  * Common worker for the cancel entry points.
  *
  * With 'any' zero the running scan is cancelled only if it belongs to
  * 'vap' (ISCAN_CANCEL); with 'any' non-zero whatever scan is running
  * is paused (ISCAN_PAUSE).  Nothing is signalled when no scan is
  * running or the same request is already pending.  'func' is the
  * caller's name, used in the debug output.  Takes the com lock itself.
  *
  * Note: for offload scan devices, we may want to call into the
  * driver to try and cancel scanning, however it may not be cancelable.
  */
 static void
 cancel_scan(struct ieee80211vap *vap, int any, const char *func)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 	const int signal = any ? ISCAN_PAUSE : ISCAN_CANCEL;
 
 	IEEE80211_LOCK(ic);
 	if ((ic->ic_flags & IEEE80211_F_SCAN) == 0 ||
 	    (!any && ss->ss_vap != vap) ||
 	    (ss_priv->ss_iflags & signal) != 0) {
 		/* Nothing to do; just report why. */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: called; F_SCAN=%d, vap=%s, signal=%d\n",
 			func,
 			!! (ic->ic_flags & IEEE80211_F_SCAN),
 			(ss->ss_vap == vap ? "match" : "nomatch"),
 			!! (ss_priv->ss_iflags & signal));
 	} else {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: %s %s scan\n", func,
 		    any ? "pause" : "cancel",
 		    ss->ss_flags & IEEE80211_SCAN_ACTIVE ?
 			"active" : "passive");
 
 		/* clear bg scan NOPICK */
 		ss->ss_flags &= ~IEEE80211_SCAN_NOPICK;
 		/* record the request and poke the scan task */
 		scan_signal_locked(ss, signal);
 	}
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Cancel any scan currently going on for the specified vap.
  *
  * Thin wrapper around cancel_scan() with 'any' set to 0, so a scan
  * owned by a different vap is left alone.
  */
 static void
 ieee80211_swscan_cancel_scan(struct ieee80211vap *vap)
 {
 	cancel_scan(vap, 0, __func__);
 }
 
 /*
  * Pause whatever scan is running, regardless of which vap owns it.
  * Vaps flagged as scan-offload are skipped entirely.
  */
 static void
 ieee80211_swscan_cancel_anyscan(struct ieee80211vap *vap)
 {
 
 	/* XXX for now - just don't do this per packet. */
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) == 0)
 		cancel_scan(vap, 1, __func__);
 }
 
 /*
  * Step to the next channel in the channel list on request.
  * Intended for drivers that drive scanning themselves
  * (e.g. firmware-based devices).  Wakes the scan task without setting
  * any internal flag; the com lock is taken by scan_signal().
  */
 static void
 ieee80211_swscan_scan_next(struct ieee80211vap *vap)
 {
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: called\n", __func__);
 
 	/* wake up the scan task */
 	scan_signal(vap->iv_ic->ic_scan, 0);
 }
 
 /*
  * Stop-a-running-scan hook for drivers that cannot scan one channel
  * at a time (e.g. firmware-based devices).  Wakes the scan task
  * without setting any internal flag.  Must be called with the com
  * lock held.
  */
 static void
 ieee80211_swscan_scan_done(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	scan_signal_locked(ic->ic_scan, 0);
 }
 
 /*
  * Send probe requests on the current channel while scanning.
  *
  * One directed probe request goes out for every ssid in the scan
  * state, followed by a broadcast (empty ssid) probe request unless
  * IEEE80211_SCAN_NOBCAST is set.  Scan-offload vaps send nothing.
  * The 'force' argument is not consulted here.
  */
 static void
 ieee80211_swscan_probe_curchan(struct ieee80211vap *vap, int force)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 	struct ifnet *ifp = vap->iv_ifp;
 	int n;
 
 	/* Full-offload scan devices handle probing themselves. */
 	if (vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD)
 		return;
 
 	/*
 	 * Directed probe requests first, then the broadcast one.
 	 * XXX remove dependence on ic/vap->iv_bss
 	 */
 	for (n = 0; n < ss->ss_nssid; n++) {
 		ieee80211_send_probereq(vap->iv_bss,
 		    vap->iv_myaddr, ifp->if_broadcastaddr,
 		    ifp->if_broadcastaddr,
 		    ss->ss_ssid[n].ssid, ss->ss_ssid[n].len);
 	}
 	if (ss->ss_flags & IEEE80211_SCAN_NOBCAST)
 		return;
 
 	ieee80211_send_probereq(vap->iv_bss,
 	    vap->iv_myaddr, ifp->if_broadcastaddr,
 	    ifp->if_broadcastaddr,
 	    "", 0);
 }
 
 /*
  * Scan curchan.  If this is an active scan and the channel
  * is not marked passive then send probe request frame(s).
  * Arrange for the channel change after maxdwell ticks.
  */
 static void
 scan_curchan(struct ieee80211_scan_state *ss, unsigned long maxdwell)
 {
 	struct ieee80211vap *vap  = ss->ss_vap;
 	struct ieee80211com *ic = ss->ss_ic;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 	    "%s: calling; maxdwell=%lu\n",
 	    __func__,
 	    maxdwell);
 	IEEE80211_LOCK(ic);
 	if (ss->ss_flags & IEEE80211_SCAN_ACTIVE)
 		ieee80211_probe_curchan(vap, 0);
 	taskqueue_enqueue_timeout(ic->ic_tq,
 	    &SCAN_PRIVATE(ss)->ss_scan_curchan, maxdwell);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Unlocked wrapper around scan_signal_locked(): asserts the com lock
  * is not held, then takes it for the duration of the call.
  */
 static void
 scan_signal(struct ieee80211_scan_state *ss, int iflags)
 {
 	struct ieee80211com *ic = ss->ss_ic;
 
 	IEEE80211_UNLOCK_ASSERT(ic);
 
 	IEEE80211_LOCK(ic);
 	scan_signal_locked(ss, iflags);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * OR the given internal flags into the scan state and, if the scan is
  * marked running, try to pull the pending channel timeout forward:
  * when the timeout task can be cancelled it is re-queued with a zero
  * delay.  Must be called with the com lock held.
  */
 static void
 scan_signal_locked(struct ieee80211_scan_state *ss, int iflags)
 {
 	struct ieee80211com *ic = ss->ss_ic;
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ss_priv->ss_iflags |= iflags;
 	if ((ss_priv->ss_iflags & ISCAN_RUNNING) == 0)
 		return;
 
 	if (taskqueue_cancel_timeout(ic->ic_tq, &ss_priv->ss_scan_curchan,
 	    NULL) == 0) {
 		taskqueue_enqueue_timeout(ic->ic_tq,
 		    &ss_priv->ss_scan_curchan, 0);
 	}
 }
 
 /*
  * Handle mindwell requirements completed; initiate a channel
  * change to the next channel asap.
  *
  * Installed as the default ic_scan_mindwell method by
  * ieee80211_swscan_attach(); it wakes the scan task via scan_signal()
  * without setting any internal flag.
  */
 static void
 scan_mindwell(struct ieee80211_scan_state *ss)
 {
 
 	IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: called\n",
 	    __func__);
 
 	scan_signal(ss, 0);
 }
 
 /*
  * Task handler that begins a scan (the ss_scan_start task).
  *
  * arg is the scan state; pending is not used.  Takes the com lock,
  * bails out through scan_done() when the scan was cancelled/aborted or
  * there are no channels to scan, puts an associated STA vap into power
  * save when applicable, computes the scan end time, then drops the
  * lock, notifies the driver and runs the first channel step inline.
  *
  * NB: ISCAN_REP is defined here and stays defined for the functions
  * that follow until scan_done() #undef's it.
  */
 static void
 scan_start(void *arg, int pending)
 {
 #define	ISCAN_REP	(ISCAN_MINDWELL | ISCAN_DISCARD)
 	struct ieee80211_scan_state *ss = (struct ieee80211_scan_state *) arg;
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 	struct ieee80211vap *vap = ss->ss_vap;
 	struct ieee80211com *ic = ss->ss_ic;
 
 	IEEE80211_LOCK(ic);
 	if (vap == NULL || (ic->ic_flags & IEEE80211_F_SCAN) == 0 ||
 	    (ss_priv->ss_iflags & ISCAN_ABORT)) {
 		/* Cancelled before we started */
 		/* NB: scan_done() releases the com lock, hence the bare return */
 		scan_done(ss, 0);
 		return;
 	}
 
 	if (ss->ss_next == ss->ss_last) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 			"%s: no channels to scan\n", __func__);
 		scan_done(ss, 1);
 		return;
 	}
 
 	/*
 	 * Put the station into power save mode.
 	 *
 	 * This is only required if we're not a full-offload devices;
 	 * those devices manage scan/traffic differently.
 	 */
 	if (((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) == 0) &&
 	    vap->iv_opmode == IEEE80211_M_STA &&
 	    vap->iv_state == IEEE80211_S_RUN) {
 		if ((vap->iv_bss->ni_flags & IEEE80211_NODE_PWR_MGT) == 0) {
 			/* Enable station power save mode */
 			vap->iv_sta_ps(vap, 1);
 			/* Wait until null data frame will be ACK'ed */
 			mtx_sleep(vap, IEEE80211_LOCK_OBJ(ic), PCATCH,
 			    "sta_ps", msecs_to_ticks(10));
 			/* An abort may have been requested while we slept */
 			if (ss_priv->ss_iflags & ISCAN_ABORT) {
 				scan_done(ss, 0);
 				return;
 			}
 		}
 	}
 
 	ss_priv->ss_scanend = ticks + ss_priv->ss_duration;
 
 	/* XXX scan state can change! Re-validate scan state! */
 
 	IEEE80211_UNLOCK(ic);
 
 	ic->ic_scan_start(ic);		/* notify driver */
 
 	/* Run the first channel step directly rather than queueing it */
 	scan_curchan_task(ss, 0);
 }
 
 /*
  * Timeout task (ss_scan_curchan) that advances the scan by one channel;
  * also called directly from scan_start() for the first channel.
  *
  * arg is the scan state; pending is not used.  Under the com lock it
  * decides whether the scan must end (channel list exhausted, cancel,
  * abort, pick made, or not enough time left for another mindwell) and
  * if so hands off to scan_end().  Otherwise it selects the next
  * channel, drops the lock to let the driver change channel and start
  * scanning it, then retakes the lock and re-checks for a cancel/abort
  * that arrived in the meantime.
  */
 static void
 scan_curchan_task(void *arg, int pending)
 {
 	struct ieee80211_scan_state *ss = arg;
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 	struct ieee80211com *ic = ss->ss_ic;
 	struct ieee80211_channel *chan;
 	unsigned long maxdwell;
 	int scandone, scanstop;
 
 	IEEE80211_LOCK(ic);
 end:
 	/*
 	 * Note: only /end/ the scan if we're CANCEL rather than
 	 * CANCEL+INTERRUPT (ie, 'PAUSE').
 	 *
 	 * We can stop the scan if we hit cancel, but we shouldn't
 	 * call scan_end(ss, 1) if we're just PAUSEing the scan.
 	 */
 	scandone = (ss->ss_next >= ss->ss_last) ||
 	    ((ss_priv->ss_iflags & ISCAN_PAUSE) == ISCAN_CANCEL);
 	scanstop = (ss->ss_next >= ss->ss_last) ||
 	    ((ss_priv->ss_iflags & ISCAN_CANCEL) != 0);
 
 	IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN,
 	    "%s: loop start; scandone=%d, scanstop=%d, ss_iflags=0x%x, ss_next=%u, ss_last=%u\n",
 	    __func__,
 	    scandone,
 	    scanstop,
 	    (uint32_t) ss_priv->ss_iflags,
 	    (uint32_t) ss->ss_next,
 	    (uint32_t) ss->ss_last);
 
 	if (scanstop || (ss->ss_flags & IEEE80211_SCAN_GOTPICK) ||
 	    (ss_priv->ss_iflags & ISCAN_ABORT) ||
 	     ieee80211_time_after(ticks + ss->ss_mindwell, ss_priv->ss_scanend)) {
 		ss_priv->ss_iflags &= ~ISCAN_RUNNING;
 		/* NB: every path through scan_end() releases the com lock */
 		scan_end(ss, scandone);
 		return;
 	} else
 		ss_priv->ss_iflags |= ISCAN_RUNNING;
 
 	chan = ss->ss_chans[ss->ss_next++];
 
 	/*
 	 * Watch for truncation due to the scan end time.
 	 */
 	if (ieee80211_time_after(ticks + ss->ss_maxdwell, ss_priv->ss_scanend))
 		maxdwell = ss_priv->ss_scanend - ticks;
 	else
 		maxdwell = ss->ss_maxdwell;
 
 	IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN,
 	    "%s: chan %3d%c -> %3d%c [%s, dwell min %lums max %lums]\n",
 	    __func__,
 	    ieee80211_chan2ieee(ic, ic->ic_curchan),
 	    ieee80211_channel_type_char(ic->ic_curchan),
 	    ieee80211_chan2ieee(ic, chan),
 	    ieee80211_channel_type_char(chan),
 	    (ss->ss_flags & IEEE80211_SCAN_ACTIVE) &&
 		(chan->ic_flags & IEEE80211_CHAN_PASSIVE) == 0 ?
 		"active" : "passive",
 	    ticks_to_msecs(ss->ss_mindwell), ticks_to_msecs(maxdwell));
 
 	/*
 	 * Potentially change channel and phy mode.
 	 */
 	ic->ic_curchan = chan;
 	ic->ic_rt = ieee80211_get_ratetable(chan);
 	IEEE80211_UNLOCK(ic);
 	/*
 	 * Perform the channel change and scan unlocked so the driver
 	 * may sleep. Once set_channel returns the hardware has
 	 * completed the channel change.
 	 */
 	ic->ic_set_channel(ic);
 	ieee80211_radiotap_chan_change(ic);
 
 	/*
 	 * Scan curchan.  Drivers for "intelligent hardware"
 	 * override ic_scan_curchan to tell the device to do
 	 * the work.  Otherwise we manage the work ourselves;
 	 * sending a probe request (as needed), and arming the
 	 * timeout to switch channels after maxdwell ticks.
 	 *
 	 * scan_curchan should only pause for the time required to
 	 * prepare/initiate the hardware for the scan (if at all).
 	 */
 	ic->ic_scan_curchan(ss, maxdwell);
 	IEEE80211_LOCK(ic);
 
 	/* XXX scan state can change! Re-validate scan state! */
 
 	ss_priv->ss_chanmindwell = ticks + ss->ss_mindwell;
 	/* clear mindwell lock and initial channel change flush */
 	ss_priv->ss_iflags &= ~ISCAN_REP;
 
 	/*
 	 * A cancel/abort was flagged while the lock was dropped: take
 	 * the channel timeout back and re-run the end-of-scan checks.
 	 */
 	if (ss_priv->ss_iflags & (ISCAN_CANCEL|ISCAN_ABORT)) {
 		taskqueue_cancel_timeout(ic->ic_tq, &ss_priv->ss_scan_curchan,
 		    NULL);
 		goto end;
 	}
 
 	IEEE80211_DPRINTF(ss->ss_vap, IEEE80211_MSG_SCAN, "%s: waiting\n",
 	    __func__);
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * End-of-scan processing.
  *
  * Called with the com lock held; every path returns with the lock
  * released (either directly or through scan_done()).  scandone is
  * non-zero when the caller considers the scan complete rather than
  * merely stopped.
  *
  * On abort this goes straight to scan_done().  Otherwise the driver is
  * told the scan ended, the radio is returned to the bss channel if
  * needed, and the scanner module's scan_end method decides whether the
  * scan is finished or should be restarted from the first channel.
  */
 static void
 scan_end(struct ieee80211_scan_state *ss, int scandone)
 {
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 	struct ieee80211vap *vap = ss->ss_vap;
 	struct ieee80211com *ic = ss->ss_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN, "%s: out\n", __func__);
 
 	if (ss_priv->ss_iflags & ISCAN_ABORT) {
 		/* NB: skips the driver notification and channel restore */
 		scan_done(ss, scandone);
 		return;
 	}
 
 	IEEE80211_UNLOCK(ic);
 	ic->ic_scan_end(ic);		/* notify driver */
 	IEEE80211_LOCK(ic);
 	/* XXX scan state can change! Re-validate scan state! */
 
 	/*
 	 * Since a cancellation may have occurred during one of the
 	 * driver calls (whilst unlocked), update scandone.
 	 */
 	if ((scandone == 0) && ((ss_priv->ss_iflags & ISCAN_PAUSE) == ISCAN_CANCEL)) {
 		/* XXX printf? */
 		if_printf(vap->iv_ifp,
 		    "%s: OOPS! scan cancelled during driver call (1) (ss_iflags=0x%x)!\n",
 		    __func__,
 		    ss_priv->ss_iflags);
 		scandone = 1;
 	}
 
 	/*
 	 * Record scan complete time.  Note that we also do
 	 * this when canceled so any background scan will
 	 * not be restarted for a while.
 	 */
 	if (scandone)
 		ic->ic_lastscan = ticks;
 	/* return to the bss channel */
 	if (ic->ic_bsschan != IEEE80211_CHAN_ANYC &&
 	    ic->ic_curchan != ic->ic_bsschan) {
 		ieee80211_setupcurchan(ic, ic->ic_bsschan);
 		/* the driver channel change runs without the com lock */
 		IEEE80211_UNLOCK(ic);
 		ic->ic_set_channel(ic);
 		ieee80211_radiotap_chan_change(ic);
 		IEEE80211_LOCK(ic);
 	}
 	/* clear internal flags and any indication of a pick */
 	ss_priv->ss_iflags &= ~ISCAN_REP;
 	ss->ss_flags &= ~IEEE80211_SCAN_GOTPICK;
 
 	/*
 	 * If not canceled and scan completed, do post-processing.
 	 * If the callback function returns 0, then it wants to
 	 * continue/restart scanning.  Unfortunately we needed to
 	 * notify the driver to end the scan above to avoid having
 	 * rx frames alter the scan candidate list.
 	 */
 	if ((ss_priv->ss_iflags & ISCAN_CANCEL) == 0 &&
 	    !ss->ss_ops->scan_end(ss, vap) &&
 	    (ss->ss_flags & IEEE80211_SCAN_ONCE) == 0 &&
 	    ieee80211_time_before(ticks + ss->ss_mindwell, ss_priv->ss_scanend)) {
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 		    "%s: done, restart "
 		    "[ticks %u, dwell min %lu scanend %lu]\n",
 		    __func__,
 		    ticks, ss->ss_mindwell, ss_priv->ss_scanend);
 		ss->ss_next = 0;	/* reset to beginning */
 		if (ss->ss_flags & IEEE80211_SCAN_ACTIVE)
 			vap->iv_stats.is_scan_active++;
 		else
 			vap->iv_stats.is_scan_passive++;
 
 		ss->ss_ops->scan_restart(ss, vap);	/* XXX? */
 		/* restart: re-queue the start task, scan_done() is not run */
 		ieee80211_runtask(ic, &ss_priv->ss_scan_start);
 		IEEE80211_UNLOCK(ic);
 		return;
 	}
 
 	/* past here, scandone is ``true'' if not in bg mode */
 	if ((ss->ss_flags & IEEE80211_SCAN_BGSCAN) == 0)
 		scandone = 1;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 	    "%s: %s, [ticks %u, dwell min %lu scanend %lu]\n",
 	    __func__, scandone ? "done" : "stopped",
 	    ticks, ss->ss_mindwell, ss_priv->ss_scanend);
 
 	/*
 	 * Since a cancellation may have occurred during one of the
 	 * driver calls (whilst unlocked), update scandone.
 	 */
 	if (scandone == 0 && (ss_priv->ss_iflags & ISCAN_PAUSE) == ISCAN_CANCEL) {
 		/* XXX printf? */
 		if_printf(vap->iv_ifp,
 		    "%s: OOPS! scan cancelled during driver call (2) (ss_iflags=0x%x)!\n",
 		    __func__,
 		    ss_priv->ss_iflags);
 		scandone = 1;
 	}
 
 	scan_done(ss, scandone);
 }
 
 /*
  * Final bookkeeping when a scan stops.
  *
  * Called with the com lock held; releases it before returning.
  * Clears IEEE80211_F_SCAN and the pause/abort requests, resets the
  * scan end time and the ONCE/PICK1ST flags.  When scandone is
  * non-zero it also leaves station power save (non-offload vaps),
  * clears the background-scan flag once the channel list is exhausted
  * and posts the scan-done notification unless the scan was merely
  * interrupted part way through.
  *
  * NB: vap is only dereferenced on the scandone path; scan_start()
  * relies on that when it calls in with a NULL ss_vap and scandone 0.
  */
 static void
 scan_done(struct ieee80211_scan_state *ss, int scandone)
 {
 	struct scan_state *ss_priv = SCAN_PRIVATE(ss);
 	struct ieee80211com *ic = ss->ss_ic;
 	struct ieee80211vap *vap = ss->ss_vap;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	/*
 	 * Clear the SCAN bit first in case frames are
 	 * pending on the station power save queue.  If
 	 * we defer this then the dispatch of the frames
 	 * may generate a request to cancel scanning.
 	 */
 	ic->ic_flags &= ~IEEE80211_F_SCAN;
 
 	/*
 	 * Drop out of power save mode when a scan has
 	 * completed.  If this scan was prematurely terminated
 	 * because it is a background scan then don't notify
 	 * the ap; we'll either return to scanning after we
 	 * receive the beacon frame or we'll drop out of power
 	 * save mode because the beacon indicates we have frames
 	 * waiting for us.
 	 */
 	if (scandone) {
 		/*
 		 * If we're not a scan offload device, come back out of
 		 * station powersave.  Offload devices handle this themselves.
 		 */
 		if ((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) == 0)
 			vap->iv_sta_ps(vap, 0);
 		if (ss->ss_next >= ss->ss_last) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 			    "%s: Dropping out of scan; ss_next=%u, ss_last=%u\n",
 			    __func__,
 			    (uint32_t) ss->ss_next,
 			    (uint32_t) ss->ss_last);
 			ic->ic_flags_ext &= ~IEEE80211_FEXT_BGSCAN;
 		}
 
 		/* send 'scan done' event if not interrupted due to traffic. */
 		if (!(ss_priv->ss_iflags & ISCAN_INTERRUPT) ||
 		    (ss->ss_next >= ss->ss_last))
 			ieee80211_notify_scan_done(vap);
 	}
 	ss_priv->ss_iflags &= ~(ISCAN_PAUSE | ISCAN_ABORT);
 	ss_priv->ss_scanend = 0;
 	ss->ss_flags &= ~(IEEE80211_SCAN_ONCE | IEEE80211_SCAN_PICK1ST);
 	IEEE80211_UNLOCK(ic);
 /* end of the region opened by the #define in scan_start() */
 #undef ISCAN_REP
 }
 
 /*
  * Process a beacon or probe response frame.
  */
 static void
 ieee80211_swscan_add_scan(struct ieee80211vap *vap,
 	struct ieee80211_channel *curchan,
 	const struct ieee80211_scanparams *sp,
 	const struct ieee80211_frame *wh,
 	int subtype, int rssi, int noise)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_scan_state *ss = ic->ic_scan;
 
 	/* XXX locking */
 	/*
 	 * Frames received during startup are discarded to avoid
 	 * using scan state setup on the initial entry to the timer
 	 * callback.  This can occur because the device may enable
 	 * rx prior to our doing the initial channel change in the
 	 * timer routine.
 	 */
 	if (SCAN_PRIVATE(ss)->ss_iflags & ISCAN_DISCARD)
 		return;
 #ifdef IEEE80211_DEBUG
 	if (ieee80211_msg_scan(vap) && (ic->ic_flags & IEEE80211_F_SCAN))
 		ieee80211_scan_dump_probe_beacon(subtype, 1, wh->i_addr2, sp, rssi);
 #endif
 	if (ss->ss_ops != NULL &&
 	    ss->ss_ops->scan_add(ss, curchan, sp, wh, subtype, rssi, noise)) {
 		/*
 		 * If we've reached the min dwell time terminate
 		 * the timer so we'll switch to the next channel.
 		 */
 		if ((SCAN_PRIVATE(ss)->ss_iflags & ISCAN_MINDWELL) == 0 &&
 		    ieee80211_time_after_eq(ticks, SCAN_PRIVATE(ss)->ss_chanmindwell)) {
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_SCAN,
 			    "%s: chan %3d%c min dwell met (%u > %lu)\n",
 			    __func__,
 			    ieee80211_chan2ieee(ic, ic->ic_curchan),
 			    ieee80211_channel_type_char(ic->ic_curchan),
 			    ticks, SCAN_PRIVATE(ss)->ss_chanmindwell);
 			SCAN_PRIVATE(ss)->ss_iflags |= ISCAN_MINDWELL;
 			/*
 			 * NB: trigger at next clock tick or wait for the
 			 * hardware.
 			 */
 			ic->ic_scan_mindwell(ss);
 		}
 	}
 }
 
 /*
  * Method table of the software scanner; installed as
  * ic->ic_scan_methods by ieee80211_swscan_attach().
  */
 static struct ieee80211_scan_methods swscan_methods = {
 	.sc_attach		= ieee80211_swscan_attach,
 	.sc_detach		= ieee80211_swscan_detach,
 	.sc_vattach		= ieee80211_swscan_vattach,
 	.sc_vdetach		= ieee80211_swscan_vdetach,
 	.sc_set_scan_duration	= ieee80211_swscan_set_scan_duration,
 	.sc_start_scan		= ieee80211_swscan_start_scan,
 	.sc_check_scan		= ieee80211_swscan_check_scan,
 	.sc_bg_scan		= ieee80211_swscan_bg_scan,
 	.sc_cancel_scan		= ieee80211_swscan_cancel_scan,
 	.sc_cancel_anyscan	= ieee80211_swscan_cancel_anyscan,
 	.sc_scan_next		= ieee80211_swscan_scan_next,
 	.sc_scan_done		= ieee80211_swscan_scan_done,
 	.sc_scan_probe_curchan	= ieee80211_swscan_probe_curchan,
 	.sc_add_scan		= ieee80211_swscan_add_scan,
 };
 
 /*
  * Default scan attach method.
  *
  * Installs the software scanner method table, allocates zeroed private
  * scan state without sleeping, initialises the start and per-channel
  * tasks and hooks up the default curchan/mindwell handlers.  If the
  * allocation fails ic_scan is left NULL and nothing else is set up.
  */
 void
 ieee80211_swscan_attach(struct ieee80211com *ic)
 {
 	struct scan_state *ss_priv;
 
 	/* Default method table */
 	ic->ic_scan_methods = &swscan_methods;
 
 	/* Initial scan state */
 	ss_priv = IEEE80211_MALLOC(sizeof(*ss_priv), M_80211_SCAN,
 	    IEEE80211_M_NOWAIT | IEEE80211_M_ZERO);
 	if (ss_priv == NULL) {
 		ic->ic_scan = NULL;
 		return;
 	}
 
 	TASK_INIT(&ss_priv->ss_scan_start, 0, scan_start, ss_priv);
 	TIMEOUT_TASK_INIT(ic->ic_tq, &ss_priv->ss_scan_curchan, 0,
 	    scan_curchan_task, ss_priv);
 
 	ic->ic_scan = &ss_priv->base;
 	ss_priv->base.ss_ic = ic;
 
 	ic->ic_scan_curchan = scan_curchan;
 	ic->ic_scan_mindwell = scan_mindwell;
 }
diff --git a/sys/net80211/ieee80211_sta.c b/sys/net80211/ieee80211_sta.c
index 719df1bcfacc..7dde2a609459 100644
--- a/sys/net80211/ieee80211_sta.c
+++ b/sys/net80211/ieee80211_sta.c
@@ -1,2070 +1,2071 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2008 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11 Station mode support.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
 #include <net/if_dl.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net/bpf.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_sta.h>
 #include <net80211/ieee80211_input.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 #include <net80211/ieee80211_ratectl.h>
 #include <net80211/ieee80211_sta.h>
 #include <net80211/ieee80211_vht.h>
 
 /*
  * Mask a rate with IEEE80211_RATE_VAL and halve the result.  The value
  * is printed as "%uMb" by sta_newstate(), so the input is presumably
  * in units of 0.5 Mb/s -- confirm against the rate definitions.
  */
 #define	IEEE80211_RATE2MBS(r)	(((r) & IEEE80211_RATE_VAL) / 2)
 
 /*
  * Forward declarations for the station-mode handlers that
  * sta_vattach() installs on a vap.
  */
 static	void sta_vattach(struct ieee80211vap *);
 static	void sta_beacon_miss(struct ieee80211vap *);
 static	int sta_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static	int sta_input(struct ieee80211_node *, struct mbuf *,
 	    const struct ieee80211_rx_stats *, int, int);
 static void sta_recv_mgmt(struct ieee80211_node *, struct mbuf *,
 	    int subtype, const struct ieee80211_rx_stats *, int rssi, int nf);
 static void sta_recv_ctl(struct ieee80211_node *, struct mbuf *, int subtype);
 
 /*
  * Register station mode with the com: sta_vattach() becomes the
  * ic_vattach entry for IEEE80211_M_STA.
  */
 void
 ieee80211_sta_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_STA] = sta_vattach;
 }
 
 /*
  * Counterpart of ieee80211_sta_attach().  Intentionally empty: the
  * body performs no work.
  */
 void
 ieee80211_sta_detach(struct ieee80211com *ic)
 {
 }
 
 /*
  * Per-vap detach hook, installed as iv_opdetach by sta_vattach().
  * Intentionally empty: the body performs no work.
  */
 static void
 sta_vdetach(struct ieee80211vap *vap)
 {
 }
 
 /*
  * Per-vap attach hook for station mode: point the vap's method
  * pointers at the handlers implemented in this file.
  */
 static void
 sta_vattach(struct ieee80211vap *vap)
 {
 	/* receive path */
 	vap->iv_input = sta_input;
 	vap->iv_recv_mgmt = sta_recv_mgmt;
 	vap->iv_recv_ctl = sta_recv_ctl;
 
 	/* state machine and beacon miss handling */
 	vap->iv_newstate = sta_newstate;
 	vap->iv_bmiss = sta_beacon_miss;
 
 	/* teardown */
 	vap->iv_opdetach = sta_vdetach;
 }
 
 /*
  * Handle a beacon miss event.  The common code filters out
  * spurious events that can happen when scanning and/or before
  * reaching RUN state.
  *
  * A pending channel switch is simply completed.  Otherwise the bss is
  * probed until iv_bmiss_max misses have been counted; after that the
  * vap moves to ASSOC (automatic roaming) or to SCAN (anything else).
  */
 static void
 sta_beacon_miss(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	KASSERT((ic->ic_flags & IEEE80211_F_SCAN) == 0, ("scanning"));
 	KASSERT(vap->iv_state >= IEEE80211_S_RUN,
 	    ("wrong state %s", ieee80211_state_name[vap->iv_state]));
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE | IEEE80211_MSG_DEBUG,
 	    "beacon miss, mode %s state %s\n",
 	    ieee80211_opmode_name[vap->iv_opmode],
 	    ieee80211_state_name[vap->iv_state]);
 
 	if (vap->iv_state == IEEE80211_S_CSA) {
 		/*
 		 * A channel switch is pending.  Assume the beacon that
 		 * would have completed it was the one we missed and
 		 * force the switch now; if that guess is wrong the AP
 		 * will not be found on the new channel and we fall
 		 * back to a normal scan.
 		 */
 		ieee80211_csa_completeswitch(ic);
 		return;
 	}
 
 	vap->iv_bmiss_count++;
 	if (vap->iv_bmiss_count < vap->iv_bmiss_max) {
 		/*
 		 * Not at the limit yet: send a directed probe request
 		 * before resorting to a scan.  A response resets
 		 * iv_bmiss_count.  Some cards report beacon misses
 		 * by mistake, so this avoids an expensive scan when
 		 * the ap is in fact still there.
 		 */
 		ieee80211_send_probereq(vap->iv_bss, vap->iv_myaddr,
 		    vap->iv_bss->ni_bssid, vap->iv_bss->ni_bssid,
 		    vap->iv_bss->ni_essid, vap->iv_bss->ni_esslen);
 		return;
 	}
 
 	/* Limit reached: stop the s/w timer, restart the count, record it. */
 	callout_stop(&vap->iv_swbmiss);
 	vap->iv_bmiss_count = 0;
 	vap->iv_stats.is_beacon_miss++;
 
 	if (vap->iv_roaming != IEEE80211_ROAMING_AUTO) {
 		/*
 		 * Somebody else (e.g. a user-mode application) is in
 		 * charge of state changes.  Do nothing that would
 		 * confuse them; just drop into scan mode so they are
 		 * notified of the state change and given control.
 		 */
 		ieee80211_new_state(vap, IEEE80211_S_SCAN, 0);
 		return;
 	}
 
 #ifdef IEEE80211_SUPPORT_SUPERG
 	/*
 	 * With dynamic turbo in use, attempt to switch modes before
 	 * reassociating.
 	 */
 	if (IEEE80211_ATH_CAP(vap, vap->iv_bss, IEEE80211_NODE_TURBOP))
 		ieee80211_dturbo_switch(vap,
 		    ic->ic_bsschan->ic_flags ^ IEEE80211_CHAN_TURBO);
 #endif
 	/* Try to reassociate before scanning for a new ap. */
 	ieee80211_new_state(vap, IEEE80211_S_ASSOC, 1);
 }
 
 /*
  * Handle deauth with reason.  Authentication is retried only for
  * the reasons where a retry might succeed; for every other reason
  * the ap is reported to the scan code as failed and, when roaming
  * is automatic, a scan check is requested.
  */
 static void
 sta_authretry(struct ieee80211vap *vap, struct ieee80211_node *ni, int reason)
 {
 	int retry;
 
 	/* Classify the reason first, act on it afterwards. */
 	switch (reason) {
 	case IEEE80211_STATUS_SUCCESS:		/* NB: MLME assoc */
 	case IEEE80211_STATUS_TIMEOUT:
 	case IEEE80211_REASON_ASSOC_EXPIRE:
 	case IEEE80211_REASON_NOT_AUTHED:
 	case IEEE80211_REASON_NOT_ASSOCED:
 	case IEEE80211_REASON_ASSOC_LEAVE:
 	case IEEE80211_REASON_ASSOC_NOT_AUTHED:
 		retry = 1;
 		break;
 	default:
 		retry = 0;
 		break;
 	}
 
 	if (retry) {
 		IEEE80211_SEND_MGMT(ni, IEEE80211_FC0_SUBTYPE_AUTH, 1);
 		return;
 	}
 
 	ieee80211_scan_assoc_fail(vap, vap->iv_bss->ni_macaddr, reason);
 	if (vap->iv_roaming == IEEE80211_ROAMING_AUTO)
 		ieee80211_check_scan_current(vap);
 }
 
 /*
  * Arm the software beacon miss timer.  Does nothing unless the vap
  * has IEEE80211_FEXT_SWBMISS set.
  */
 static void
 sta_swbmiss_start(struct ieee80211vap *vap)
 {
 
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_SWBMISS) == 0)
 		return;
 
 	/*
 	 * The device has no hardware support for beacon miss
 	 * detection.  The period is fudged a little (twice the miss
 	 * threshold times the bss beacon interval) because this is
 	 * done in software.
 	 */
 	vap->iv_swbmiss_count = 0;
 	vap->iv_swbmiss_period = IEEE80211_TU_TO_TICKS(
 	    2 * vap->iv_bmissthreshold * vap->iv_bss->ni_intval);
 	callout_reset(&vap->iv_swbmiss, vap->iv_swbmiss_period,
 	    ieee80211_swbmiss, vap);
 }
 
 /*
  * IEEE80211_M_STA vap state machine handler.
  * This routine handles the main states in the 802.11 protocol.
  *
  * vap    - vap changing state; IEEE80211_LOCK_ASSERT() is applied to
  *          its com on entry.
  * nstate - state being entered.  It is stored in iv_state before any
  *          of the per-state work below runs.
  * arg    - transition specific argument; its meaning depends on the
  *          (old state, new state) pair, see the individual cases.
  *
  * Always returns 0.  An unexpected transition is only reported through
  * IEEE80211_DPRINTF; iv_state has already been updated by then.
  */
 static int
 sta_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node *ni;
 	enum ieee80211_state ostate;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s (%d)\n",
 	    __func__, ieee80211_state_name[ostate],
 	    ieee80211_state_name[nstate], arg);
 	vap->iv_state = nstate;			/* state transition */
 	callout_stop(&vap->iv_mgtsend);		/* XXX callout_drain */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	ni = vap->iv_bss;			/* NB: no reference held */
 	/* The s/w beacon miss timer is re-armed below where required. */
 	if (vap->iv_flags_ext & IEEE80211_FEXT_SWBMISS)
 		callout_stop(&vap->iv_swbmiss);
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		/* Tell the ap we are going away, as far as we had got. */
 		switch (ostate) {
 		case IEEE80211_S_SLEEP:
 			/* XXX wakeup */
 			/* XXX driver hook to wakeup the hardware? */
 			/* FALLTHROUGH */
 		case IEEE80211_S_RUN:
 			IEEE80211_SEND_MGMT(ni,
 			    IEEE80211_FC0_SUBTYPE_DISASSOC,
 			    IEEE80211_REASON_ASSOC_LEAVE);
 			ieee80211_sta_leave(ni);
 			break;
 		case IEEE80211_S_ASSOC:
 			IEEE80211_SEND_MGMT(ni,
 			    IEEE80211_FC0_SUBTYPE_DEAUTH,
 			    IEEE80211_REASON_AUTH_LEAVE);
 			break;
 		case IEEE80211_S_SCAN:
 			ieee80211_cancel_scan(vap);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 		}
 		/* Let the authenticator, if it has a detach hook, clean up. */
 		if (vap->iv_auth->ia_detach != NULL)
 			vap->iv_auth->ia_detach(vap);
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			/*
 			 * Initiate a scan.  We can come here as a result
 			 * of an IEEE80211_IOC_SCAN_REQ too in which case
 			 * the vap will be marked with IEEE80211_FEXT_SCANREQ
 			 * and the scan request parameters will be present
 			 * in iv_scanreq.  Otherwise we do the default.
 			 */
 			if (vap->iv_flags_ext & IEEE80211_FEXT_SCANREQ) {
 				ieee80211_check_scan(vap,
 				    vap->iv_scanreq_flags,
 				    vap->iv_scanreq_duration,
 				    vap->iv_scanreq_mindwell,
 				    vap->iv_scanreq_maxdwell,
 				    vap->iv_scanreq_nssid, vap->iv_scanreq_ssid);
 				/* The request is consumed; clear the marker. */
 				vap->iv_flags_ext &= ~IEEE80211_FEXT_SCANREQ;
 			} else
 				ieee80211_check_scan_current(vap);
 			break;
 		case IEEE80211_S_SCAN:
 		case IEEE80211_S_AUTH:
 		case IEEE80211_S_ASSOC:
 			/*
 			 * These can happen either because of a timeout
 			 * on an assoc/auth response or because of a
 			 * change in state that requires a reset.  For
 			 * the former we're called with a non-zero arg
 			 * that is the cause for the failure; pass this
 			 * to the scan code so it can update state.
 			 * Otherwise trigger a new scan unless we're in
 			 * manual roaming mode in which case an application
 			 * must issue an explicit scan request.
 			 */
 			if (arg != 0)
 				ieee80211_scan_assoc_fail(vap,
 					vap->iv_bss->ni_macaddr, arg);
 			if (vap->iv_roaming == IEEE80211_ROAMING_AUTO)
 				ieee80211_check_scan_current(vap);
 			break;
 		case IEEE80211_S_SLEEP:		/* beacon miss */
 			/*
 			 * XXX if in sleep we need to wakeup the hardware.
 			 */
 			/* FALLTHROUGH */
 		case IEEE80211_S_RUN:		/* beacon miss */
 			/*
 			 * Beacon miss.  Notify user space and if not
 			 * under control of a user application (roaming
 			 * manual) kick off a scan to re-connect.
 			 */
 
 			ieee80211_sta_leave(ni);
 			if (vap->iv_roaming == IEEE80211_ROAMING_AUTO)
 				ieee80211_check_scan_current(vap);
 			break;
 		default:
 			goto invalid;
 		}
 		break;
 	case IEEE80211_S_AUTH:
 		/*
 		 * For the cases that look at it, arg carries a frame
 		 * subtype in its low 8 bits; the DEAUTH case from
 		 * AUTH/ASSOC passes the remaining bits (arg>>8) to
 		 * sta_authretry() as the reason.
 		 */
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 		case IEEE80211_S_SCAN:
 			IEEE80211_SEND_MGMT(ni,
 			    IEEE80211_FC0_SUBTYPE_AUTH, 1);
 			break;
 		case IEEE80211_S_AUTH:
 		case IEEE80211_S_ASSOC:
 			switch (arg & 0xff) {
 			case IEEE80211_FC0_SUBTYPE_AUTH:
 				/* ??? */
 				IEEE80211_SEND_MGMT(ni,
 				    IEEE80211_FC0_SUBTYPE_AUTH, 2);
 				break;
 			case IEEE80211_FC0_SUBTYPE_DEAUTH:
 				sta_authretry(vap, ni, arg>>8);
 				break;
 			}
 			break;
 		case IEEE80211_S_SLEEP:
 		case IEEE80211_S_RUN:
 			switch (arg & 0xff) {
 			case IEEE80211_FC0_SUBTYPE_AUTH:
 				IEEE80211_SEND_MGMT(ni,
 				    IEEE80211_FC0_SUBTYPE_AUTH, 2);
 				/* Undo the iv_state update made on entry. */
 				vap->iv_state = IEEE80211_S_RUN; /* stay RUN */
 				break;
 			case IEEE80211_FC0_SUBTYPE_DEAUTH:
 				ieee80211_sta_leave(ni);
 				if (vap->iv_roaming == IEEE80211_ROAMING_AUTO) {
 					/* try to reauth */
 					IEEE80211_SEND_MGMT(ni,
 					    IEEE80211_FC0_SUBTYPE_AUTH, 1);
 				}
 				break;
 			}
 			break;
 		default:
 			goto invalid;
 		}
 		break;
 	case IEEE80211_S_ASSOC:
 		switch (ostate) {
 		case IEEE80211_S_AUTH:
 		case IEEE80211_S_ASSOC:
 			IEEE80211_SEND_MGMT(ni,
 			    IEEE80211_FC0_SUBTYPE_ASSOC_REQ, 0);
 			break;
 		case IEEE80211_S_SLEEP:		/* cannot happen */
 		case IEEE80211_S_RUN:
 			ieee80211_sta_leave(ni);
 			if (vap->iv_roaming == IEEE80211_ROAMING_AUTO) {
 				/* A non-zero arg selects a reassoc request. */
 				IEEE80211_SEND_MGMT(ni, arg ?
 				    IEEE80211_FC0_SUBTYPE_REASSOC_REQ :
 				    IEEE80211_FC0_SUBTYPE_ASSOC_REQ, 0);
 			}
 			break;
 		default:
 			goto invalid;
 		}
 		break;
 	case IEEE80211_S_RUN:
 		if (vap->iv_flags & IEEE80211_F_WPA) {
 			/* XXX validate prerequisites */
 		}
 		switch (ostate) {
 		case IEEE80211_S_RUN:
 		case IEEE80211_S_CSA:
 			break;
 		case IEEE80211_S_AUTH:		/* when join is done in fw */
 		case IEEE80211_S_ASSOC:
 #ifdef IEEE80211_DEBUG
 			if (ieee80211_msg_debug(vap)) {
 				ieee80211_note(vap, "%s with %s ssid ",
 				    (vap->iv_opmode == IEEE80211_M_STA ?
 				    "associated" : "synchronized"),
 				    ether_sprintf(ni->ni_bssid));
 				ieee80211_print_essid(vap->iv_bss->ni_essid,
 				    ni->ni_esslen);
 				/* XXX MCS/HT */
 				printf(" channel %d start %uMb\n",
 				    ieee80211_chan2ieee(ic, ic->ic_curchan),
 				    IEEE80211_RATE2MBS(ni->ni_txrate));
 			}
 #endif
 			/*
 			 * Report the successful association to the scan
 			 * code and announce the join; the second argument
 			 * is non-zero when arg is the ASSOC_RESP subtype.
 			 */
 			ieee80211_scan_assoc_success(vap, ni->ni_macaddr);
 			ieee80211_notify_node_join(ni, 
 			    arg == IEEE80211_FC0_SUBTYPE_ASSOC_RESP);
 			break;
 		case IEEE80211_S_SLEEP:
 			/* Wake up from sleep */
 			vap->iv_sta_ps(vap, 0);
 			break;
 		default:
 			goto invalid;
 		}
 		ieee80211_sync_curchan(ic);
 		/* The timer was stopped on entry; restart it for a new RUN. */
 		if (ostate != IEEE80211_S_RUN)
 			sta_swbmiss_start(vap);
 		/*
 		 * When 802.1x is not in use mark the port authorized
 		 * at this point so traffic can flow.
 		 */
 		if (ni->ni_authmode != IEEE80211_AUTH_8021X)
 			ieee80211_node_authorize(ni);
 		/*
 		 * Fake association when joining an existing bss.
 		 *
 		 * Don't do this if we're doing SLEEP->RUN.
 		 */
 		if (ic->ic_newassoc != NULL && ostate != IEEE80211_S_SLEEP)
 			ic->ic_newassoc(vap->iv_bss, (ostate != IEEE80211_S_RUN));
 		break;
 	case IEEE80211_S_CSA:
 		/* CSA can only be entered from RUN. */
 		if (ostate != IEEE80211_S_RUN)
 			goto invalid;
 		break;
 	case IEEE80211_S_SLEEP:
 		/* Re-arm the s/w beacon miss timer, then enter power save. */
 		sta_swbmiss_start(vap);
 		vap->iv_sta_ps(vap, 1);
 		break;
 	default:
 	invalid:
 		/* Reached for unknown states and via goto from the cases above. */
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE,
 		    "%s: unexpected state transition %s -> %s\n", __func__,
 		    ieee80211_state_name[ostate], ieee80211_state_name[nstate]);
 		break;
 	}
 	return 0;
 }
 
 /*
  * Return non-zero if the frame is an echo of a multicast
  * frame sent by ourself.  The dir is known to be DSTODS.
  *
  * The frame is treated as a 4-address frame: address 3 must be
  * multicast and address 4 (read from the QoS or the plain 4-address
  * layout, depending on IEEE80211_QOS_HAS_SEQ) must equal iv_myaddr.
  */
 static __inline int
 isdstods_mcastecho(struct ieee80211vap *vap, const struct ieee80211_frame *wh)
 {
 	const uint8_t *sa;
 
 	KASSERT(vap->iv_opmode == IEEE80211_M_STA, ("wrong mode"));
 
 	if (!IEEE80211_IS_MULTICAST(wh->i_addr3))
 		return 0;
 
 	if (IEEE80211_QOS_HAS_SEQ(wh))
 		sa = ((const struct ieee80211_qosframe_addr4 *)wh)->i_addr4;
 	else
 		sa = ((const struct ieee80211_frame_addr4 *)wh)->i_addr4;
 
 	return IEEE80211_ADDR_EQ(sa, vap->iv_myaddr);
 }
 
 /*
  * Return non-zero if the frame is an echo of a multicast
  * frame sent by ourself.  The dir is known to be FROMDS.
  *
  * That is the case when address 1 is multicast and address 3 is
  * this vap's own address.
  */
 static __inline int
 isfromds_mcastecho(struct ieee80211vap *vap, const struct ieee80211_frame *wh)
 {
 	KASSERT(vap->iv_opmode == IEEE80211_M_STA, ("wrong mode"));
 
 	return IEEE80211_IS_MULTICAST(wh->i_addr1) ?
 	    IEEE80211_ADDR_EQ(wh->i_addr3, vap->iv_myaddr) : 0;
 }
 
 /*
  * Decide if a received management frame should be
  * printed when debugging is enabled.  This filters some
  * of the less interesting frames that come frequently
  * (e.g. beacons).
  *
  * Probe requests are never printed, beacons only while the com has
  * IEEE80211_F_SCAN set, every other subtype always.
  */
 static __inline int
 doprint(struct ieee80211vap *vap, int subtype)
 {
 	if (subtype == IEEE80211_FC0_SUBTYPE_PROBE_REQ)
 		return 0;
 	if (subtype == IEEE80211_FC0_SUBTYPE_BEACON)
 		return (vap->iv_ic->ic_flags & IEEE80211_F_SCAN);
 	return 1;
 }
 
 /*
  * Process a received frame.  The node associated with the sender
  * should be supplied.  If nothing was found in the node table then
  * the caller is assumed to supply a reference to iv_bss instead.
  * The RSSI and a timestamp are also supplied.  The RSSI data is used
  * during AP scanning to select a AP to associate with; it can have
  * any units so long as values have consistent units and higher values
  * mean ``better signal''.  The receive timestamp is currently not used
  * by the 802.11 layer.
  *
  * ni   - node the frame is attributed to.
  * m    - the frame.  It is not handed back to the caller: it is freed
  *        at the out label unless an earlier step took it over (m is
  *        set to NULL) or the data path returned after passing it on.
  * rxs  - optional receive status, may be NULL; only the
  *        IEEE80211_RX_F_DECRYPTED packet flag is examined here.
  * rssi - receive signal strength, folded into ni_avgrssi.
  * nf   - noise floor, stored in ni_noise.
  *
  * Returns the frame type.  Data frames that get past the 802.11
  * decapsulation return IEEE80211_FC0_TYPE_DATA directly.  Frames
  * rejected before the type has been parsed return the sentinel that
  * 'type' (a uint8_t) is initialized with, i.e. -1 truncated to 8 bits.
  */
 static int
 sta_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct ether_header *eh;
 	int hdrspace, need_tap = 1;	/* mbuf need to be tapped. */
 	uint8_t dir, type, subtype, qos;
 	uint8_t *bssid;
 	int is_hw_decrypted = 0;
 	int has_decrypted = 0;
 
 	KASSERT(ni != NULL, ("%s: null node, mbuf %p", __func__, m));
 
 	/* Early init in case of early error case. */
 	type = -1;
 
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields past outside
 	 * ieee80211_frame_min (or other shorter frames) w/o first
 	 * validating the data is present.
 	 */
 	wh = mtod(m, struct ieee80211_frame *);
 
 	if (m->m_pkthdr.len < 2 || m->m_pkthdr.len < ieee80211_anyhdrsize(wh)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto err;
 	}
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version, fc %02x:%02x",
 		    wh->i_fc[0], wh->i_fc[1]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 
 	/*
 	 * Some devices do hardware decryption all the way through
 	 * to pretending the frame wasn't encrypted in the first place.
 	 * So, tag it appropriately so it isn't discarded inappropriately.
 	 */
 	if ((rxs != NULL) && (rxs->c_pktflags & IEEE80211_RX_F_DECRYPTED))
 		is_hw_decrypted = 1;
 
 	if (m->m_flags & M_AMPDU_MPDU) {
 		/*
 		 * Fastpath for A-MPDU reorder q resubmission.  Frames
 		 * w/ M_AMPDU_MPDU marked have already passed through
 		 * here but were received out of order and been held on
 		 * the reorder queue.  When resubmitted they are marked
 		 * with the M_AMPDU_MPDU flag and we can bypass most of
 		 * the normal processing.
 		 */
 		type = IEEE80211_FC0_TYPE_DATA;
 		dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 		subtype = IEEE80211_FC0_SUBTYPE_QOS_DATA;
 		hdrspace = ieee80211_hdrspace(ic, wh);	/* XXX optimize? */
 		goto resubmit_ampdu;
 	}
 
 	/* Any frame that got this far reloads the node's inactivity timer. */
 	ni->ni_inact = ni->ni_inact_reload;
 
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 	/*
 	 * Control frames are not following the header scheme of data and mgmt
 	 * frames so we do not apply extra checks here.
 	 * We probably should do checks on RA (+TA) where available for those
 	 * too, but for now do not drop them.
 	 */
 	if (type != IEEE80211_FC0_TYPE_CTL &&
 	    (ic->ic_flags & IEEE80211_F_SCAN) == 0) {
 		/* The transmitter address must match the node's bssid. */
 		bssid = wh->i_addr2;
 		if (!IEEE80211_ADDR_EQ(bssid, ni->ni_bssid)) {
 			/* not interested in */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    bssid, NULL, "%s", "not to bss");
 			vap->iv_stats.is_rx_wrongbss++;
 			goto out;
 		}
 
 		/*
 		 * Some devices may be in a promiscuous mode
 		 * where they receive frames for multiple station
 		 * addresses.
 		 *
 		 * If we receive a data frame that isn't
 		 * destined to our VAP MAC, drop it.
 		 *
 		 * XXX TODO: This is only enforced when not scanning;
 		 * XXX it assumes a software-driven scan will put the NIC
 		 * XXX into a "no data frames" mode before setting this
 		 * XXX flag. Otherwise it may be possible that we'll still
 		 * XXX process data frames whilst scanning.
 		 */
 		if ((! IEEE80211_IS_MULTICAST(wh->i_addr1))
 		    && (! IEEE80211_ADDR_EQ(wh->i_addr1, IF_LLADDR(ifp)))) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    bssid, NULL, "not to cur sta: lladdr=%6D, addr1=%6D",
 			    IF_LLADDR(ifp), ":", wh->i_addr1, ":");
 			vap->iv_stats.is_rx_wrongbss++;
 			goto out;
 		}
 
 		IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 		ni->ni_noise = nf;
 		if ( IEEE80211_HAS_SEQ(type, subtype) &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			uint8_t tid = ieee80211_gettid(wh);
 			if (IEEE80211_QOS_HAS_SEQ(wh) &&
 			    TID_TO_WME_AC(tid) >= WME_AC_VI)
 				ic->ic_wme.wme_hipri_traffic++;
 			if (! ieee80211_check_rxseq(ni, wh, bssid, rxs))
 				goto out;
 		}
 	}
 
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (m->m_len < hdrspace &&
 		    (m = m_pullup(m, hdrspace)) == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		/*
 		 * Handle A-MPDU re-ordering.  If the frame is to be
 		 * processed directly then ieee80211_ampdu_reorder
 		 * will return 0; otherwise it has consumed the mbuf
 		 * and we should do nothing more with it.
 		 */
 		if ((m->m_flags & M_AMPDU) &&
 		    (dir == IEEE80211_FC1_DIR_FROMDS ||
 		     dir == IEEE80211_FC1_DIR_DSTODS) &&
 		    ieee80211_ampdu_reorder(ni, m, rxs) != 0) {
 			m = NULL;
 			goto out;
 		}
 	resubmit_ampdu:
 		/* A station only accepts FROMDS, or DSTODS when DWDS is on. */
 		if (dir == IEEE80211_FC1_DIR_FROMDS) {
 			if ((ifp->if_flags & IFF_SIMPLEX) &&
 			    isfromds_mcastecho(vap, wh)) {
 				/*
 				 * In IEEE802.11 network, multicast
 				 * packets sent from "me" are broadcast
 				 * from the AP; silently discard for
 				 * SIMPLEX interface.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "data", "%s", "multicast echo");
 				vap->iv_stats.is_rx_mcastecho++;
 				goto out;
 			}
 			if ((vap->iv_flags & IEEE80211_F_DWDS) &&
 			    IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 				/*
 				 * DWDS sta's must drop 3-address mcast frames
 				 * as they will be sent separately as a 4-addr
 				 * frame.  Accepting the 3-addr frame will
 				 * confuse the bridge into thinking the sending
 				 * sta is located at the end of WDS link.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh,
 				    "3-address data", "%s", "DWDS enabled");
 				vap->iv_stats.is_rx_mcastecho++;
 				goto out;
 			}
 		} else if (dir == IEEE80211_FC1_DIR_DSTODS) {
 			if ((vap->iv_flags & IEEE80211_F_DWDS) == 0) {
 				IEEE80211_DISCARD(vap,
 				    IEEE80211_MSG_INPUT, wh, "4-address data",
 				    "%s", "DWDS not enabled");
 				vap->iv_stats.is_rx_wrongdir++;
 				goto out;
 			}
 			if ((ifp->if_flags & IFF_SIMPLEX) &&
 			    isdstods_mcastecho(vap, wh)) {
 				/*
 				 * In IEEE802.11 network, multicast
 				 * packets sent from "me" are broadcast
 				 * from the AP; silently discard for
 				 * SIMPLEX interface.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh,
 				    "4-address data", "%s", "multicast echo");
 				vap->iv_stats.is_rx_mcastecho++;
 				goto out;
 			}
 		} else {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT, wh,
 			    "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto out;
 		}
 
 		/*
 		 * Handle privacy requirements for hardware decryption
 		 * devices.
 		 *
 		 * For those devices, a handful of things happen.
 		 *
 		 * + If IV has been stripped, then we can't run
 		 *   ieee80211_crypto_decap() - none of the key
 		 * + If MIC has been stripped, we can't validate
 		 *   MIC here.
 		 * + If MIC fails, then we need to communicate a
 		 *   MIC failure up to the stack - but we don't know
 		 *   which key was used.
 		 */
 
 		/*
 		 * Handle privacy requirements.  Note that we
 		 * must not be preempted from here until after
 		 * we (potentially) call ieee80211_crypto_demic;
 		 * otherwise we may violate assumptions in the
 		 * crypto cipher modules used to do delayed update
 		 * of replay sequence numbers.
 		 */
 		if (is_hw_decrypted || IEEE80211_IS_PROTECTED(wh)) {
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "WEP", "%s", "PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				IEEE80211_NODE_STAT(ni, rx_noprivacy);
 				goto out;
 			}
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				IEEE80211_NODE_STAT(ni, rx_wepfail);
 				goto out;
 			}
 			/* Re-fetch the header and clear the protected bit. */
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 			has_decrypted = 1;
 		} else {
 			/* XXX M_WEP and IEEE80211_F_PRIVACY */
 			key = NULL;
 		}
 
 		/*
 		 * Save QoS bits for use below--before we strip the header.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_QOS_DATA)
 			qos = ieee80211_getqos(wh)[0];
 		else
 			qos = 0;
 
 		/*
 		 * Next up, any fragmentation.
 		 */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace, has_decrypted);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = NULL;		/* no longer valid, catch any uses */
 
 		/*
 		 * Next strip any MSDU crypto bits.
 		 *
 		 * Note: we can't do MIC stripping/verification if the
 		 * upper layer has stripped it.  We have to check MIC
 		 * ourselves.  So, key may be NULL, but we have to check
 		 * the RX status.
 		 */
 		if (!ieee80211_crypto_demic(vap, key, m, 0)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "demic error");
 			vap->iv_stats.is_rx_demicfail++;
 			IEEE80211_NODE_STAT(ni, rx_demicfail);
 			goto out;
 		}
 
 		/* copy to listener after decrypt */
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		/* Already tapped; the out path must not tap it again. */
 		need_tap = 0;
 
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = ieee80211_decap(vap, m, hdrspace, qos);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		/* No single ethernet header is looked at for an A-MSDU. */
 		if (!(qos & IEEE80211_QOS_AMSDU))
 			eh = mtod(m, struct ether_header *);
 		else
 			eh = NULL;
 		if (!ieee80211_node_is_authorized(ni)) {
 			/*
 			 * Deny any non-PAE frames received prior to
 			 * authorization.  For open/shared-key
 			 * authentication the port is mark authorized
 			 * after authentication completes.  For 802.1x
 			 * the port is not marked authorized by the
 			 * authenticator until the handshake has completed.
 			 */
 			if (eh == NULL ||
 			    eh->ether_type != htons(ETHERTYPE_PAE)) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 				    ni->ni_macaddr, "data", "unauthorized or "
 				    "unknown port: ether type 0x%x len %u",
 				    eh == NULL ? -1 : eh->ether_type,
 				    m->m_pkthdr.len);
 				vap->iv_stats.is_rx_unauth++;
 				IEEE80211_NODE_STAT(ni, rx_unauth);
 				goto err;
 			}
 		} else {
 			/*
 			 * When denying unencrypted frames, discard
 			 * any non-PAE frames received without encryption.
 			 */
 			if ((vap->iv_flags & IEEE80211_F_DROPUNENC) &&
 			    ((has_decrypted == 0) && (m->m_flags & M_WEP) == 0) &&
 			    (is_hw_decrypted == 0) &&
 			    (eh == NULL ||
 			     eh->ether_type != htons(ETHERTYPE_PAE))) {
 				/*
 				 * Drop unencrypted frames.
 				 */
 				vap->iv_stats.is_rx_unencrypted++;
 				IEEE80211_NODE_STAT(ni, rx_unencrypted);
 				goto out;
 			}
 		}
 		/* XXX require HT? */
 		if (qos & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 			m = ieee80211_decap_fastframe(vap, ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 #endif
 		}
 		ieee80211_deliver_data(vap, ni, m);
 		return IEEE80211_FC0_TYPE_DATA;
 
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			/*
 			 * Tagged "mgt" like the other management frame
 			 * discards in this case, so the debug output
 			 * does not report the frame as data.
 			 */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "mgt", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if ((ieee80211_msg_debug(vap) && doprint(vap, subtype)) ||
 		    ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 
 		/*
 		 * Note: See above for hardware offload privacy requirements.
 		 *       It also applies here.
 		 */
 
 		/*
 		 * Again, having encrypted flag set check would be good, but
 		 * then we have to also handle crypto_decap() like above.
 		 */
 		if (IEEE80211_IS_PROTECTED(wh)) {
 			if (subtype != IEEE80211_FC0_SUBTYPE_AUTH) {
 				/*
 				 * Only shared key auth frames with a challenge
 				 * should be encrypted, discard all others.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, ieee80211_mgt_subtype_name(subtype),
 				    "%s", "WEP set but not permitted");
 				vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 				goto out;
 			}
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "mgt", "%s", "WEP set but PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				goto out;
 			}
 			hdrspace = ieee80211_hdrspace(ic, wh);
 
 			/*
 			 * Again, if IV/MIC was stripped, then this whole
 			 * setup will fail.  That's going to need some poking.
 			 */
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				goto out;
 			}
 			has_decrypted = 1;
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 		}
 		/* The handler does not take the mbuf; it is freed at out. */
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		goto out;
 
 	case IEEE80211_FC0_TYPE_CTL:
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		vap->iv_recv_ctl(ni, m, subtype);
 		goto out;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, NULL, "bad frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	/* Error exits count an input error, then share the cleanup below. */
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 }
 
 /*
  * Handle an open system authentication frame.
  *
  * The frame is ignored (and counted as a bad auth) when the node is
  * configured for shared key authentication, when the vap is not in
  * the AUTH state, or when seq is not the open response.  A non-zero
  * status sends the vap back to SCAN, a zero status moves it on to
  * ASSOC.
  */
 static void
 sta_auth_open(struct ieee80211_node *ni, struct ieee80211_frame *wh,
     int rssi, int nf, uint16_t seq, uint16_t status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 
 	if (ni->ni_authmode == IEEE80211_AUTH_SHARED) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "open auth",
 		    "bad sta auth mode %u", ni->ni_authmode);
 		vap->iv_stats.is_rx_bad_auth++;	/* XXX */
 		return;
 	}
 
 	/* Not waiting for an auth response at all. */
 	if (vap->iv_state != IEEE80211_S_AUTH) {
 		vap->iv_stats.is_rx_bad_auth++;
 		return;
 	}
 	/* Not the response step of the open exchange. */
 	if (seq != IEEE80211_AUTH_OPEN_RESPONSE) {
 		vap->iv_stats.is_rx_bad_auth++;
 		return;
 	}
 
 	if (status == 0) {
 		ieee80211_new_state(vap, IEEE80211_S_ASSOC, 0);
 		return;
 	}
 
 	/* Failure: log it, record the status, and go back to scanning. */
 	IEEE80211_NOTE(vap, IEEE80211_MSG_DEBUG | IEEE80211_MSG_AUTH,
 	    ni, "open auth failed (reason %d)", status);
 	vap->iv_stats.is_rx_auth_fail++;
 	vap->iv_stats.is_rx_authfail_code = status;
 	ieee80211_new_state(vap, IEEE80211_S_SCAN,
 	    IEEE80211_SCAN_FAIL_STATUS);
 }
 
 /*
  * Handle a Shared Key authentication frame while operating as a station.
  *
  * frm/efrm bound the optional trailing information element (the challenge
  * text); seq and status come from the fixed part of the auth frame.
  *
  * Frames are rejected (and, when the vap is in AUTH state, the state
  * machine is kicked back to SCAN) when privacy is disabled, the node's auth
  * mode forbids shared key, the trailing ie overruns the frame, or a
  * challenge/response frame lacks a challenge of the expected length.
  * Otherwise, in AUTH state, a CHALLENGE frame stores the challenge text and
  * sends the next frame in the sequence, and a PASS frame with zero status
  * advances the vap to ASSOC.
  */
 static void
 sta_auth_shared(struct ieee80211_node *ni, struct ieee80211_frame *wh,
     uint8_t *frm, uint8_t *efrm, int rssi, int nf,
     uint16_t seq, uint16_t status)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	uint8_t *challenge;
 
 	/*
 	 * NB: this can happen as we allow pre-shared key
 	 * authentication to be enabled w/o wep being turned
 	 * on so that configuration of these can be done
 	 * in any order.  It may be better to enforce the
 	 * ordering in which case this check would just be
 	 * for sanity/consistency.
 	 */
 	if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "shared key auth",
 		    "%s", " PRIVACY is disabled");
 		goto bad;
 	}
 	/*
 	 * Pre-shared key authentication is evil; accept
 	 * it only if explicitly configured (it is supported
 	 * mainly for compatibility with clients like OS X).
 	 */
 	if (ni->ni_authmode != IEEE80211_AUTH_AUTO &&
 	    ni->ni_authmode != IEEE80211_AUTH_SHARED) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 		    ni->ni_macaddr, "shared key auth",
 		    "bad sta auth mode %u", ni->ni_authmode);
 		vap->iv_stats.is_rx_bad_auth++;	/* XXX maybe a unique error? */
 		goto bad;
 	}
 
 	/* Pick up the (single) trailing ie, if the frame has room for one. */
 	challenge = NULL;
 	if (frm + 1 < efrm) {
 		if ((frm[1] + 2) > (efrm - frm)) {
 			/*
 			 * NB: the overrun is a pointer difference; cast it
 			 *     so the argument really is the int that %d
 			 *     expects.
 			 */
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key auth",
 			    "ie %d/%d too long",
 			    frm[0], (int)((frm[1] + 2) - (efrm - frm)));
 			vap->iv_stats.is_rx_bad_auth++;
 			goto bad;
 		}
 		if (*frm == IEEE80211_ELEMID_CHALLENGE)
 			challenge = frm;
 		frm += frm[1] + 2;
 	}
 	/* Challenge and response frames must carry a full-size challenge. */
 	switch (seq) {
 	case IEEE80211_AUTH_SHARED_CHALLENGE:
 	case IEEE80211_AUTH_SHARED_RESPONSE:
 		if (challenge == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key auth",
 			    "%s", "no challenge");
 			vap->iv_stats.is_rx_bad_auth++;
 			goto bad;
 		}
 		if (challenge[1] != IEEE80211_CHALLENGE_LEN) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_AUTH,
 			    ni->ni_macaddr, "shared key auth",
 			    "bad challenge len %d", challenge[1]);
 			vap->iv_stats.is_rx_bad_auth++;
 			goto bad;
 		}
 		/* FALLTHROUGH */
 	default:
 		break;
 	}
 	if (vap->iv_state != IEEE80211_S_AUTH)
 		return;
 	switch (seq) {
 	case IEEE80211_AUTH_SHARED_PASS:
 		/* The exchange is over; the saved challenge is not needed. */
 		if (ni->ni_challenge != NULL) {
 			IEEE80211_FREE(ni->ni_challenge, M_80211_NODE);
 			ni->ni_challenge = NULL;
 		}
 		if (status != 0) {
 			IEEE80211_NOTE_FRAME(vap,
 			    IEEE80211_MSG_DEBUG | IEEE80211_MSG_AUTH, wh,
 			    "shared key auth failed (reason %d)", status);
 			vap->iv_stats.is_rx_auth_fail++;
 			vap->iv_stats.is_rx_authfail_code = status;
 			return;
 		}
 		ieee80211_new_state(vap, IEEE80211_S_ASSOC, 0);
 		break;
 	case IEEE80211_AUTH_SHARED_CHALLENGE:
 		if (!ieee80211_alloc_challenge(ni))
 			return;
 		/* XXX could optimize by passing recvd challenge */
 		memcpy(ni->ni_challenge, &challenge[2], challenge[1]);
 		IEEE80211_SEND_MGMT(ni,
 			IEEE80211_FC0_SUBTYPE_AUTH, seq + 1);
 		break;
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_AUTH,
 		    wh, "shared key auth", "bad seq %d", seq);
 		vap->iv_stats.is_rx_bad_auth++;
 		return;
 	}
 	return;
 bad:
 	/*
 	 * Kick the state machine.  This short-circuits
 	 * using the mgt frame timeout to trigger the
 	 * state transition.
 	 */
 	if (vap->iv_state == IEEE80211_S_AUTH)
 		ieee80211_new_state(vap, IEEE80211_S_SCAN,
 		    IEEE80211_SCAN_FAIL_STATUS);
 }
 
 /*
  * Parse the WME IE for QoS and U-APSD information.
  *
  * Returns -1 if the IE is too short to be parsed, 1 if it was parsed.
  */
 int
 ieee80211_parse_wmeie(uint8_t *frm, const struct ieee80211_frame *wh,
     struct ieee80211_node *ni)
 {
 	const u_int ielen = frm[1];	/* length octet of the ie */
 
 	/* Start from "no U-APSD" so a rejected ie leaves nothing stale. */
 	ni->ni_uapsd = 0;
 
 	if (ielen >= sizeof(struct ieee80211_wme_param)-2) {
 		/* Long enough: latch the capability octet and report it. */
 		ni->ni_uapsd = frm[WME_CAPINFO_IE_OFFSET];
 
 		IEEE80211_NOTE(ni->ni_vap,
 		    IEEE80211_MSG_POWER | IEEE80211_MSG_ASSOC,
 		    ni, "U-APSD settings from STA: 0x%02x", ni->ni_uapsd);
 
 		return 1;
 	}
 
 	IEEE80211_DISCARD_IE(ni->ni_vap,
 	    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WME,
 	    wh, "WME", "too short, len %u", ielen);
 	return -1;
 }
 
 /*
  * Parse a WME parameter ie into the channel parameters kept in the com
  * structure.
  *
  * *qosinfo receives the QoS info octet (0 when the ie is rejected).
  *
  * Returns -1 if the ie is too short, 0 if its parameter-set count matches
  * the one already recorded (nothing is updated), and 1 once the per-AC
  * parameters and the recorded count have been refreshed.
  */
 int
 ieee80211_parse_wmeparams(struct ieee80211vap *vap, uint8_t *frm,
 	const struct ieee80211_frame *wh, uint8_t *qosinfo)
 {
 	struct ieee80211_wme_state *wme = &vap->iv_ic->ic_wme;
 	const u_int ielen = frm[1];
 	const uint8_t *rec;
 	u_int setcount;
 	int ac;
 
 	*qosinfo = 0;
 
 	if (ielen < sizeof(struct ieee80211_wme_param)-2) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_WME,
 		    wh, "WME", "too short, len %u", ielen);
 		return -1;
 	}
 	*qosinfo = frm[__offsetof(struct ieee80211_wme_param, param_qosInfo)];
 	setcount = *qosinfo & WME_QOSINFO_COUNT;
 
 	/* XXX do proper check for wraparound */
 	if (setcount == wme->wme_wmeChanParams.cap_info)
 		return 0;
 
 	/* Walk the WME_NUM_AC four-byte per-AC records. */
 	rec = frm + __offsetof(struct ieee80211_wme_param, params_acParams);
 	for (ac = 0; ac < WME_NUM_AC; ac++, rec += 4) {
 		struct wmeParams *wmep =
 			&wme->wme_wmeChanParams.cap_wmeParams[ac];
 		const uint8_t aci_aifsn = rec[0];
 		const uint8_t logcw = rec[1];
 
 		/* NB: ACI not used */
 		wmep->wmep_acm = _IEEE80211_MASKSHIFT(aci_aifsn, WME_PARAM_ACM);
 		wmep->wmep_aifsn =
 		    _IEEE80211_MASKSHIFT(aci_aifsn, WME_PARAM_AIFSN);
 		wmep->wmep_logcwmin =
 		    _IEEE80211_MASKSHIFT(logcw, WME_PARAM_LOGCWMIN);
 		wmep->wmep_logcwmax =
 		    _IEEE80211_MASKSHIFT(logcw, WME_PARAM_LOGCWMAX);
 		wmep->wmep_txopLimit = le16dec(rec + 2);
 		IEEE80211_DPRINTF(vap, IEEE80211_MSG_WME,
 		    "%s: WME: %d: acm=%d aifsn=%d logcwmin=%d logcwmax=%d txopLimit=%d\n",
 		    __func__,
 		    ac,
 		    wmep->wmep_acm,
 		    wmep->wmep_aifsn,
 		    wmep->wmep_logcwmin,
 		    wmep->wmep_logcwmax,
 		    wmep->wmep_txopLimit);
 	}
 	wme->wme_wmeChanParams.cap_info = setcount;
 	return 1;
 }
 
 /*
  * Process 11h Channel Switch Announcement (CSA) ie.  If this
  * is the first CSA then initiate the switch.  Otherwise we
  * track state and trigger completion and/or cancel of the switch.
  * XXX should be public for IBSS use
  */
 /*
  * vap: the station vap the frame arrived on; must be in RUN state or later.
  * frm: start of the CSA ie, overlaid with struct ieee80211_csa_ie.
  * wh:  802.11 header of the carrying frame, used only for diagnostics.
  *
  * The com lock is taken for the duration of the CSA state inspection and
  * update.
  */
 static void
 ieee80211_parse_csaparams(struct ieee80211vap *vap, uint8_t *frm,
 	const struct ieee80211_frame *wh)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	const struct ieee80211_csa_ie *csa =
 	    (const struct ieee80211_csa_ie *) frm;
 
 	KASSERT(vap->iv_state >= IEEE80211_S_RUN,
 	    ("state %s", ieee80211_state_name[vap->iv_state]));
 
 	if (csa->csa_mode > 1) {
 		IEEE80211_DISCARD_IE(vap,
 		    IEEE80211_MSG_ELEMID | IEEE80211_MSG_DOTH,
 		    wh, "CSA", "invalid mode %u", csa->csa_mode);
 		return;
 	}
 	IEEE80211_LOCK(ic);
 	if ((ic->ic_flags & IEEE80211_F_CSAPENDING) == 0) {
 		/*
 		 * Convert the channel number to a channel reference.  We
 		 * try first to preserve turbo attribute of the current
 		 * channel then fallback.  Note this will not work if the
 		 * CSA specifies a channel that requires a band switch (e.g.
 		 * 11a => 11g).  This is intentional as 11h is defined only
 		 * for 5GHz/11a and because the switch does not involve a
 		 * reassociation, protocol state (capabilities, negotiated
 		 * rates, etc) may/will be wrong.
 		 */
 		struct ieee80211_channel *c =
 		    ieee80211_find_channel_byieee(ic, csa->csa_newchan,
 			(ic->ic_bsschan->ic_flags & IEEE80211_CHAN_ALLTURBO));
 		if (c == NULL) {
 			c = ieee80211_find_channel_byieee(ic,
 			    csa->csa_newchan,
 			    (ic->ic_bsschan->ic_flags & IEEE80211_CHAN_ALL));
 			if (c == NULL) {
 				IEEE80211_DISCARD_IE(vap,
 				    IEEE80211_MSG_ELEMID | IEEE80211_MSG_DOTH,
 				    wh, "CSA", "invalid channel %u",
 				    csa->csa_newchan);
 				goto done;
 			}
 		}
 #if IEEE80211_CSA_COUNT_MIN > 0
 		if (csa->csa_count < IEEE80211_CSA_COUNT_MIN) {
 			/*
 			 * Require at least IEEE80211_CSA_COUNT_MIN count to
 			 * reduce the risk of being redirected by a fabricated
 			 * CSA.  If a valid CSA is dropped we'll still get a
 			 * beacon miss when the AP leaves the channel so we'll
 			 * eventually follow to the new channel.
 			 *
 			 * NOTE: this violates the 11h spec that states that
 			 * count may be any value and if 0 then a switch
 			 * should happen asap.
 			 */
 			IEEE80211_DISCARD_IE(vap,
 			    IEEE80211_MSG_ELEMID | IEEE80211_MSG_DOTH,
 			    wh, "CSA", "count %u too small, must be >= %u",
 			    csa->csa_count, IEEE80211_CSA_COUNT_MIN);
 			goto done;
 		}
 #endif
 		ieee80211_csa_startswitch(ic, c, csa->csa_mode, csa->csa_count);
 	} else {
 		/*
 		 * Validate this ie against the initial CSA.  We require
 		 * mode and channel not change and the count must be
 		 * monotonically decreasing.  This may be pointless and
 		 * canceling the switch as a result may be too paranoid but
 		 * in the worst case if we drop out of CSA because of this
 		 * and the AP does move then we'll just end up taking a
 		 * beacon miss and scan to find the AP.
 		 *
 		 * XXX may want <= on count as we also process ProbeResp
 		 * frames and those may come in w/ the same count as the
 		 * previous beacon; but doing so leaves us open to a stuck
 		 * count until we add a dead-man timer
 		 */
 		if (!(csa->csa_count < ic->ic_csa_count &&
 		      csa->csa_mode == ic->ic_csa_mode &&
 		      csa->csa_newchan == ieee80211_chan2ieee(ic, ic->ic_csa_newchan))) {
 			/*
 			 * NB: ic_csa_newchan is a channel reference, not a
 			 *     number; log its IEEE channel number so the
 			 *     argument matches %d and can be compared with
 			 *     the value carried in this ie.
 			 */
 			IEEE80211_NOTE_FRAME(vap, IEEE80211_MSG_DOTH, wh,
 			    "CSA ie mismatch, initial ie <%d,%d,%d>, "
 			    "this ie <%d,%d,%d>", ic->ic_csa_mode,
 			    ieee80211_chan2ieee(ic, ic->ic_csa_newchan),
 			    ic->ic_csa_count,
 			    csa->csa_mode, csa->csa_newchan, csa->csa_count);
 			ieee80211_csa_cancelswitch(ic);
 		} else {
 			if (csa->csa_count <= 1)
 				ieee80211_csa_completeswitch(ic);
 			else
 				ic->ic_csa_count = csa->csa_count;
 		}
 	}
 done:
 	IEEE80211_UNLOCK(ic);
 }
 
 /*
  * Return non-zero if a background scan may be continued:
  * o bg scan is active
  * o no channel switch is pending
  * o there has not been any traffic recently
  * o no full-offload scan support (no need for explicitly continuing scan then)
  *
  * Note we do not check if there is an administrative enable;
  * this is only done to start the scan.  We assume that any
  * change in state will be accompanied by a request to cancel
  * active scans which will otherwise cause this test to fail.
  */
 static __inline int
 contbgscan(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	/* A background scan must already be in progress. */
 	if ((ic->ic_flags_ext & IEEE80211_FEXT_BGSCAN) == 0)
 		return (0);
 	/* Never leave the channel while a channel switch is pending. */
 	if ((ic->ic_flags & IEEE80211_F_CSAPENDING) != 0)
 		return (0);
 	/* Offloaded scans need no explicit continuation. */
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) != 0)
 		return (0);
 	if (vap->iv_state != IEEE80211_S_RUN)		/* XXX? */
 		return (0);
 	/* Finally, the link must have been idle for long enough. */
 	return ((ieee80211_time_after(ticks,
 	    ic->ic_lastdata + vap->iv_bgscanidle)) ? 1 : 0);
 }
 
 /*
  * Return non-zero if a background scan may be started:
  * o bg scanning is administratively enabled
  * o no channel switch is pending
  * o we are not boosted on a dynamic turbo channel
  * o there has not been a scan recently
  * o there has not been any traffic recently (don't check if full-offload scan)
  */
 static __inline int
 startbgscan(struct ieee80211vap *vap)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 
 	/* Background scanning must be administratively enabled. */
 	if ((vap->iv_flags & IEEE80211_F_BGSCAN) == 0)
 		return (0);
 	/* Never leave the channel while a channel switch is pending. */
 	if ((ic->ic_flags & IEEE80211_F_CSAPENDING) != 0)
 		return (0);
 #ifdef IEEE80211_SUPPORT_SUPERG
 	/* Stay put while boosted on a dynamic turbo channel. */
 	if (IEEE80211_IS_CHAN_DTURBO(ic->ic_curchan))
 		return (0);
 #endif
 	/* Enough time must have passed since the previous scan. */
 	if (!(ieee80211_time_after(ticks,
 	    ic->ic_lastscan + vap->iv_bgscanintvl)))
 		return (0);
 	/* Offloaded scans do not care whether the link is idle. */
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_SCAN_OFFLOAD) != 0)
 		return (1);
 	return ((ieee80211_time_after(ticks,
 	    ic->ic_lastdata + vap->iv_bgscanidle)) ? 1 : 0);
 }
 
 #ifdef	notyet
 /*
  * Compare two quiet IEs and return if they are equivalent.
  *
  * The tbttcount isn't checked - that's not part of the configuration.
  */
 static int
 compare_quiet_ie(const struct ieee80211_quiet_ie *q1,
     const struct ieee80211_quiet_ie *q2)
 {
 
 	/* Equivalent only if period, duration and offset all match. */
 	return (q1->period == q2->period &&
 	    le16dec(&q1->duration) == le16dec(&q2->duration) &&
 	    le16dec(&q1->offset) == le16dec(&q2->offset));
 }
 #endif
 
 /*
  * Process a received management frame while operating as a station.
  *
  * ni:      node the frame was matched to (the assoc response path switches
  *          to vap->iv_bss).
  * m0:      mbuf holding the frame; it is not freed here.
  * subtype: management subtype from the frame control field.
  * rxs:     optional receive status; when present it may override the
  *          channel a beacon/probe response is attributed to.
  * rssi/nf: signal and noise values passed through to the scan module.
  *
  * Dispatch by subtype:
  *  - beacon/probe response: parsed and, when associated, used to track
  *    ERP/slot time/WME/HT/VHT/quiet/TIM/CSA changes and to start or
  *    continue background scans; when scanning, handed to the scan module.
  *  - auth: handed to sta_auth_open() or sta_auth_shared().
  *  - (re)assoc response: negotiates rates and capabilities, then moves the
  *    vap to RUN.
  *  - deauth/disassoc: moves the vap back to AUTH/ASSOC respectively.
  *  - action: parsed and passed to the ic_recv_action method.
  *  - everything else is counted and discarded.
  */
 static void
 sta_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype,
     const struct ieee80211_rx_stats *rxs,
     int rssi, int nf)
 {
 #define	ISREASSOC(_st)	((_st) == IEEE80211_FC0_SUBTYPE_REASSOC_RESP)
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_channel *rxchan = ic->ic_curchan;
 	struct ieee80211_frame *wh;
 	int ht_state_change = 0, do_ht = 0;
 	uint8_t *frm, *efrm;
 	uint8_t *rates, *xrates, *wme, *htcap, *htinfo;
 	uint8_t *vhtcap, *vhtopmode;
 	uint8_t rate;
 	uint8_t qosinfo;
 
 	/* frm..efrm bound the frame body that follows the 802.11 header. */
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (uint8_t *)&wh[1];
 	efrm = mtod(m0, uint8_t *) + m0->m_len;
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_BEACON: {
 		struct ieee80211_scanparams scan;
 		struct ieee80211_channel *c;
 		/*
 		 * We process beacon/probe response frames:
 		 *    o when scanning, or
 		 *    o station mode when associated (to collect state
 		 *      updates such as 802.11g slot time)
 		 * Frames otherwise received are discarded.
 		 */
 		if (!((ic->ic_flags & IEEE80211_F_SCAN) || ni->ni_associd)) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 
 		/* Override RX channel as appropriate */
 		if (rxs != NULL) {
 			c = ieee80211_lookup_channel_rxstatus(vap, rxs);
 			if (c != NULL)
 				rxchan = c;
 		}
 
 		/* XXX probe response in sta mode when !scanning? */
 		if (ieee80211_parse_beacon(ni, m0, rxchan, &scan) != 0) {
 			if (! (ic->ic_flags & IEEE80211_F_SCAN))
 				vap->iv_stats.is_beacon_bad++;
 			return;
 		}
 
 		/*
 		 * Count frame now that we know it's to be processed.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_BEACON) {
 			vap->iv_stats.is_rx_beacon++;		/* XXX remove */
 			IEEE80211_NODE_STAT(ni, rx_beacons);
 		} else
 			IEEE80211_NODE_STAT(ni, rx_proberesp);
 		/*
 		 * When operating in station mode, check for state updates.
 		 * Be careful to ignore beacons received while doing a
 		 * background scan.  We consider only 11g/WMM stuff right now.
 		 */
 		if (ni->ni_associd != 0 &&
 		    ((ic->ic_flags & IEEE80211_F_SCAN) == 0 ||
 		     IEEE80211_ADDR_EQ(wh->i_addr2, ni->ni_bssid))) {
 			/* record tsf of last beacon */
 			memcpy(ni->ni_tstamp.data, scan.tstamp,
 				sizeof(ni->ni_tstamp));
 			/* count beacon frame for s/w bmiss handling */
 			vap->iv_swbmiss_count++;
 			vap->iv_bmiss_count = 0;
 			if (ni->ni_erp != scan.erp) {
 				IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ASSOC,
 				    wh->i_addr2,
 				    "erp change: was 0x%x, now 0x%x",
 				    ni->ni_erp, scan.erp);
 				/*
 				 * NB: decide on the newly advertised ERP
 				 *     value; ni_erp still holds the old one
 				 *     at this point and using it would leave
 				 *     protection one change behind the AP.
 				 */
 				if (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) &&
 				    (scan.erp & IEEE80211_ERP_USE_PROTECTION))
 					vap->iv_flags |= IEEE80211_F_USEPROT;
 				else
 					vap->iv_flags &= ~IEEE80211_F_USEPROT;
 				ni->ni_erp = scan.erp;
 				/* XXX statistic */
 				/* driver notification */
 				ieee80211_vap_update_erp_protmode(vap);
 			}
 			if ((ni->ni_capinfo ^ scan.capinfo) & IEEE80211_CAPINFO_SHORT_SLOTTIME) {
 				IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ASSOC,
 				    wh->i_addr2,
 				    "capabilities change: was 0x%x, now 0x%x",
 				    ni->ni_capinfo, scan.capinfo);
 				/*
 				 * NB: we assume short preamble doesn't
 				 *     change dynamically
 				 */
 				ieee80211_vap_set_shortslottime(vap,
 					IEEE80211_IS_CHAN_A(ic->ic_bsschan) ||
 					(scan.capinfo & IEEE80211_CAPINFO_SHORT_SLOTTIME));
 				ni->ni_capinfo = (ni->ni_capinfo &~ IEEE80211_CAPINFO_SHORT_SLOTTIME)
 					       | (scan.capinfo & IEEE80211_CAPINFO_SHORT_SLOTTIME);
 				/* XXX statistic */
 			}
 			if (scan.wme != NULL &&
 			    (ni->ni_flags & IEEE80211_NODE_QOS)) {
 				int _retval;
 				if ((_retval = ieee80211_parse_wmeparams(vap,
 				    scan.wme, wh, &qosinfo)) >= 0) {
 					if (qosinfo & WME_CAPINFO_UAPSD_EN)
 						ni->ni_flags |=
 						    IEEE80211_NODE_UAPSD;
 					if (_retval > 0)
 						ieee80211_wme_updateparams(vap);
 				}
 			} else
 				ni->ni_flags &= ~IEEE80211_NODE_UAPSD;
 #ifdef IEEE80211_SUPPORT_SUPERG
 			if (scan.ath != NULL)
 				ieee80211_parse_athparams(ni, scan.ath, wh);
 #endif
 			if (scan.htcap != NULL && scan.htinfo != NULL &&
 			    (vap->iv_flags_ht & IEEE80211_FHT_HT)) {
 				/* XXX state changes? */
 				ieee80211_ht_updateparams(ni,
 				    scan.htcap, scan.htinfo);
 				do_ht = 1;
 			}
 			if (scan.vhtcap != NULL && scan.vhtopmode != NULL &&
 			    (vap->iv_flags_vht & IEEE80211_FVHT_VHT)) {
 				/* XXX state changes? */
 				ieee80211_vht_updateparams(ni,
 				    scan.vhtcap, scan.vhtopmode);
 				do_ht = 1;
 			}
 			if (do_ht) {
 				if (ieee80211_ht_updateparams_final(ni,
 				    scan.htcap, scan.htinfo))
 					ht_state_change = 1;
 			}
 
 			/*
 			 * If we have a quiet time IE then report it up to
 			 * the driver.
 			 *
 			 * Otherwise, inform the driver that the quiet time
 			 * IE has disappeared - only do that once rather than
 			 * spamming it each time.
 			 */
 			if (scan.quiet) {
 				ic->ic_set_quiet(ni, scan.quiet);
 				ni->ni_quiet_ie_set = 1;
 				memcpy(&ni->ni_quiet_ie, scan.quiet,
 				    sizeof(struct ieee80211_quiet_ie));
 			} else {
 				if (ni->ni_quiet_ie_set == 1)
 					ic->ic_set_quiet(ni, NULL);
 				ni->ni_quiet_ie_set = 0;
 				bzero(&ni->ni_quiet_ie,
 				    sizeof(struct ieee80211_quiet_ie));
 			}
 
 			if (scan.tim != NULL) {
 				struct ieee80211_tim_ie *tim =
 				    (struct ieee80211_tim_ie *) scan.tim;
 				/*
 				 * XXX Check/debug this code; see if it's about
 				 * the right time to force the VAP awake if we
 				 * receive a frame destined for us?
 				 */
 				int aid = IEEE80211_AID(ni->ni_associd);
 				int ix = aid / NBBY;
 				int min = tim->tim_bitctl &~ 1;
 				int max = tim->tim_len + min - 4;
 				int tim_ucast = 0;
 #ifdef __notyet__
 				int tim_mcast = 0;
 #endif
 
 				/*
 				 * Only do this for unicast traffic in the TIM
 				 * The multicast traffic notification for
 				 * the scan notification stuff should occur
 				 * differently.
 				 */
 				if (min <= ix && ix <= max &&
 				     isset(tim->tim_bitmap - min, aid)) {
 					tim_ucast = 1;
 				}
 
 #ifdef __notyet__
 				/*
 				 * Do a separate notification
 				 * for the multicast bit being set.
 				 */
 				if (tim->tim_bitctl & 1) {
 					tim_mcast = 1;
 				}
 #endif
 
 				/*
 				 * If the TIM indicates there's traffic for
 				 * us then get us out of STA mode powersave.
 				 */
 				if (tim_ucast == 1) {
 					/*
 					 * Wake us out of SLEEP state if we're
 					 * in it; and if we're doing bgscan
 					 * then wake us out of STA powersave.
 					 */
 					ieee80211_sta_tim_notify(vap, 1);
 
 					/*
 					 * This is preventing us from
 					 * continuing a bgscan; because it
 					 * tricks the contbgscan()
 					 * routine to think there's always
 					 * traffic for us.
 					 *
 					 * I think we need both an RX and
 					 * TX ic_lastdata field.
 					 */
 					ic->ic_lastdata = ticks;
 				}
 
 				ni->ni_dtim_count = tim->tim_count;
 				ni->ni_dtim_period = tim->tim_period;
 			}
 			if (scan.csa != NULL &&
 			    (vap->iv_flags & IEEE80211_F_DOTH))
 				ieee80211_parse_csaparams(vap, scan.csa, wh);
 			else if (ic->ic_flags & IEEE80211_F_CSAPENDING) {
 				/*
 				 * No CSA ie or 11h disabled, but a channel
 				 * switch is pending; drop out so we aren't
 				 * stuck in CSA state.  If the AP really is
 				 * moving we'll get a beacon miss and scan.
 				 */
 				IEEE80211_LOCK(ic);
 				ieee80211_csa_cancelswitch(ic);
 				IEEE80211_UNLOCK(ic);
 			}
 			/*
 			 * If scanning, pass the info to the scan module.
 			 * Otherwise, check if it's the right time to do
 			 * a background scan.  Background scanning must
 			 * be enabled and we must not be operating in the
 			 * turbo phase of dynamic turbo mode.  Then,
 			 * it's been a while since the last background
 			 * scan and if no data frames have come through
 			 * recently, kick off a scan.  Note that this
 			 * is the mechanism by which a background scan
 			 * is started _and_ continued each time we
 			 * return on-channel to receive a beacon from
 			 * our ap.
 			 */
 			if (ic->ic_flags & IEEE80211_F_SCAN) {
 				ieee80211_add_scan(vap, rxchan,
 				    &scan, wh, subtype, rssi, nf);
 			} else if (contbgscan(vap)) {
 				ieee80211_bg_scan(vap, 0);
 			} else if (startbgscan(vap)) {
 				vap->iv_stats.is_scan_bg++;
 #if 0
 				/* wakeup if we are sleeping */
 				ieee80211_set_pwrsave(vap, 0);
 #endif
 				ieee80211_bg_scan(vap, 0);
 			}
 
 			/*
 			 * Put the station to sleep if we haven't seen
 			 * traffic in a while.
 			 */
 			IEEE80211_LOCK(ic);
 			ieee80211_sta_ps_timer_check(vap);
 			IEEE80211_UNLOCK(ic);
 
 			/*
 			 * If we've had a channel width change (eg HT20<->HT40)
 			 * then schedule a delayed driver notification.
 			 */
 			if (ht_state_change)
 				ieee80211_update_chw(ic);
 			return;
 		}
 		/*
 		 * If scanning, just pass information to the scan module.
 		 */
 		if (ic->ic_flags & IEEE80211_F_SCAN) {
 			if (ic->ic_flags_ext & IEEE80211_FEXT_PROBECHAN) {
 				/*
 				 * Actively scanning a channel marked passive;
 				 * send a probe request now that we know there
 				 * is 802.11 traffic present.
 				 *
 				 * XXX check if the beacon we recv'd gives
 				 * us what we need and suppress the probe req
 				 */
 				ieee80211_probe_curchan(vap, 1);
 				ic->ic_flags_ext &= ~IEEE80211_FEXT_PROBECHAN;
 			}
 			ieee80211_add_scan(vap, rxchan, &scan, wh,
 			    subtype, rssi, nf);
 			return;
 		}
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_AUTH: {
 		uint16_t algo, seq, status;
 		/*
 		 * auth frame format
 		 *	[2] algorithm
 		 *	[2] sequence
 		 *	[2] status
 		 *	[tlv*] challenge
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, 6, return);
 		algo   = le16toh(*(uint16_t *)frm);
 		seq    = le16toh(*(uint16_t *)(frm + 2));
 		status = le16toh(*(uint16_t *)(frm + 4));
 		IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_AUTH, wh->i_addr2,
 		    "recv auth frame with algorithm %d seq %d", algo, seq);
 
 		if (vap->iv_flags & IEEE80211_F_COUNTERM) {
 			IEEE80211_DISCARD(vap,
 			    IEEE80211_MSG_AUTH | IEEE80211_MSG_CRYPTO,
 			    wh, "auth", "%s", "TKIP countermeasures enabled");
 			vap->iv_stats.is_rx_auth_countermeasures++;
 			if (vap->iv_opmode == IEEE80211_M_HOSTAP) {
 				ieee80211_send_error(ni, wh->i_addr2,
 					IEEE80211_FC0_SUBTYPE_AUTH,
 					IEEE80211_REASON_MIC_FAILURE);
 			}
 			return;
 		}
 		if (algo == IEEE80211_AUTH_ALG_SHARED)
 			sta_auth_shared(ni, wh, frm + 6, efrm, rssi, nf,
 			    seq, status);
 		else if (algo == IEEE80211_AUTH_ALG_OPEN)
 			sta_auth_open(ni, wh, rssi, nf, seq, status);
 		else {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 			    wh, "auth", "unsupported alg %d", algo);
 			vap->iv_stats.is_rx_auth_unsupported++;
 			return;
 		}
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP: {
 		uint16_t capinfo, associd;
 		uint16_t status;
 
 		if (vap->iv_state != IEEE80211_S_ASSOC) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 
 		/*
 		 * asresp frame format
 		 *	[2] capability information
 		 *	[2] status
 		 *	[2] association ID
 		 *	[tlv] supported rates
 		 *	[tlv] extended supported rates
 		 *	[tlv] WME
 		 *	[tlv] HT capabilities
 		 *	[tlv] HT info
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, 6, return);
 		ni = vap->iv_bss;
 		capinfo = le16toh(*(uint16_t *)frm);
 		frm += 2;
 		status = le16toh(*(uint16_t *)frm);
 		frm += 2;
 		if (status != 0) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ASSOC,
 			    wh->i_addr2, "%sassoc failed (reason %d)",
 			    ISREASSOC(subtype) ?  "re" : "", status);
 			vap->iv_stats.is_rx_auth_fail++;	/* XXX */
 			return;
 		}
 		associd = le16toh(*(uint16_t *)frm);
 		frm += 2;
 
 		/* Walk the ie's, remembering the ones we care about. */
 		rates = xrates = wme = htcap = htinfo = NULL;
 		vhtcap = vhtopmode = NULL;
 		while (efrm - frm > 1) {
 			IEEE80211_VERIFY_LENGTH(efrm - frm, frm[1] + 2, return);
 			switch (*frm) {
 			case IEEE80211_ELEMID_RATES:
 				rates = frm;
 				break;
 			case IEEE80211_ELEMID_XRATES:
 				xrates = frm;
 				break;
 			case IEEE80211_ELEMID_HTCAP:
 				htcap = frm;
 				break;
 			case IEEE80211_ELEMID_HTINFO:
 				htinfo = frm;
 				break;
 			case IEEE80211_ELEMID_VENDOR:
 				if (iswmeoui(frm))
 					wme = frm;
 				else if (vap->iv_flags_ht & IEEE80211_FHT_HTCOMPAT) {
 					/*
 					 * Accept pre-draft HT ie's if the
 					 * standard ones have not been seen.
 					 */
 					if (ishtcapoui(frm)) {
 						if (htcap == NULL)
 							htcap = frm;
 					} else if (ishtinfooui(frm)) {
 						if (htinfo == NULL)
 							htinfo = frm;
 					}
 				}
 				/* XXX Atheros OUI support */
 				break;
 			case IEEE80211_ELEMID_VHT_CAP:
 				vhtcap = frm;
 				break;
 			case IEEE80211_ELEMID_VHT_OPMODE:
 				vhtopmode = frm;
 				break;
 			}
 			frm += frm[1] + 2;
 		}
 
 		IEEE80211_VERIFY_ELEMENT(rates, IEEE80211_RATE_MAXSIZE, return);
 		if (xrates != NULL)
 			IEEE80211_VERIFY_ELEMENT(xrates,
 				IEEE80211_RATE_MAXSIZE - rates[1], return);
 		rate = ieee80211_setup_rates(ni, rates, xrates,
 				IEEE80211_F_JOIN |
 				IEEE80211_F_DOSORT | IEEE80211_F_DOFRATE |
 				IEEE80211_F_DONEGO | IEEE80211_F_DODEL);
 		if (rate & IEEE80211_RATE_BASIC) {
 			IEEE80211_NOTE_MAC(vap, IEEE80211_MSG_ASSOC,
 			    wh->i_addr2,
 			    "%sassoc failed (rate set mismatch)",
 			    ISREASSOC(subtype) ?  "re" : "");
 			vap->iv_stats.is_rx_assoc_norate++;
 			ieee80211_new_state(vap, IEEE80211_S_SCAN,
 			    IEEE80211_SCAN_FAIL_STATUS);
 			return;
 		}
 
 		ni->ni_capinfo = capinfo;
 		ni->ni_associd = associd;
 		if (ni->ni_jointime == 0)
 			ni->ni_jointime = time_uptime;
 		if (wme != NULL &&
 		    ieee80211_parse_wmeparams(vap, wme, wh, &qosinfo) >= 0) {
 			ni->ni_flags |= IEEE80211_NODE_QOS;
 			ieee80211_wme_updateparams(vap);
 		} else
 			ni->ni_flags &= ~IEEE80211_NODE_QOS;
 		/*
 		 * Setup HT state according to the negotiation.
 		 *
 		 * NB: shouldn't need to check if HT use is enabled but some
 		 *     ap's send back HT ie's even when we don't indicate we
 		 *     are HT capable in our AssocReq.
 		 */
 		if (htcap != NULL && htinfo != NULL &&
 		    (vap->iv_flags_ht & IEEE80211_FHT_HT)) {
 			ieee80211_ht_node_init(ni);
 			ieee80211_ht_updateparams(ni, htcap, htinfo);
 
 			/*
 			 * NB: all three conditions are logical tests; a
 			 *     bitwise '&' here would only work by accident
 			 *     when the VHT flag happens to be bit 0.
 			 */
 			if ((vhtcap != NULL) && (vhtopmode != NULL) &&
 			    (vap->iv_flags_vht & IEEE80211_FVHT_VHT)) {
 				/*
 				 * Log if we get a VHT assoc/reassoc response.
 				 * We aren't ready for 2GHz VHT support.
 				 */
 				if (IEEE80211_IS_CHAN_2GHZ(ni->ni_chan)) {
 					printf("%s: peer %6D: VHT on 2GHz, ignoring\n",
 					    __func__,
 					    ni->ni_macaddr,
 					    ":");
 				} else {
 					ieee80211_vht_node_init(ni);
 					ieee80211_vht_updateparams(ni, vhtcap, vhtopmode);
 					ieee80211_setup_vht_rates(ni, vhtcap, vhtopmode);
 				}
 			}
 
 			ieee80211_ht_updateparams_final(ni, htcap, htinfo);
 			ieee80211_setup_htrates(ni, htcap,
 			     IEEE80211_F_JOIN | IEEE80211_F_DOBRS);
 			ieee80211_setup_basic_htrates(ni, htinfo);
 
 			ieee80211_node_setuptxparms(ni);
 			ieee80211_ratectl_node_init(ni);
 		}
 
 		/*
 		 * Always initialise FF/superg state; we can use this
 		 * for doing A-MSDU encapsulation as well.
 		 */
 #ifdef	IEEE80211_SUPPORT_SUPERG
 		ieee80211_ff_node_init(ni);
 #endif
 
 		/*
 		 * Configure state now that we are associated.
 		 *
 		 * XXX may need different/additional driver callbacks?
 		 */
 		if (IEEE80211_IS_CHAN_A(ic->ic_curchan) ||
 		    (ni->ni_capinfo & IEEE80211_CAPINFO_SHORT_PREAMBLE)) {
 			vap->iv_flags |= IEEE80211_F_SHPREAMBLE;
 			vap->iv_flags &= ~IEEE80211_F_USEBARKER;
 		} else {
 			vap->iv_flags &= ~IEEE80211_F_SHPREAMBLE;
 			vap->iv_flags |= IEEE80211_F_USEBARKER;
 		}
 		ieee80211_vap_set_shortslottime(vap,
 			IEEE80211_IS_CHAN_A(ic->ic_curchan) ||
 			(ni->ni_capinfo & IEEE80211_CAPINFO_SHORT_SLOTTIME));
 		ieee80211_vap_update_preamble(vap);
 		/*
 		 * Honor ERP protection.
 		 *
 		 * NB: ni_erp should zero for non-11g operation.
 		 */
 		if (IEEE80211_IS_CHAN_ANYG(ic->ic_curchan) &&
 		    (ni->ni_erp & IEEE80211_ERP_USE_PROTECTION))
 			vap->iv_flags |= IEEE80211_F_USEPROT;
 		else
 			vap->iv_flags &= ~IEEE80211_F_USEPROT;
 		ieee80211_vap_update_erp_protmode(vap);
 		IEEE80211_NOTE_MAC(vap,
 		    IEEE80211_MSG_ASSOC | IEEE80211_MSG_DEBUG, wh->i_addr2,
 		    "%sassoc success at aid %d: %s preamble, %s slot time%s%s%s%s%s%s%s%s%s",
 		    ISREASSOC(subtype) ? "re" : "",
 		    IEEE80211_NODE_AID(ni),
 		    vap->iv_flags&IEEE80211_F_SHPREAMBLE ? "short" : "long",
 		    vap->iv_flags&IEEE80211_F_SHSLOT ? "short" : "long",
 		    vap->iv_flags&IEEE80211_F_USEPROT ? ", protection" : "",
 		    ni->ni_flags & IEEE80211_NODE_QOS ? ", QoS" : "",
 		    ni->ni_flags & IEEE80211_NODE_HT ?
 			(ni->ni_chw == 40 ? ", HT40" : ", HT20") : "",
 		    ni->ni_flags & IEEE80211_NODE_AMPDU ? " (+AMPDU)" : "",
 		    ni->ni_flags & IEEE80211_NODE_AMSDU ? " (+AMSDU)" : "",
 		    ni->ni_flags & IEEE80211_NODE_MIMO_RTS ? " (+SMPS-DYN)" :
 			ni->ni_flags & IEEE80211_NODE_MIMO_PS ? " (+SMPS)" : "",
 		    ni->ni_flags & IEEE80211_NODE_RIFS ? " (+RIFS)" : "",
 		    IEEE80211_ATH_CAP(vap, ni, IEEE80211_NODE_FF) ?
 			", fast-frames" : "",
 		    IEEE80211_ATH_CAP(vap, ni, IEEE80211_NODE_TURBOP) ?
 			", turbo" : ""
 		);
 		ieee80211_new_state(vap, IEEE80211_S_RUN, subtype);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_DEAUTH: {
 		uint16_t reason;
 
 		if (vap->iv_state == IEEE80211_S_SCAN) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (!IEEE80211_ADDR_EQ(wh->i_addr1, vap->iv_myaddr)) {
 			/* NB: can happen when in promiscuous mode */
 			vap->iv_stats.is_rx_mgtdiscard++;
 			break;
 		}
 
 		/*
 		 * deauth frame format
 		 *	[2] reason
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, 2, return);
 		reason = le16toh(*(uint16_t *)frm);
 
 		vap->iv_stats.is_rx_deauth++;
 		vap->iv_stats.is_rx_deauth_code = reason;
 		IEEE80211_NODE_STAT(ni, rx_deauth);
 
 		IEEE80211_NOTE(vap, IEEE80211_MSG_AUTH, ni,
 		    "recv deauthenticate (reason: %d (%s))", reason,
 		    ieee80211_reason_to_string(reason));
 		ieee80211_new_state(vap, IEEE80211_S_AUTH,
 		    (reason << 8) | IEEE80211_FC0_SUBTYPE_DEAUTH);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_DISASSOC: {
 		uint16_t reason;
 
 		if (vap->iv_state != IEEE80211_S_RUN &&
 		    vap->iv_state != IEEE80211_S_ASSOC &&
 		    vap->iv_state != IEEE80211_S_AUTH) {
 			vap->iv_stats.is_rx_mgtdiscard++;
 			return;
 		}
 		if (!IEEE80211_ADDR_EQ(wh->i_addr1, vap->iv_myaddr)) {
 			/* NB: can happen when in promiscuous mode */
 			vap->iv_stats.is_rx_mgtdiscard++;
 			break;
 		}
 
 		/*
 		 * disassoc frame format
 		 *	[2] reason
 		 */
 		IEEE80211_VERIFY_LENGTH(efrm - frm, 2, return);
 		reason = le16toh(*(uint16_t *)frm);
 
 		vap->iv_stats.is_rx_disassoc++;
 		vap->iv_stats.is_rx_disassoc_code = reason;
 		IEEE80211_NODE_STAT(ni, rx_disassoc);
 
 		IEEE80211_NOTE(vap, IEEE80211_MSG_ASSOC, ni,
 		    "recv disassociate (reason: %d (%s))", reason,
 		    ieee80211_reason_to_string(reason));
 		ieee80211_new_state(vap, IEEE80211_S_ASSOC, 0);
 		break;
 	}
 
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1) &&
 		    !IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 #undef ISREASSOC
 }
 
 /*
  * Process a received control frame while operating as a station.
  * Only Block Ack Requests are acted upon; every other subtype is
  * silently ignored.
  */
 static void
 sta_recv_ctl(struct ieee80211_node *ni, struct mbuf *m, int subtype)
 {
 	if (subtype == IEEE80211_FC0_SUBTYPE_BAR)
 		ieee80211_recv_bar(ni, m);
 }
diff --git a/sys/net80211/ieee80211_wds.c b/sys/net80211/ieee80211_wds.c
index 238b10a6e4ff..65a36a807d91 100644
--- a/sys/net80211/ieee80211_wds.c
+++ b/sys/net80211/ieee80211_wds.c
@@ -1,804 +1,805 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007-2008 Sam Leffler, Errno Consulting
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #ifdef __FreeBSD__
 __FBSDID("$FreeBSD$");
 #endif
 
 /*
  * IEEE 802.11 WDS mode support.
  */
 #include "opt_inet.h"
 #include "opt_wlan.h"
 
 #include <sys/param.h>
 #include <sys/systm.h> 
 #include <sys/mbuf.h>   
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_llc.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 
 #include <net/bpf.h>
 
 #include <net80211/ieee80211_var.h>
 #include <net80211/ieee80211_wds.h>
 #include <net80211/ieee80211_input.h>
 #ifdef IEEE80211_SUPPORT_SUPERG
 #include <net80211/ieee80211_superg.h>
 #endif
 
 static void wds_vattach(struct ieee80211vap *);
 static int wds_newstate(struct ieee80211vap *, enum ieee80211_state, int);
 static	int wds_input(struct ieee80211_node *ni, struct mbuf *m,
 	    const struct ieee80211_rx_stats *rxs, int, int);
 static void wds_recv_mgmt(struct ieee80211_node *, struct mbuf *, int subtype,
 	const struct ieee80211_rx_stats *, int, int);
 
 /*
  * Install wds_vattach in the ic_vattach table slot for
  * IEEE80211_M_WDS so that it is used as the attach hook for vaps of
  * that operating mode.
  */
 void
 ieee80211_wds_attach(struct ieee80211com *ic)
 {
 	ic->ic_vattach[IEEE80211_M_WDS] = wds_vattach;
 }
 
 /*
  * Counterpart of ieee80211_wds_attach.  Currently a no-op: the
  * function body is empty and ic is not touched.
  */
 void
 ieee80211_wds_detach(struct ieee80211com *ic)
 {
 }
 
 static void
 wds_vdetach(struct ieee80211vap *vap)
 {
 	if (vap->iv_bss != NULL) {
 		/* XXX locking? */
 		if (vap->iv_bss->ni_wdsvap == vap)
 			vap->iv_bss->ni_wdsvap = NULL;
 	}
 }
 
 /*
  * Attach hook for a WDS vap (registered by ieee80211_wds_attach).
  * Points the vap's state-change, input, management-receive and
  * detach callbacks at the WDS implementations in this file.
  */
 static void
 wds_vattach(struct ieee80211vap *vap)
 {
 	vap->iv_newstate = wds_newstate;
 	vap->iv_input = wds_input;
 	vap->iv_recv_mgmt = wds_recv_mgmt;
 	vap->iv_opdetach = wds_vdetach;
 }
 
 /*
  * Re-inject the frames that were parked on ic_stageq for this node.
  * Entries are selected using the hash of the node's MAC address as
  * the key.  Each frame is unlinked from the returned chain and handed
  * to ieee80211_input() together with the signal and noise values
  * reported by ic_node_getsignal().
  */
 static void
 wds_flush(struct ieee80211_node *ni)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 	struct mbuf *pending, *cur;
 	int8_t rssi, nf;
 
 	pending = ieee80211_ageq_remove(&ic->ic_stageq,
 	    (void *)(uintptr_t) ieee80211_mac_hash(ic, ni->ni_macaddr));
 	if (pending == NULL)
 		return;
 
 	IEEE80211_NOTE(ni->ni_vap, IEEE80211_MSG_WDS, ni,
 	    "%s", "flush wds queue");
 	ic->ic_node_getsignal(ni, &rssi, &nf);
 	while (pending != NULL) {
 		/* unlink the head of the chain before passing it on */
 		cur = pending;
 		pending = cur->m_nextpkt;
 		cur->m_nextpkt = NULL;
 		ieee80211_input(ni, cur, rssi, nf);
 	}
 }
 
 /*
  * Set up the peer node for a WDS vap on channel chan.
  *
  * Dynamic (non-legacy) WDS: the station named by iv_des_bssid is looked
  * up in ic_sta under the node table lock.  If it exists and is not yet
  * bound to a WDS vap it is installed as this vap's bss node and its
  * ni_wdsvap is pointed back at the vap.
  *
  * Legacy WDS: a new node is created for iv_des_bssid, installed as the
  * bss node, and taken through a simulated association.
  *
  * Any frames queued for the node are flushed before returning.
  * Returns 0 when a node was found or created, ENOENT otherwise.  A
  * node that is found but already bound to a WDS vap still yields 0.
  */
 static int
 ieee80211_create_wds(struct ieee80211vap *vap, struct ieee80211_channel *chan)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	struct ieee80211_node_table *nt = &ic->ic_sta;
 	struct ieee80211_node *ni, *obss;
 
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS,
 	     "%s: creating link to %s on channel %u\n", __func__,
 	     ether_sprintf(vap->iv_des_bssid), ieee80211_chan2ieee(ic, chan));
 
 	/* NB: vap create must specify the bssid for the link */
 	KASSERT(vap->iv_flags & IEEE80211_F_DESBSSID, ("no bssid"));
 	/* NB: we should only be called on RUN transition */
 	KASSERT(vap->iv_state == IEEE80211_S_RUN, ("!RUN state"));
 
 	if ((vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) {
 		/*
 		 * Dynamic/non-legacy WDS.  Reference the associated
 		 * station specified by the desired bssid setup at vap
 		 * create.  Point ni_wdsvap at the WDS vap so 4-address
 		 * frames received through the associated AP vap will
 		 * be dispatched upward (e.g. to a bridge) as though
 		 * they arrived on the WDS vap.
 		 */
 		IEEE80211_NODE_LOCK(nt);
 		obss = NULL;
 		ni = ieee80211_find_node_locked(&ic->ic_sta, vap->iv_des_bssid);
 		if (ni == NULL) {
 			/*
 			 * Node went away before we could hookup.  This
 			 * should be ok; no traffic will flow and a leave
 			 * event will be dispatched that should cause
 			 * the vap to be destroyed.
 			 */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS,
 			    "%s: station %s went away\n",
 			    __func__, ether_sprintf(vap->iv_des_bssid));
 			/* XXX stat? */
 		} else if (ni->ni_wdsvap != NULL) {
 			/*
 			 * Node already setup with a WDS vap; we cannot
 			 * allow multiple references so disallow.  If
 			 * ni_wdsvap points at us that's ok; we should
 			 * do nothing anyway.
 			 */
 			/* XXX printf instead? */
 			IEEE80211_DPRINTF(vap, IEEE80211_MSG_WDS,
 			    "%s: station %s in use with %s\n",
 			    __func__, ether_sprintf(vap->iv_des_bssid),
 			    ni->ni_wdsvap->iv_ifp->if_xname);
 			/* XXX stat? */
 			/*
 			 * NOTE(review): ni stays non-NULL on this path, so
 			 * the code below still flushes the queue and returns
 			 * 0.  If ieee80211_find_node_locked() hands back a
 			 * held reference it does not appear to be released
 			 * here -- confirm both are intended.
 			 */
 		} else {
 			/*
 			 * Committed to new node, setup state.
 			 */
 			obss = vap->iv_update_bss(vap, ni);
 			ni->ni_wdsvap = vap;
 		}
 		IEEE80211_NODE_UNLOCK(nt);
 		if (obss != NULL) {
 			/* NB: deferred to avoid recursive lock */
 			ieee80211_free_node(obss);
 		}
 	} else {
 		/*
 		 * Legacy WDS vap setup.
 		 */
 		/*
 		 * The far end does not associate so we just create
 		 * a new node and install it as the vap's bss node.
 		 * We must simulate an association and authorize the
 		 * port for traffic to flow.
 		 * XXX check if node already in sta table?
 		 */
 		ni = ieee80211_node_create_wds(vap, vap->iv_des_bssid, chan);
 		if (ni != NULL) {
 			/* the bss pointer gets its own reference to ni */
 			obss = vap->iv_update_bss(vap, ieee80211_ref_node(ni));
 			ni->ni_flags |= IEEE80211_NODE_AREF;
 			if (obss != NULL)
 				ieee80211_free_node(obss);
 			/* give driver a chance to setup state like ni_txrate */
 			if (ic->ic_newassoc != NULL)
 				ic->ic_newassoc(ni, 1);
 			/* tell the authenticator about new station */
 			if (vap->iv_auth->ia_node_join != NULL)
 				vap->iv_auth->ia_node_join(ni);
 			/* 802.1X nodes are left unauthorized here */
 			if (ni->ni_authmode != IEEE80211_AUTH_8021X)
 				ieee80211_node_authorize(ni);
 
 			ieee80211_notify_node_join(ni, 1 /*newassoc*/);
 			/* XXX inject l2uf frame */
 		}
 	}
 
 	/*
 	 * Flush any pending frames now that we're set up.
 	 */
 	if (ni != NULL)
 		wds_flush(ni);
 	return (ni == NULL ? ENOENT : 0);
 }
 
 /*
  * Propagate multicast frames of an ap vap to all DWDS links.
  * The caller is assumed to have verified this frame is multicast.
  *
  * vap0 supplies the ieee80211com whose vap list is walked; m is the
  * 802.3 frame to replicate.  m itself is only read and copied here;
  * it is neither modified nor freed by this function.  Each eligible
  * vap gets its own copy, which is encapsulated and handed to
  * ieee80211_parent_xmitpkt().  Per-vap failures bump the interface
  * output error counter and move on to the next vap.
  */
 void
 ieee80211_dwds_mcast(struct ieee80211vap *vap0, struct mbuf *m)
 {
 	struct ieee80211com *ic = vap0->iv_ic;
 	const struct ether_header *eh = mtod(m, const struct ether_header *);
 	struct ieee80211_node *ni;
 	struct ieee80211vap *vap;
 	struct ifnet *ifp;
 	struct mbuf *mcopy;
 	int err;
 
 	KASSERT(ETHER_IS_MULTICAST(eh->ether_dhost),
 	    ("%s not mcast", ether_sprintf(eh->ether_dhost)));
 
 	/* XXX locking */
 	TAILQ_FOREACH(vap, &ic->ic_vaps, iv_next) {
 		/* only DWDS vaps are interesting */
 		if (vap->iv_opmode != IEEE80211_M_WDS ||
 		    (vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY))
 			continue;
 		/* if it came in this interface, don't send it back out */
 		ifp = vap->iv_ifp;
 		if (ifp == m->m_pkthdr.rcvif)
 			continue;
 		/*
 		 * Duplicate the frame and send it.
 		 */
 		mcopy = m_copypacket(m, IEEE80211_M_NOWAIT);
 		if (mcopy == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			/* XXX stat + msg */
 			continue;
 		}
 		ni = ieee80211_find_txnode(vap, eh->ether_dhost);
 		if (ni == NULL) {
 			/* NB: ieee80211_find_txnode does stat+msg */
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(mcopy);
 			continue;
 		}
 		/* calculate priority so drivers can find the tx queue */
 		if (ieee80211_classify(ni, mcopy)) {
 			IEEE80211_DISCARD_MAC(vap,
 			    IEEE80211_MSG_OUTPUT | IEEE80211_MSG_WDS,
 			    eh->ether_dhost, NULL,
 			    "%s", "classification failure");
 			vap->iv_stats.is_tx_classify++;
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			m_freem(mcopy);
 			ieee80211_free_node(ni);
 			continue;
 		}
 
 		/* NB: the tap is fed the caller's frame m, not the copy */
 		BPF_MTAP(ifp, m);		/* 802.3 tx */
 
 		/*
 		 * Encapsulate the packet in prep for transmission.
 		 */
 		IEEE80211_TX_LOCK(ic);
 		mcopy = ieee80211_encap(vap, ni, mcopy);
 		if (mcopy == NULL) {
 			/* NB: stat+msg handled in ieee80211_encap */
 			IEEE80211_TX_UNLOCK(ic);
 			ieee80211_free_node(ni);
 			continue;
 		}
 		mcopy->m_flags |= M_MCAST;
 		MPASS((mcopy->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 		/* the node pointer travels with the frame in the rcvif field */
 		mcopy->m_pkthdr.rcvif = (void *) ni;
 
 		/*
 		 * NOTE(review): nothing is cleaned up here when the call
 		 * below fails; presumably ieee80211_parent_xmitpkt()
 		 * disposes of the mbuf and node reference itself -- confirm.
 		 */
 		err = ieee80211_parent_xmitpkt(ic, mcopy);
 		IEEE80211_TX_UNLOCK(ic);
 		if (!err) {
 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 			/* byte count is taken from the original 802.3 frame */
 			if_inc_counter(ifp, IFCOUNTER_OBYTES,
 			    m->m_pkthdr.len);
 		}
 	}
 }
 
 /*
  * Handle DWDS discovery on receipt of a 4-address frame in
  * ap mode.  Queue the frame and post an event for someone
  * to plumb the necessary WDS vap for this station.  Frames
  * received before the vap is set running will then be reprocessed
  * as if they were just received.
  *
  * ni is the node the frame was received from; m is the frame, which
  * is appended to ic_stageq here.
  */
 void
 ieee80211_dwds_discover(struct ieee80211_node *ni, struct mbuf *m)
 {
 	struct ieee80211com *ic = ni->ni_ic;
 
 	/*
 	 * Save the frame with an aging interval 4 times
 	 * the listen interval specified by the station.
 	 * Frames that sit around too long are reclaimed
 	 * using this information.
 	 * XXX handle overflow?
 	 * XXX per/vap beacon interval?
 	 */
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	/*
 	 * The rcvif field is reused to carry the MAC address hash; the
 	 * same hash is the key wds_flush passes to ieee80211_ageq_remove.
 	 */
 	m->m_pkthdr.rcvif = (void *)(uintptr_t)
 	    ieee80211_mac_hash(ic, ni->ni_macaddr);
 	/* NB: the return value of the append is deliberately ignored */
 	(void) ieee80211_ageq_append(&ic->ic_stageq, m,
 	    ((ni->ni_intval * ic->ic_lintval) << 2) / 1024);
 	ieee80211_notify_wds_discover(ni);
 }
 
 /*
  * IEEE80211_M_WDS vap state machine handler.
  *
  * Records nstate as the vap's current state, stops the management
  * send callout, and then performs the work specific to the new state.
  * Must be called with the com lock held (asserted below).  arg is not
  * used by this handler.
  *
  * Returns 0, or the result of ieee80211_create_wds() for an
  * INIT -> RUN transition.
  */
 static int
 wds_newstate(struct ieee80211vap *vap, enum ieee80211_state nstate, int arg)
 {
 	struct ieee80211com *ic = vap->iv_ic;
 	enum ieee80211_state ostate;
 	int error;
 
 	IEEE80211_LOCK_ASSERT(ic);
 
 	ostate = vap->iv_state;
 	IEEE80211_DPRINTF(vap, IEEE80211_MSG_STATE, "%s: %s -> %s\n", __func__,
 		ieee80211_state_name[ostate], ieee80211_state_name[nstate]);
 	vap->iv_state = nstate;			/* state transition */
 	callout_stop(&vap->iv_mgtsend);		/* XXX callout_drain */
 	if (ostate != IEEE80211_S_SCAN)
 		ieee80211_cancel_scan(vap);	/* background scan */
 	error = 0;
 	switch (nstate) {
 	case IEEE80211_S_INIT:
 		switch (ostate) {
 		case IEEE80211_S_SCAN:
 			/* not covered by the cancel above, which skips SCAN */
 			ieee80211_cancel_scan(vap);
 			break;
 		default:
 			break;
 		}
 		if (ostate != IEEE80211_S_INIT) {
 			/* NB: optimize INIT -> INIT case */
 			ieee80211_reset_bss(vap);
 		}
 		break;
 	case IEEE80211_S_SCAN:
 		switch (ostate) {
 		case IEEE80211_S_INIT:
 			ieee80211_check_scan_current(vap);
 			break;
 		default:
 			break;
 		}
 		break;
 	case IEEE80211_S_RUN:
 		if (ostate == IEEE80211_S_INIT) {
 			/*
 			 * Already have a channel; bypass the scan
 			 * and startup immediately.
 			 */
 			error = ieee80211_create_wds(vap, ic->ic_curchan);
 		}
 		/* NB: entering RUN from any other state does no work here */
 		break;
 	default:
 		break;
 	}
 	return error;
 }
 
 /*
  * Process a received frame.  The node associated with the sender
  * should be supplied.  If nothing was found in the node table then
  * the caller is assumed to supply a reference to iv_bss instead.
  * The RSSI and noise floor are also supplied.  The RSSI data is used
  * during AP scanning to select an AP to associate with; it can have
  * any units so long as values have consistent units and higher values
  * mean ``better signal''.
  *
  * ni   - node the frame is attributed to
  * m    - the received frame; it is either passed on or freed before
  *        this function returns, so the caller must not use it again
  * rxs  - optional receive status; may be NULL
  * rssi - signal strength, folded into ni_avgrssi
  * nf   - noise floor, stored in ni_noise
  *
  * Returns the 802.11 frame type taken from the frame control field.
  * Data paths that hand the mbuf on return IEEE80211_FC0_TYPE_DATA
  * directly.
  */
 static int
 wds_input(struct ieee80211_node *ni, struct mbuf *m,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ifnet *ifp = vap->iv_ifp;
 	struct ieee80211_frame *wh;
 	struct ieee80211_key *key;
 	struct ether_header *eh;
 	int hdrspace, need_tap = 1;	/* mbuf need to be tapped. */
 	uint8_t dir, type, subtype, qos;
 	int is_hw_decrypted = 0;
 	int has_decrypted = 0;
 
 	/*
 	 * Some devices do hardware decryption all the way through
 	 * to pretending the frame wasn't encrypted in the first place.
 	 * So, tag it appropriately so it isn't discarded inappropriately.
 	 */
 	if ((rxs != NULL) && (rxs->c_pktflags & IEEE80211_RX_F_DECRYPTED))
 		is_hw_decrypted = 1;
 
 	if (m->m_flags & M_AMPDU_MPDU) {
 		/*
 		 * Fastpath for A-MPDU reorder q resubmission.  Frames
 		 * w/ M_AMPDU_MPDU marked have already passed through
 		 * here but were received out of order and been held on
 		 * the reorder queue.  When resubmitted they are marked
 		 * with the M_AMPDU_MPDU flag and we can bypass most of
 		 * the normal processing.
 		 */
 		wh = mtod(m, struct ieee80211_frame *);
 		type = IEEE80211_FC0_TYPE_DATA;
 		dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 		subtype = IEEE80211_FC0_SUBTYPE_QOS_DATA;
 		hdrspace = ieee80211_hdrspace(ic, wh);	/* XXX optimize? */
 		/* NB: jumps into the middle of the DATA case below */
 		goto resubmit_ampdu;
 	}
 
 	/* NOTE(review): ni was already dereferenced in the initializers above */
 	KASSERT(ni != NULL, ("null node"));
 
 	/* NB: type is a uint8_t, so this stores 0xff rather than -1 */
 	type = -1;			/* undefined */
 
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_min)) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL,
 		    "too short (1): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/*
 	 * Bit of a cheat here, we use a pointer for a 3-address
 	 * frame format but don't reference fields outside
 	 * ieee80211_frame_min w/o first validating the data is
 	 * present.
 	 */
 	wh = mtod(m, struct ieee80211_frame *);
 
 	/* unicast traffic reloads the node's inactivity counter */
 	if (!IEEE80211_IS_MULTICAST(wh->i_addr1))
 		ni->ni_inact = ni->ni_inact_reload;
 
 	if ((wh->i_fc[0] & IEEE80211_FC0_VERSION_MASK) !=
 	    IEEE80211_FC0_VERSION_0) {
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 		    ni->ni_macaddr, NULL, "wrong version, fc %02x:%02x",
 		    wh->i_fc[0], wh->i_fc[1]);
 		vap->iv_stats.is_rx_badversion++;
 		goto err;
 	}
 
 	dir = wh->i_fc[1] & IEEE80211_FC1_DIR_MASK;
 	type = wh->i_fc[0] & IEEE80211_FC0_TYPE_MASK;
 	subtype = wh->i_fc[0] & IEEE80211_FC0_SUBTYPE_MASK;
 
 	/* NB: WDS vap's do not scan */
 	if (m->m_pkthdr.len < sizeof(struct ieee80211_frame_addr4)) {
 		IEEE80211_DISCARD_MAC(vap,
 		    IEEE80211_MSG_ANY, ni->ni_macaddr, NULL,
 		    "too short (3): len %u", m->m_pkthdr.len);
 		vap->iv_stats.is_rx_tooshort++;
 		goto out;
 	}
 	/* NB: the TA is implicitly verified by finding the wds peer node */
 	if (!IEEE80211_ADDR_EQ(wh->i_addr1, vap->iv_myaddr) &&
 	    !IEEE80211_ADDR_EQ(wh->i_addr1, ifp->if_broadcastaddr)) {
 		/* not interested in */
 		IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 		    wh->i_addr1, NULL, "%s", "not to bss");
 		vap->iv_stats.is_rx_wrongbss++;
 		goto out;
 	}
 	IEEE80211_RSSI_LPF(ni->ni_avgrssi, rssi);
 	ni->ni_noise = nf;
 	if (IEEE80211_HAS_SEQ(type, subtype)) {
 		uint8_t tid = ieee80211_gettid(wh);
 		if (IEEE80211_QOS_HAS_SEQ(wh) &&
 		    TID_TO_WME_AC(tid) >= WME_AC_VI)
 			ic->ic_wme.wme_hipri_traffic++;
 		/* frames failing the receive sequence check are dropped */
 		if (! ieee80211_check_rxseq(ni, wh, wh->i_addr1, rxs))
 			goto out;
 	}
 	switch (type) {
 	case IEEE80211_FC0_TYPE_DATA:
 		hdrspace = ieee80211_hdrspace(ic, wh);
 		if (m->m_len < hdrspace &&
 		    (m = m_pullup(m, hdrspace)) == NULL) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, NULL,
 			    "data too short: expecting %u", hdrspace);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;		/* XXX */
 		}
 		if (dir != IEEE80211_FC1_DIR_DSTODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto out;
 		}
 		/*
 		 * Only legacy WDS traffic should take this path.
 		 */
 		if ((vap->iv_flags_ext & IEEE80211_FEXT_WDSLEGACY) == 0) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "%s", "not legacy wds");
 			vap->iv_stats.is_rx_wrongdir++;/*XXX*/
 			goto out;
 		}
 		/*
 		 * Handle A-MPDU re-ordering.  If the frame is to be
 		 * processed directly then ieee80211_ampdu_reorder
 		 * will return 0; otherwise it has consumed the mbuf
 		 * and we should do nothing more with it.
 		 */
 		if ((m->m_flags & M_AMPDU) &&
 		    ieee80211_ampdu_reorder(ni, m, rxs) != 0) {
 			m = NULL;
 			goto out;
 		}
 	resubmit_ampdu:
 
 		/*
 		 * Handle privacy requirements.  Note that we
 		 * must not be preempted from here until after
 		 * we (potentially) call ieee80211_crypto_demic;
 		 * otherwise we may violate assumptions in the
 		 * crypto cipher modules used to do delayed update
 		 * of replay sequence numbers.
 		 */
 		if (is_hw_decrypted || IEEE80211_IS_PROTECTED(wh)) {
 			if ((vap->iv_flags & IEEE80211_F_PRIVACY) == 0) {
 				/*
 				 * Discard encrypted frames when privacy is off.
 				 */
 				IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 				    wh, "WEP", "%s", "PRIVACY off");
 				vap->iv_stats.is_rx_noprivacy++;
 				IEEE80211_NODE_STAT(ni, rx_noprivacy);
 				goto out;
 			}
 			if (ieee80211_crypto_decap(ni, m, hdrspace, &key) == 0) {
 				/* NB: stats+msgs handled in crypto_decap */
 				IEEE80211_NODE_STAT(ni, rx_wepfail);
 				goto out;
 			}
 			/* refetch the header pointer after crypto decap */
 			wh = mtod(m, struct ieee80211_frame *);
 			wh->i_fc[1] &= ~IEEE80211_FC1_PROTECTED;
 			has_decrypted = 1;
 		} else {
 			/* XXX M_WEP and IEEE80211_F_PRIVACY */
 			key = NULL;
 		}
 
 		/*
 		 * Save QoS bits for use below--before we strip the header.
 		 */
 		if (subtype == IEEE80211_FC0_SUBTYPE_QOS_DATA)
 			qos = ieee80211_getqos(wh)[0];
 		else
 			qos = 0;
 
 		/*
 		 * Next up, any fragmentation.
 		 */
 		if (!IEEE80211_IS_MULTICAST(wh->i_addr1)) {
 			m = ieee80211_defrag(ni, m, hdrspace, has_decrypted);
 			if (m == NULL) {
 				/* Fragment dropped or frame not complete yet */
 				goto out;
 			}
 		}
 		wh = NULL;		/* no longer valid, catch any uses */
 
 		/*
 		 * Next strip any MSDU crypto bits.
 		 */
 		if (!ieee80211_crypto_demic(vap, key, m, 0)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "demic error");
 			vap->iv_stats.is_rx_demicfail++;
 			IEEE80211_NODE_STAT(ni, rx_demicfail);
 			goto out;
 		}
 
 		/* copy to listener after decrypt */
 		if (ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		/* tapped here, so the common exit path must not tap again */
 		need_tap = 0;
 
 		/*
 		 * Finally, strip the 802.11 header.
 		 */
 		m = ieee80211_decap(vap, m, hdrspace, qos);
 		if (m == NULL) {
 			/* XXX mask bit to check for both */
 			/* don't count Null data frames as errors */
 			if (subtype == IEEE80211_FC0_SUBTYPE_NODATA ||
 			    subtype == IEEE80211_FC0_SUBTYPE_QOS_NULL)
 				goto out;
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 			    ni->ni_macaddr, "data", "%s", "decap error");
 			vap->iv_stats.is_rx_decap++;
 			IEEE80211_NODE_STAT(ni, rx_decap);
 			goto err;
 		}
 		/* eh is left NULL for A-MSDU frames */
 		if (!(qos & IEEE80211_QOS_AMSDU))
 			eh = mtod(m, struct ether_header *);
 		else
 			eh = NULL;
 		if (!ieee80211_node_is_authorized(ni)) {
 			/*
 			 * Deny any non-PAE frames received prior to
 			 * authorization.  For open/shared-key
 			 * authentication the port is marked authorized
 			 * after authentication completes.  For 802.1x
 			 * the port is not marked authorized by the
 			 * authenticator until the handshake has completed.
 			 */
 			if (eh == NULL ||
 			    eh->ether_type != htons(ETHERTYPE_PAE)) {
 				IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_INPUT,
 				    ni->ni_macaddr, "data", "unauthorized or "
 				    "unknown port: ether type 0x%x len %u",
 				    eh == NULL ? -1 : eh->ether_type,
 				    m->m_pkthdr.len);
 				vap->iv_stats.is_rx_unauth++;
 				IEEE80211_NODE_STAT(ni, rx_unauth);
 				goto err;
 			}
 		} else {
 			/*
 			 * When denying unencrypted frames, discard
 			 * any non-PAE frames received without encryption.
 			 */
 			if ((vap->iv_flags & IEEE80211_F_DROPUNENC) &&
 			    ((has_decrypted == 0) && (m->m_flags & M_WEP) == 0) &&
 			    (is_hw_decrypted == 0) &&
 			    (eh == NULL ||
 			     eh->ether_type != htons(ETHERTYPE_PAE))) {
 				/*
 				 * Drop unencrypted frames.
 				 */
 				vap->iv_stats.is_rx_unencrypted++;
 				IEEE80211_NODE_STAT(ni, rx_unencrypted);
 				goto out;
 			}
 		}
 		/* XXX require HT? */
 		if (qos & IEEE80211_QOS_AMSDU) {
 			m = ieee80211_decap_amsdu(ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 		} else {
 #ifdef IEEE80211_SUPPORT_SUPERG
 			m = ieee80211_decap_fastframe(vap, ni, m);
 			if (m == NULL)
 				return IEEE80211_FC0_TYPE_DATA;
 #endif
 		}
 		/* the mbuf is handed off here; do not fall into the exit path */
 		ieee80211_deliver_data(vap, ni, m);
 		return IEEE80211_FC0_TYPE_DATA;
 
 	case IEEE80211_FC0_TYPE_MGT:
 		vap->iv_stats.is_rx_mgmt++;
 		IEEE80211_NODE_STAT(ni, rx_mgmt);
 		/* NOTE(review): the discard below is labelled "data" for a mgt frame */
 		if (dir != IEEE80211_FC1_DIR_NODS) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, "data", "incorrect dir 0x%x", dir);
 			vap->iv_stats.is_rx_wrongdir++;
 			goto err;
 		}
 		if (m->m_pkthdr.len < sizeof(struct ieee80211_frame)) {
 			IEEE80211_DISCARD_MAC(vap, IEEE80211_MSG_ANY,
 			    ni->ni_macaddr, "mgt", "too short: len %u",
 			    m->m_pkthdr.len);
 			vap->iv_stats.is_rx_tooshort++;
 			goto out;
 		}
 #ifdef IEEE80211_DEBUG
 		if (ieee80211_msg_debug(vap) || ieee80211_msg_dumppkts(vap)) {
 			if_printf(ifp, "received %s from %s rssi %d\n",
 			    ieee80211_mgt_subtype_name(subtype),
 			    ether_sprintf(wh->i_addr2), rssi);
 		}
 #endif
 		if (IEEE80211_IS_PROTECTED(wh)) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "WEP set but not permitted");
 			vap->iv_stats.is_rx_mgtdiscard++; /* XXX */
 			goto out;
 		}
 		/* the mbuf is still freed at "out" after the handler returns */
 		vap->iv_recv_mgmt(ni, m, subtype, rxs, rssi, nf);
 		goto out;
 
 	case IEEE80211_FC0_TYPE_CTL:
 		/* control frames are only counted here */
 		vap->iv_stats.is_rx_ctl++;
 		IEEE80211_NODE_STAT(ni, rx_ctrl);
 		goto out;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "bad", "frame type 0x%x", type);
 		/* should not come here */
 		break;
 	}
 err:
 	/* error exit: count an input error, then fall into the common exit */
 	if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 out:
 	/* common exit: tap the frame if that has not happened yet, then free it */
 	if (m != NULL) {
 		if (need_tap && ieee80211_radiotap_active_vap(vap))
 			ieee80211_radiotap_rx(vap, m);
 		m_freem(m);
 	}
 	return type;
 }
 
 static void
 wds_recv_mgmt(struct ieee80211_node *ni, struct mbuf *m0, int subtype,
     const struct ieee80211_rx_stats *rxs, int rssi, int nf)
 {
 	struct ieee80211vap *vap = ni->ni_vap;
 	struct ieee80211com *ic = ni->ni_ic;
 	struct ieee80211_frame *wh;
 	u_int8_t *frm, *efrm;
 
 	wh = mtod(m0, struct ieee80211_frame *);
 	frm = (u_int8_t *)&wh[1];
 	efrm = mtod(m0, u_int8_t *) + m0->m_len;
 	switch (subtype) {
 	case IEEE80211_FC0_SUBTYPE_ACTION:
 	case IEEE80211_FC0_SUBTYPE_ACTION_NOACK:
 		if (ni == vap->iv_bss) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "unknown node");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (!IEEE80211_ADDR_EQ(vap->iv_myaddr, wh->i_addr1)) {
 			/* NB: not interested in multicast frames. */
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "%s", "not for us");
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else if (vap->iv_state != IEEE80211_S_RUN) {
 			IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 			    wh, NULL, "wrong state %s",
 			    ieee80211_state_name[vap->iv_state]);
 			vap->iv_stats.is_rx_mgtdiscard++;
 		} else {
 			if (ieee80211_parse_action(ni, m0) == 0)
 				(void)ic->ic_recv_action(ni, wh, frm, efrm);
 		}
 		break;
 
 	case IEEE80211_FC0_SUBTYPE_ASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_ASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_REQ:
 	case IEEE80211_FC0_SUBTYPE_REASSOC_RESP:
 	case IEEE80211_FC0_SUBTYPE_PROBE_REQ:
 	case IEEE80211_FC0_SUBTYPE_PROBE_RESP:
 	case IEEE80211_FC0_SUBTYPE_TIMING_ADV:
 	case IEEE80211_FC0_SUBTYPE_BEACON:
 	case IEEE80211_FC0_SUBTYPE_ATIM:
 	case IEEE80211_FC0_SUBTYPE_DISASSOC:
 	case IEEE80211_FC0_SUBTYPE_AUTH:
 	case IEEE80211_FC0_SUBTYPE_DEAUTH:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_INPUT,
 		    wh, NULL, "%s", "not handled");
 		vap->iv_stats.is_rx_mgtdiscard++;
 		break;
 
 	default:
 		IEEE80211_DISCARD(vap, IEEE80211_MSG_ANY,
 		    wh, "mgt", "subtype 0x%x not handled", subtype);
 		vap->iv_stats.is_rx_badsubtype++;
 		break;
 	}
 }
diff --git a/sys/netgraph/netflow/netflow.c b/sys/netgraph/netflow/netflow.c
index 9c58674779ae..00fb0d9f68a7 100644
--- a/sys/netgraph/netflow/netflow.c
+++ b/sys/netgraph/netflow/netflow.c
@@ -1,1180 +1,1181 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010-2011 Alexander V. Chernikov <melifaro@ipfw.ru>
  * Copyright (c) 2004-2005 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 2001-2003 Roman V. Palagin <romanp@unshadow.net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $SourceForge: netflow.c,v 1.41 2004/09/05 11:41:10 glebius Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 #include <sys/param.h>
 #include <sys/bitstring.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/socket.h>
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/route/route_ctl.h>
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 
 #include <netinet6/in6_fib.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 
 #include <netgraph/netflow/netflow.h>
 #include <netgraph/netflow/netflow_v9.h>
 #include <netgraph/netflow/ng_netflow.h>
 
 #define	NBUCKETS	(65536)		/* must be power of 2 */
 
 /*
  * This hash is for TCP or UDP packets.
  * Every argument is parenthesized so that compound expressions passed by a
  * caller keep their own precedence inside the macro body.
  */
 #define FULL_HASH(addr1, addr2, port1, port2)		\
 	((((addr1) ^ ((addr1) >> 16) ^			\
 	htons((addr2) ^ ((addr2) >> 16))) ^		\
 	(port1) ^ htons(port2)) &			\
 	(NBUCKETS - 1))
 
 /* This hash is for all other IP packets. */
 #define ADDR_HASH(addr1, addr2)				\
 	(((addr1) ^ ((addr1) >> 16) ^			\
 	htons((addr2) ^ ((addr2) >> 16))) &		\
 	(NBUCKETS - 1))
 
 /* Macros to shorten logical constructions */
 /* XXX: priv must exist in namespace */
 #define	INACTIVE(fle)	(time_uptime - (fle)->f.last > priv->nfinfo_inact_t)
 #define	AGED(fle)	(time_uptime - (fle)->f.first > priv->nfinfo_act_t)
 #define	ISFREE(fle)	((fle)->f.packets == 0)
 
 /*
  * 4 is a magical number: statistically number of 4-packet flows is
  * bigger than 5,6,7...-packet flows by an order of magnitude. Most UDP/ICMP
  * scans are 1 packet (~ 90% of flow cache). TCP scans are 2-packet in case
  * of reachable host and 4-packet otherwise.
  */
 #define	SMALL(fle)	((fle)->f.packets <= 4)
 
 MALLOC_DEFINE(M_NETFLOW_HASH, "netflow_hash", "NetFlow hash");
 
 static int export_add(item_p, struct flow_entry *);
 static int export_send(priv_p, fib_export_p, item_p, int);
 
 #ifdef INET
 static int hash_insert(priv_p, struct flow_hash_entry *, struct flow_rec *,
     int, uint8_t, uint8_t);
 #endif
 #ifdef INET6
 static int hash6_insert(priv_p, struct flow_hash_entry *, struct flow6_rec *,
     int, uint8_t, uint8_t);
 #endif
 
 static void expire_flow(priv_p, fib_export_p, struct flow_entry *, int);
 
 #ifdef INET
 /*
  * Generate hash for a given flow record.
  *
  * FIB is not used here, because:
  * most VRFs will carry public IPv4 addresses, which are unique even
  * without the FIB. Private addresses can overlap, but this is worked out
  * via the flow_rec bcmp(), which covers the fib id. In the IPv6 world
  * addresses are all globally unique (this is not fully true, there is
  * FC00::/7 for example, but chances of address overlap are MUCH smaller).
  */
 static inline uint32_t
 ip_hash(struct flow_rec *r)
 {
 	/* TCP and UDP records use the hash that also mixes in the ports. */
 	if (r->r_ip_p == IPPROTO_TCP || r->r_ip_p == IPPROTO_UDP)
 		return FULL_HASH(r->r_src.s_addr, r->r_dst.s_addr,
 		    r->r_sport, r->r_dport);
 
 	/* Every other protocol is bucketed by the address pair alone. */
 	return ADDR_HASH(r->r_src.s_addr, r->r_dst.s_addr);
 }
 #endif
 
 #ifdef INET6
 /* Generate hash for a given flow6 record. Use lower 4 octets from v6 addresses */
 static inline uint32_t
 ip6_hash(struct flow6_rec *r)
 {
 	/* Only the last 32-bit word of each address feeds the hash. */
 	if (r->r_ip_p == IPPROTO_TCP || r->r_ip_p == IPPROTO_UDP)
 		return FULL_HASH(r->src.r_src6.__u6_addr.__u6_addr32[3],
 		    r->dst.r_dst6.__u6_addr.__u6_addr32[3], r->r_sport,
 		    r->r_dport);
 
 	/* Protocols other than TCP/UDP: addresses only. */
 	return ADDR_HASH(r->src.r_src6.__u6_addr.__u6_addr32[3],
 	    r->dst.r_dst6.__u6_addr.__u6_addr32[3]);
 }
 
 #endif
 
 /*
  * Detach export datagram from priv, if there is any.
  * If there is no, allocate a new one.
  */
 static item_p
 get_export_dgram(priv_p priv, fib_export_p fe)
 {
 	struct netflow_v5_export_dgram *dgram;
 	struct mbuf *m;
 	item_p item;
 
 	/* Take over the parked datagram (if any) under the export lock. */
 	mtx_lock(&fe->export_mtx);
 	item = fe->exp.item;
 	fe->exp.item = NULL;
 	mtx_unlock(&fe->export_mtx);
 	if (item != NULL)
 		return (item);
 
 	/* Nothing was parked: start an empty v5 datagram in a new mbuf. */
 	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return (NULL);
 	item = ng_package_data(m, NG_NOFLAGS);
 	if (item == NULL)
 		return (NULL);
 
 	dgram = mtod(m, struct netflow_v5_export_dgram *);
 	dgram->header.count = 0;
 	dgram->header.version = htons(NETFLOW_V5);
 	dgram->header.pad = 0;
 
 	return (item);
 }
 
 /*
  * Re-attach incomplete datagram back to priv.
  * If there is already another one, then send incomplete. */
 static void
 return_export_dgram(priv_p priv, fib_export_p fe, item_p item, int flags)
 {
 	int parked = 0;
 
 	/*
 	 * On SMP another thread may already have parked its own item in
 	 * the slot; only claim the slot while it is still empty.
 	 */
 	mtx_lock(&fe->export_mtx);
 	if (fe->exp.item == NULL) {
 		fe->exp.item = item;
 		parked = 1;
 	}
 	mtx_unlock(&fe->export_mtx);
 
 	/* Slot was occupied: send what we have to the collector instead. */
 	if (!parked)
 		export_send(priv, fe, item, flags);
 }
 
 /*
  * The flow is over. Call export_add() and free it. If datagram is
  * full, then call export_send().
  */
 static void
 expire_flow(priv_p priv, fib_export_p fe, struct flow_entry *fle, int flags)
 {
 	struct netflow_export_item exp;
 	/* IP version of the flow; picks the export path and the zone to free to. */
 	uint16_t version = fle->f.version;
 
 	/* NetFlow v5 export: only when the hook is set and the flow is IPv4. */
 	if ((priv->export != NULL) && (version == IPVERSION)) {
 		exp.item = get_export_dgram(priv, fe);
 		if (exp.item == NULL) {
 			/*
 			 * No datagram available. The v9 export below is
 			 * skipped as well, so it is counted as failed too.
 			 */
 			priv->nfinfo_export_failed++;
 			if (priv->export9 != NULL)
 				priv->nfinfo_export9_failed++;
 			/* fle definitely contains IPv4 flow. */
 			uma_zfree_arg(priv->zone, fle, priv);
 			return;
 		}
 
 		/*
 		 * A positive return means the datagram is full and is sent
 		 * with the caller's flags; otherwise it is handed back for
 		 * further filling.
 		 */
 		if (export_add(exp.item, fle) > 0)
 			export_send(priv, fe, exp.item, flags);
 		else
 			return_export_dgram(priv, fe, exp.item, NG_QUEUE);
 	}
 
 	/* NetFlow v9 export: attempted for any flow when the hook is set. */
 	if (priv->export9 != NULL) {
 		exp.item9 = get_export9_dgram(priv, fe, &exp.item9_opt);
 		if (exp.item9 == NULL) {
 			priv->nfinfo_export9_failed++;
 			/* Release the entry to the zone matching its version. */
 			if (version == IPVERSION)
 				uma_zfree_arg(priv->zone, fle, priv);
 #ifdef INET6
 			else if (version == IP6VERSION)
 				uma_zfree_arg(priv->zone6, fle, priv);
 #endif
 			else
 				panic("ng_netflow: Unknown IP proto: %d",
 				    version);
 			return;
 		}
 
 		if (export9_add(exp.item9, exp.item9_opt, fle) > 0)
 			export9_send(priv, fe, exp.item9, exp.item9_opt, flags);
 		else
 			return_export9_dgram(priv, fe, exp.item9,
 			    exp.item9_opt, NG_QUEUE);
 	}
 
 	/*
 	 * Normal completion: free the entry. Unlike the v9 failure path
 	 * above, an unrecognised version is not freed and does not panic here.
 	 */
 	if (version == IPVERSION)
 		uma_zfree_arg(priv->zone, fle, priv);
 #ifdef INET6
 	else if (version == IP6VERSION)
 		uma_zfree_arg(priv->zone6, fle, priv);
 #endif
 }
 
 /* Get a snapshot of node statistics */
 void
 ng_netflow_copyinfo(priv_p priv, struct ng_netflow_info *i)
 {
 
 	/* counter(9) statistics: accounted IPv4 and IPv6 traffic. */
 	i->nfinfo_packets = counter_u64_fetch(priv->nfinfo_packets);
 	i->nfinfo_bytes = counter_u64_fetch(priv->nfinfo_bytes);
 	i->nfinfo_packets6 = counter_u64_fetch(priv->nfinfo_packets6);
 	i->nfinfo_bytes6 = counter_u64_fetch(priv->nfinfo_bytes6);
 
 	/* counter(9) statistics: the "s"-prefixed IPv4 and IPv6 totals. */
 	i->nfinfo_spackets = counter_u64_fetch(priv->nfinfo_spackets);
 	i->nfinfo_sbytes = counter_u64_fetch(priv->nfinfo_sbytes);
 	i->nfinfo_spackets6 = counter_u64_fetch(priv->nfinfo_spackets6);
 	i->nfinfo_sbytes6 = counter_u64_fetch(priv->nfinfo_sbytes6);
 
 	/* counter(9) statistics: expirations by kind. */
 	i->nfinfo_inact_exp = counter_u64_fetch(priv->nfinfo_inact_exp);
 	i->nfinfo_act_exp = counter_u64_fetch(priv->nfinfo_act_exp);
 
 	/* Current number of entries held by each flow zone. */
 	i->nfinfo_used = uma_zone_get_cur(priv->zone);
 #ifdef INET6
 	i->nfinfo_used6 = uma_zone_get_cur(priv->zone6);
 #endif
 
 	/* Plain fields copied straight from the node's private data. */
 	i->nfinfo_inact_t = priv->nfinfo_inact_t;
 	i->nfinfo_act_t = priv->nfinfo_act_t;
 	i->nfinfo_alloc_fibs = priv->nfinfo_alloc_fibs;
 	i->nfinfo_realloc_mbuf = priv->nfinfo_realloc_mbuf;
 	i->nfinfo_alloc_failed = priv->nfinfo_alloc_failed;
 	i->nfinfo_export_failed = priv->nfinfo_export_failed;
 	i->nfinfo_export9_failed = priv->nfinfo_export9_failed;
 }
 
 /*
  * Insert a record into defined slot.
  *
  * First we get for us a free flow entry, then fill in all
  * possible fields in it.
  *
  * TODO: consider dropping hash mutex while filling in datagram,
  * as this was done in previous version. Need to test & profile
  * to be sure.
  */
 #ifdef INET
 /*
  * Create a new IPv4 flow entry from record r and append it to bucket hsh.
  *
  * priv      - node private data (flow zone, failure counter)
  * hsh       - hash bucket; its mutex must be held by the caller
  * r         - flow key, copied into the new entry
  * plen      - byte count of the first packet of the flow
  * flags     - NG_NETFLOW_CONF_* bits; can disable the route lookups
  * tcp_flags - TCP flags seen so far for this flow
  *
  * Returns 0 on success or ENOMEM when no entry could be allocated.
  */
 static int
 hash_insert(priv_p priv, struct flow_hash_entry *hsh, struct flow_rec *r,
 	int plen, uint8_t flags, uint8_t tcp_flags)
 {
 	struct flow_entry *fle;
 
 	mtx_assert(&hsh->mtx, MA_OWNED);
 
 	/*
 	 * Ask for zeroed memory. fle_o_ifx, next_hop, dst_mask and src_mask
 	 * are only written below when the matching route lookup is enabled
 	 * and succeeds, yet export_add() and ng_netflow_flow_show() copy
 	 * them out unconditionally; without M_ZERO they would carry stale
 	 * allocator contents.
 	 */
 	fle = uma_zalloc_arg(priv->zone, priv, M_NOWAIT | M_ZERO);
 	if (fle == NULL) {
 		priv->nfinfo_alloc_failed++;
 		return (ENOMEM);
 	}
 
 	/*
 	 * Now fle is totally ours. It is detached from all lists,
 	 * we can safely edit it.
 	 */
 	fle->f.version = IPVERSION;
 	bcopy(r, &fle->f.r, sizeof(struct flow_rec));
 	fle->f.bytes = plen;
 	fle->f.packets = 1;
 	fle->f.tcp_flags = tcp_flags;
 
 	fle->f.first = fle->f.last = time_uptime;
 
 	/*
 	 * First we do route table lookup on destination address. So we can
 	 * fill in out_ifx, dst_mask, nexthop, and dst_as in future releases.
 	 */
 	if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) {
 		struct rtentry *rt;
 		struct route_nhop_data rnd;
 
 		rt = fib4_lookup_rt(r->fib, fle->f.r.r_dst, 0, NHR_NONE, &rnd);
 		if (rt != NULL) {
 			struct in_addr addr;
 			uint32_t scopeid;
 			struct nhop_object *nh = nhop_select_func(rnd.rnd_nhop, 0);
 			/* Named masklen so it does not shadow the plen parameter. */
 			int masklen;
 
 			rt_get_inet_prefix_plen(rt, &addr, &masklen, &scopeid);
 			fle->f.fle_o_ifx = nh->nh_ifp->if_index;
 			if (nh->gw_sa.sa_family == AF_INET)
 				fle->f.next_hop = nh->gw4_sa.sin_addr;
 			/*
 			 * XXX we're leaving an empty gateway here for
 			 * IPv6 nexthops.
 			 */
 			fle->f.dst_mask = masklen;
 		}
 	}
 
 	/* Do route lookup on source address, to fill in src_mask. */
 	if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) {
 		struct rtentry *rt;
 		struct route_nhop_data rnd;
 
 		rt = fib4_lookup_rt(r->fib, fle->f.r.r_src, 0, NHR_NONE, &rnd);
 		if (rt != NULL) {
 			struct in_addr addr;
 			uint32_t scopeid;
 			int masklen;
 
 			rt_get_inet_prefix_plen(rt, &addr, &masklen, &scopeid);
 			fle->f.src_mask = masklen;
 		}
 	}
 
 	/* Push new flow at the end of hash. */
 	TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash);
 
 	return (0);
 }
 #endif
 
 #ifdef INET6
 /*
  * Create a new IPv6 flow entry from record r and append it to bucket hsh6.
  *
  * priv      - node private data (IPv6 flow zone, failure counter)
  * hsh6      - hash bucket; its mutex must be held by the caller
  * r         - flow key, copied into the new entry
  * plen      - byte count of the first packet of the flow
  * flags     - NG_NETFLOW_CONF_* bits; can disable the route lookups
  * tcp_flags - TCP flags seen so far for this flow
  *
  * Returns 0 on success or ENOMEM when no entry could be allocated.
  */
 static int
 hash6_insert(priv_p priv, struct flow_hash_entry *hsh6, struct flow6_rec *r,
 	int plen, uint8_t flags, uint8_t tcp_flags)
 {
 	struct flow6_entry *fle6;
 
 	mtx_assert(&hsh6->mtx, MA_OWNED);
 
 	/*
 	 * Ask for zeroed memory. fle_o_ifx, next_hop6, dst_mask and src_mask
 	 * are only written below when the matching route lookup is enabled
 	 * and succeeds, yet ng_netflow_flow_show() copies the whole record
 	 * out; without M_ZERO it would carry stale allocator contents.
 	 */
 	fle6 = uma_zalloc_arg(priv->zone6, priv, M_NOWAIT | M_ZERO);
 	if (fle6 == NULL) {
 		priv->nfinfo_alloc_failed++;
 		return (ENOMEM);
 	}
 
 	/*
 	 * Now fle is totally ours. It is detached from all lists,
 	 * we can safely edit it.
 	 */
 
 	fle6->f.version = IP6VERSION;
 	bcopy(r, &fle6->f.r, sizeof(struct flow6_rec));
 	fle6->f.bytes = plen;
 	fle6->f.packets = 1;
 	fle6->f.tcp_flags = tcp_flags;
 
 	fle6->f.first = fle6->f.last = time_uptime;
 
 	/*
 	 * First we do route table lookup on destination address. So we can
 	 * fill in out_ifx, dst_mask, nexthop, and dst_as in future releases.
 	 */
 	if ((flags & NG_NETFLOW_CONF_NODSTLOOKUP) == 0) {
 		struct rtentry *rt;
 		struct route_nhop_data rnd;
 
 		rt = fib6_lookup_rt(r->fib, &fle6->f.r.dst.r_dst6, 0, NHR_NONE, &rnd);
 		if (rt != NULL) {
 			struct in6_addr addr;
 			uint32_t scopeid;
 			struct nhop_object *nh = nhop_select_func(rnd.rnd_nhop, 0);
 			/* Named masklen so it does not shadow the plen parameter. */
 			int masklen;
 
 			rt_get_inet6_prefix_plen(rt, &addr, &masklen, &scopeid);
 			fle6->f.fle_o_ifx = nh->nh_ifp->if_index;
 			if (nh->gw_sa.sa_family == AF_INET6)
 				fle6->f.n.next_hop6 = nh->gw6_sa.sin6_addr;
 			fle6->f.dst_mask = masklen;
 		}
 	}
 
 	if ((flags & NG_NETFLOW_CONF_NOSRCLOOKUP) == 0) {
 		/* Do route lookup on source address, to fill in src_mask. */
 		struct rtentry *rt;
 		struct route_nhop_data rnd;
 
 		rt = fib6_lookup_rt(r->fib, &fle6->f.r.src.r_src6, 0, NHR_NONE, &rnd);
 		if (rt != NULL) {
 			struct in6_addr addr;
 			uint32_t scopeid;
 			int masklen;
 
 			rt_get_inet6_prefix_plen(rt, &addr, &masklen, &scopeid);
 			fle6->f.src_mask = masklen;
 		}
 	}
 
 	/* Push new flow at the end of hash. */
 	TAILQ_INSERT_TAIL(&hsh6->head, (struct flow_entry *)fle6, fle_hash);
 
 	return (0);
 }
 #endif
 
 /*
  * Non-static functions called from ng_netflow.c
  */
 
 /* Allocate memory and set up flow cache */
 void
 ng_netflow_cache_init(priv_p priv)
 {
 	struct flow_hash_entry *hsh;
 	int i;
 
 	/* Initialize cache UMA zone. */
 	priv->zone = uma_zcreate("NetFlow IPv4 cache",
 	    sizeof(struct flow_entry), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 	uma_zone_set_max(priv->zone, CACHESIZE);
 #ifdef INET6	
 	priv->zone6 = uma_zcreate("NetFlow IPv6 cache",
 	    sizeof(struct flow6_entry), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 	uma_zone_set_max(priv->zone6, CACHESIZE);
 #endif	
 
 	/* Allocate hash. */
 	priv->hash = malloc(NBUCKETS * sizeof(struct flow_hash_entry),
 	    M_NETFLOW_HASH, M_WAITOK | M_ZERO);
 
 	/* Initialize hash. */
 	for (i = 0, hsh = priv->hash; i < NBUCKETS; i++, hsh++) {
 		mtx_init(&hsh->mtx, "hash mutex", NULL, MTX_DEF);
 		TAILQ_INIT(&hsh->head);
 	}
 
 #ifdef INET6
 	/* Allocate hash. */
 	priv->hash6 = malloc(NBUCKETS * sizeof(struct flow_hash_entry),
 	    M_NETFLOW_HASH, M_WAITOK | M_ZERO);
 
 	/* Initialize hash. */
 	for (i = 0, hsh = priv->hash6; i < NBUCKETS; i++, hsh++) {
 		mtx_init(&hsh->mtx, "hash mutex", NULL, MTX_DEF);
 		TAILQ_INIT(&hsh->head);
 	}
 #endif
 
 	priv->nfinfo_bytes = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_packets = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_bytes6 = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_packets6 = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_sbytes = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_spackets = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_sbytes6 = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_spackets6 = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_act_exp = counter_u64_alloc(M_WAITOK);
 	priv->nfinfo_inact_exp = counter_u64_alloc(M_WAITOK);
 
 	ng_netflow_v9_cache_init(priv);
 	CTR0(KTR_NET, "ng_netflow startup()");
 }
 
 /* Initialize new FIB table for v5 and v9 */
 int
 ng_netflow_fib_init(priv_p priv, int fib)
 {
 	fib_export_p	fe = priv_to_fib(priv, fib);
 
 	CTR1(KTR_NET, "ng_netflow(): fib init: %d", fib);
 
 	/* Export state for this FIB already exists: nothing to do. */
 	if (fe != NULL)
 		return (0);
 
 	fe = malloc(sizeof(struct fib_export), M_NETGRAPH, M_NOWAIT | M_ZERO);
 	if (fe == NULL)
 		return (ENOMEM);
 
 	mtx_init(&fe->export_mtx, "export dgram lock", NULL, MTX_DEF);
 	mtx_init(&fe->export9_mtx, "export9 dgram lock", NULL, MTX_DEF);
 	fe->fib = fib;
 	fe->domain_id = fib;
 
 	/* Publish atomically; only one of several racing callers can win. */
 	if (atomic_cmpset_ptr((volatile uintptr_t *)&priv->fib_data[fib],
 	    (uintptr_t)NULL, (uintptr_t)fe) != 0) {
 		/* Increase counter for statistics */
 		CTR3(KTR_NET, "ng_netflow(): fib %d setup to %p (%p)",
 		    fib, fe, priv_to_fib(priv, fib));
 		priv->nfinfo_alloc_fibs++;
 		return (0);
 	}
 
 	/* FIB already set up by other ISR: discard our copy. */
 	CTR3(KTR_NET, "ng_netflow(): fib init: %d setup %p but got %p",
 	    fib, fe, priv_to_fib(priv, fib));
 	mtx_destroy(&fe->export_mtx);
 	mtx_destroy(&fe->export9_mtx);
 	free(fe, M_NETGRAPH);
 
 	return (0);
 }
 
 /* Free all flow cache memory. Called from node close method. */
 /*
  * Expire every cached flow, then release the zones, hash tables, per-FIB
  * export state and statistics counters owned by priv.
  */
 void
 ng_netflow_cache_flush(priv_p priv)
 {
 	struct flow_entry	*fle, *fle1;
 	struct flow_hash_entry	*hsh;
 	fib_export_p fe;
 	int i;
 
 	/*
 	 * We are going to free probably billable data.
 	 * Expire everything before freeing it.
 	 * No locking is required since callout is already drained.
 	 */
 	for (hsh = priv->hash, i = 0; i < NBUCKETS; hsh++, i++)
 		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
 			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 			fe = priv_to_fib(priv, fle->f.r.fib);
 			expire_flow(priv, fe, fle, NG_QUEUE);
 		}
 #ifdef INET6
 	for (hsh = priv->hash6, i = 0; i < NBUCKETS; hsh++, i++)
 		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
 			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 			fe = priv_to_fib(priv, fle->f.r.fib);
 			expire_flow(priv, fe, fle, NG_QUEUE);
 		}
 #endif
 
 	uma_zdestroy(priv->zone);
 	/* Destroy hash mutexes. */
 	for (i = 0, hsh = priv->hash; i < NBUCKETS; i++, hsh++)
 		mtx_destroy(&hsh->mtx);
 
 	/*
 	 * Free hash memory. The table was already walked above, so a NULL
 	 * guard here would be dead code.
 	 */
 	free(priv->hash, M_NETFLOW_HASH);
 #ifdef INET6
 	uma_zdestroy(priv->zone6);
 	/* Destroy hash mutexes. */
 	for (i = 0, hsh = priv->hash6; i < NBUCKETS; i++, hsh++)
 		mtx_destroy(&hsh->mtx);
 
 	/* Free hash memory. */
 	free(priv->hash6, M_NETFLOW_HASH);
 #endif
 
 	/* Push out any partially filled datagrams, then drop per-FIB state. */
 	for (i = 0; i < priv->maxfibs; i++) {
 		if ((fe = priv_to_fib(priv, i)) == NULL)
 			continue;
 
 		if (fe->exp.item != NULL)
 			export_send(priv, fe, fe->exp.item, NG_QUEUE);
 
 		if (fe->exp.item9 != NULL)
 			export9_send(priv, fe, fe->exp.item9,
 			    fe->exp.item9_opt, NG_QUEUE);
 
 		mtx_destroy(&fe->export_mtx);
 		mtx_destroy(&fe->export9_mtx);
 		free(fe, M_NETGRAPH);
 	}
 
 	counter_u64_free(priv->nfinfo_bytes);
 	counter_u64_free(priv->nfinfo_packets);
 	counter_u64_free(priv->nfinfo_bytes6);
 	counter_u64_free(priv->nfinfo_packets6);
 	counter_u64_free(priv->nfinfo_sbytes);
 	counter_u64_free(priv->nfinfo_spackets);
 	counter_u64_free(priv->nfinfo_sbytes6);
 	counter_u64_free(priv->nfinfo_spackets6);
 	counter_u64_free(priv->nfinfo_act_exp);
 	counter_u64_free(priv->nfinfo_inact_exp);
 
 	ng_netflow_v9_cache_flush(priv);
 }
 
 #ifdef INET
 /*
  * Account one IPv4 packet in the flow cache.
  *
  * Builds a flow_rec key from the IP header, looks it up in the matching
  * hash bucket (expiring stale entries met on the way) and either updates
  * the existing entry or inserts a new one via hash_insert().
  *
  * Returns EINVAL when the header is not IPv4 or its header length is
  * shorter than struct ip; otherwise 0 or the error from hash_insert().
  * upper_ptr and upper_proto are not referenced by this function.
  */
 int
 ng_netflow_flow_add(priv_p priv, fib_export_p fe, struct ip *ip,
     caddr_t upper_ptr, uint8_t upper_proto, uint8_t flags,
     unsigned int src_if_index)
 {
 	struct flow_entry	*fle, *fle1;
 	struct flow_hash_entry	*hsh;
 	struct flow_rec		r;
 	int			hlen, plen;
 	int			error = 0;
 	uint8_t			tcp_flags = 0;
 
 	/* The key is compared with bcmp() below, so start from all zeroes. */
 	bzero(&r, sizeof(r));
 
 	if (ip->ip_v != IPVERSION)
 		return (EINVAL);
 
 	/* ip_hl counts 32-bit words. */
 	hlen = ip->ip_hl << 2;
 	if (hlen < sizeof(struct ip))
 		return (EINVAL);
 
 	/* Assume L4 template by default */
 	r.flow_type = NETFLOW_V9_FLOW_V4_L4;
 
 	r.r_src = ip->ip_src;
 	r.r_dst = ip->ip_dst;
 	r.fib = fe->fib;
 
 	plen = ntohs(ip->ip_len);
 
 	r.r_ip_p = ip->ip_p;
 	r.r_tos = ip->ip_tos;
 
 	r.r_i_ifx = src_if_index;
 
 	/*
 	 * XXX NOTE: only first fragment of fragmented TCP, UDP and
 	 * ICMP packet will be recorded with proper s_port and d_port.
 	 * Following fragments will be recorded simply as IP packet with
 	 * ip_proto = ip->ip_p and s_port, d_port set to zero.
 	 * I know, it looks like bug. But I don't want to re-implement
 	 * ip packet assebmling here. Anyway, (in)famous trafd works this way -
 	 * and nobody complains yet :)
 	 */
 	if ((ip->ip_off & htons(IP_OFFMASK)) == 0)
 		switch(r.r_ip_p) {
 		case IPPROTO_TCP:
 		    {
 			struct tcphdr *tcp;
 
 			/* NOTE(review): no length check here; presumably the caller validated the packet. */
 			tcp = (struct tcphdr *)((caddr_t )ip + hlen);
 			r.r_sport = tcp->th_sport;
 			r.r_dport = tcp->th_dport;
 			tcp_flags = tcp->th_flags;
 			break;
 		    }
 		case IPPROTO_UDP:
 			/* Both ports are read as one 32-bit word right after the IP header. */
 			r.r_ports = *(uint32_t *)((caddr_t )ip + hlen);
 			break;
 		}
 
 	counter_u64_add(priv->nfinfo_packets, 1);
 	counter_u64_add(priv->nfinfo_bytes, plen);
 
 	/* Find hash slot. */
 	hsh = &priv->hash[ip_hash(&r)];
 
 	mtx_lock(&hsh->mtx);
 
 	/*
 	 * Go through hash and find our entry. If we encounter an
 	 * entry, that should be expired, purge it. We do a reverse
 	 * search since most active entries are first, and most
 	 * searches are done on most active entries.
 	 */
 	TAILQ_FOREACH_REVERSE_SAFE(fle, &hsh->head, fhead, fle_hash, fle1) {
 		if (bcmp(&r, &fle->f.r, sizeof(struct flow_rec)) == 0)
 			break;
 		if ((INACTIVE(fle) && SMALL(fle)) || AGED(fle)) {
 			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib),
 			    fle, NG_QUEUE);
 			counter_u64_add(priv->nfinfo_act_exp, 1);
 		}
 	}
 
 	/* fle is non-NULL only when the loop above left via break (a match). */
 	if (fle) {			/* An existent entry. */
 
 		fle->f.bytes += plen;
 		fle->f.packets ++;
 		fle->f.tcp_flags |= tcp_flags;
 		fle->f.last = time_uptime;
 
 		/*
 		 * We have the following reasons to expire flow in active way:
 		 * - it hit active timeout
 		 * - a TCP connection closed
 		 * - it is going to overflow counter
 		 */
 		if (tcp_flags & TH_FIN || tcp_flags & TH_RST || AGED(fle) ||
 		    (fle->f.bytes >= (CNTR_MAX - IF_MAXMTU)) ) {
 			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib),
 			    fle, NG_QUEUE);
 			counter_u64_add(priv->nfinfo_act_exp, 1);
 		} else {
 			/*
 			 * It is the newest, move it to the tail,
 			 * if it isn't there already. Next search will
 			 * locate it quicker.
 			 */
 			if (fle != TAILQ_LAST(&hsh->head, fhead)) {
 				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 				TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash);
 			}
 		}
 	} else				/* A new flow entry. */
 		error = hash_insert(priv, hsh, &r, plen, flags, tcp_flags);
 
 	mtx_unlock(&hsh->mtx);
 
 	return (error);
 }
 #endif
 
 #ifdef INET6
 /*
  * Account one IPv6 packet in the flow cache.
  *
  * Builds a flow6_rec key from the IPv6 header and the upper-layer header
  * at upper_ptr, looks it up in the matching hash6 bucket (expiring stale
  * entries met on the way) and either updates the existing entry or inserts
  * a new one via hash6_insert().
  *
  * Returns EINVAL when the version field is not IPv6; otherwise 0 or the
  * error from hash6_insert().
  */
 int
 ng_netflow_flow6_add(priv_p priv, fib_export_p fe, struct ip6_hdr *ip6,
     caddr_t upper_ptr, uint8_t upper_proto, uint8_t flags,
     unsigned int src_if_index)
 {
 	struct flow_entry	*fle = NULL, *fle1;
 	struct flow6_entry	*fle6;
 	struct flow_hash_entry	*hsh;
 	struct flow6_rec	r;
 	int			plen;
 	int			error = 0;
 	uint8_t			tcp_flags = 0;
 
 	/* check version */
 	if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
 		return (EINVAL);
 
 	/* The key is compared with bcmp() below, so start from all zeroes. */
 	bzero(&r, sizeof(r));
 
 	r.src.r_src6 = ip6->ip6_src;
 	r.dst.r_dst6 = ip6->ip6_dst;
 	r.fib = fe->fib;
 
 	/* Assume L4 template by default */
 	r.flow_type = NETFLOW_V9_FLOW_V6_L4;
 
 	/* ip6_plen is added to the fixed header size to get the packet length. */
 	plen = ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr);
 
 #if 0
 	/* XXX: set DSCP/CoS value */
 	r.r_tos = ip->ip_tos;
 #endif
 	/* Ports are only read when the caller did not flag a fragment. */
 	if ((flags & NG_NETFLOW_IS_FRAG) == 0) {
 		switch(upper_proto) {
 		case IPPROTO_TCP:
 		    {
 			struct tcphdr *tcp;
 
 			tcp = (struct tcphdr *)upper_ptr;
 			r.r_ports = *(uint32_t *)upper_ptr;
 			tcp_flags = tcp->th_flags;
 			break;
 		    }
  		case IPPROTO_UDP:
 		case IPPROTO_SCTP:
 			/* Both ports are read as the first 32-bit word of the header. */
 			r.r_ports = *(uint32_t *)upper_ptr;
 			break;
 		}
 	}	
 
 	r.r_ip_p = upper_proto;
 	r.r_i_ifx = src_if_index;
 
 	counter_u64_add(priv->nfinfo_packets6, 1);
 	counter_u64_add(priv->nfinfo_bytes6, plen);
 
 	/* Find hash slot. */
 	hsh = &priv->hash6[ip6_hash(&r)];
 
 	mtx_lock(&hsh->mtx);
 
 	/*
 	 * Go through hash and find our entry. If we encounter an
 	 * entry, that should be expired, purge it. We do a reverse
 	 * search since most active entries are first, and most
 	 * searches are done on most active entries.
 	 */
 	TAILQ_FOREACH_REVERSE_SAFE(fle, &hsh->head, fhead, fle_hash, fle1) {
 		if (fle->f.version != IP6VERSION)
 			continue;
 		/* List nodes are typed flow_entry; view this one as flow6_entry. */
 		fle6 = (struct flow6_entry *)fle;
 		if (bcmp(&r, &fle6->f.r, sizeof(struct flow6_rec)) == 0)
 			break;
 		if ((INACTIVE(fle6) && SMALL(fle6)) || AGED(fle6)) {
 			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle,
 			    NG_QUEUE);
 			counter_u64_add(priv->nfinfo_act_exp, 1);
 		}
 	}
 
 	/* fle is non-NULL only when the loop above left via break (a match). */
 	if (fle != NULL) {			/* An existent entry. */
 		fle6 = (struct flow6_entry *)fle;
 
 		fle6->f.bytes += plen;
 		fle6->f.packets ++;
 		fle6->f.tcp_flags |= tcp_flags;
 		fle6->f.last = time_uptime;
 
 		/*
 		 * We have the following reasons to expire flow in active way:
 		 * - it hit active timeout
 		 * - a TCP connection closed
 		 * - it is going to overflow counter
 		 */
 		if (tcp_flags & TH_FIN || tcp_flags & TH_RST || AGED(fle6) ||
 		    (fle6->f.bytes >= (CNTR_MAX - IF_MAXMTU)) ) {
 			TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 			expire_flow(priv, priv_to_fib(priv, fle->f.r.fib), fle,
 			    NG_QUEUE);
 			counter_u64_add(priv->nfinfo_act_exp, 1);
 		} else {
 			/*
 			 * It is the newest, move it to the tail,
 			 * if it isn't there already. Next search will
 			 * locate it quicker.
 			 */
 			if (fle != TAILQ_LAST(&hsh->head, fhead)) {
 				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 				TAILQ_INSERT_TAIL(&hsh->head, fle, fle_hash);
 			}
 		}
 	} else				/* A new flow entry. */
 		error = hash6_insert(priv, hsh, &r, plen, flags, tcp_flags);
 
 	mtx_unlock(&hsh->mtx);
 
 	return (error);
 }
 #endif
 
 /*
  * Return records from cache to userland.
  *
  * req  - request header: version (4 or 6), hash_id and list_id to resume
  *        from. req->list_id is cleared once the resume position is found.
  * resp - response header; records are written directly after it
  *        (assumes the caller sized the buffer for NREC_AT_ONCE or
  *        NREC6_AT_ONCE records - TODO confirm against the caller).
  *
  * Returns EINVAL for a hash_id outside [0, NBUCKETS) or an unsupported
  * version, 0 otherwise. On return resp->hash_id/list_id hold the position
  * userland should pass back to continue, or 0/0 when the walk finished.
  *
  * TODO: matching particular IP should be done in kernel, here.
  */
 int
 ng_netflow_flow_show(priv_p priv, struct ngnf_show_header *req,
 struct ngnf_show_header *resp)
 {
 	struct flow_hash_entry	*hsh;
 	struct flow_entry	*fle;
 	struct flow_entry_data	*data = (struct flow_entry_data *)(resp + 1);
 #ifdef INET6
 	struct flow6_entry_data	*data6 = (struct flow6_entry_data *)(resp + 1);
 #endif
 	int	i, max;
 
 	/*
 	 * hash_id comes from userland and lands in a signed int: reject
 	 * values that turn negative as well as those past the table end,
 	 * since i is used to index the hash table below.
 	 */
 	i = req->hash_id;
 	if (i < 0 || i >= NBUCKETS)
 		return (EINVAL);
 
 #ifdef INET6
 	if (req->version == 6) {
 		resp->version = 6;
 		hsh = priv->hash6 + i;
 		max = NREC6_AT_ONCE;
 	} else
 #endif
 	if (req->version == 4) {
 		resp->version = 4;
 		hsh = priv->hash + i;
 		max = NREC_AT_ONCE;
 	} else
 		return (EINVAL);
 
 	/*
 	 * We will transfer not more than NREC_AT_ONCE. More data
 	 * will come in next message.
 	 * We send current hash index and current record number in list
 	 * to userland, and userland should return it back to us.
 	 * Then, we will restart with new entry.
 	 *
 	 * The resulting cache snapshot can be inaccurate if flow expiration
 	 * is taking place on hash item between userland data requests for
 	 * this hash item id.
 	 */
 	resp->nentries = 0;
 	for (; i < NBUCKETS; hsh++, i++) {
 		int list_id;
 
 		if (mtx_trylock(&hsh->mtx) == 0) {
 			/*
 			 * Requested hash index is not available,
 			 * relay decision to skip or re-request data
 			 * to userland.
 			 */
 			resp->hash_id = i;
 			resp->list_id = 0;
 			return (0);
 		}
 
 		list_id = 0;
 		TAILQ_FOREACH(fle, &hsh->head, fle_hash) {
 			/* Someone is waiting for this bucket: yield to them. */
 			if (hsh->mtx.mtx_lock & MTX_CONTESTED) {
 				resp->hash_id = i;
 				resp->list_id = list_id;
 				mtx_unlock(&hsh->mtx);
 				return (0);
 			}
 
 			list_id++;
 			/* Search for particular record in list. */
 			if (req->list_id > 0) {
 				if (list_id < req->list_id)
 					continue;
 
 				/* Requested list position found. */
 				req->list_id = 0;
 			}
 #ifdef INET6
 			if (req->version == 6) {
 				struct flow6_entry *fle6;
 
 				fle6 = (struct flow6_entry *)fle;
 				bcopy(&fle6->f, data6 + resp->nentries,
 				    sizeof(fle6->f));
 			} else
 #endif
 				bcopy(&fle->f, data + resp->nentries,
 				    sizeof(fle->f));
 			resp->nentries++;
 			if (resp->nentries == max) {
 				resp->hash_id = i;
 				/*
 				 * If it was the last item in list
 				 * we simply skip to next hash_id.
 				 */
 				resp->list_id = list_id + 1;
 				mtx_unlock(&hsh->mtx);
 				return (0);
 			}
 		}
 		mtx_unlock(&hsh->mtx);
 	}
 
 	/* Whole table walked: tell userland there is nothing to resume. */
 	resp->hash_id = resp->list_id = 0;
 
 	return (0);
 }
 
 /* We have full datagram in privdata. Send it to export hook. */
 static int
 export_send(priv_p priv, fib_export_p fe, item_p item, int flags)
 {
 	struct mbuf *m = NGI_M(item);
 	struct netflow_v5_export_dgram *dgram;
 	struct netflow_v5_header *header;
 	struct timespec ts;
 	int error = 0;
 
 	dgram = mtod(m, struct netflow_v5_export_dgram *);
 	header = &dgram->header;
 
 	/*
 	 * Mbuf lengths: header plus the records present. header->count is
 	 * still in host byte order here.
 	 */
 	m->m_pkthdr.len = sizeof(struct netflow_v5_record) *
 	   header->count + sizeof(struct netflow_v5_header);
 	m->m_len = m->m_pkthdr.len;
 
 	/* Export header: timestamps first. */
 	header->sys_uptime = htonl(MILLIUPTIME(time_uptime));
 	getnanotime(&ts);
 	header->unix_secs  = htonl(ts.tv_sec);
 	header->unix_nsecs = htonl(ts.tv_nsec);
 
 	/* Export header: engine identification. */
 	header->engine_type = 0;
 	header->engine_id = fe->domain_id;
 	header->pad = 0;
 
 	/*
 	 * The sequence number advances by the record count, which is only
 	 * converted to network byte order afterwards.
 	 */
 	header->flow_seq = htonl(atomic_fetchadd_32(&fe->flow_seq,
 	    header->count));
 	header->count = htons(header->count);
 
 	/* Without an export hook the item is simply released. */
 	if (priv->export == NULL) {
 		NG_FREE_ITEM(item);
 		return (error);
 	}
 
 	NG_FWD_ITEM_HOOK_FLAGS(error, item, priv->export, flags);
 
 	return (error);
 }
 
 /* Add export record to dgram. */
 static int
 export_add(item_p item, struct flow_entry *fle)
 {
 	struct netflow_v5_export_dgram *dgram;
 	struct netflow_v5_header *header;
 	struct netflow_v5_record *rec;
 
 	dgram = mtod(NGI_M(item), struct netflow_v5_export_dgram *);
 	header = &dgram->header;
 
 	/* Claim the next free record slot and account for it. */
 	rec = &dgram->r[header->count++];
 
 	KASSERT(header->count <= NETFLOW_V5_MAX_RECORDS,
 	    ("ng_netflow: export too big"));
 
 	/* Flow key: copied as stored in the entry. */
 	rec->src_addr = fle->f.r.r_src.s_addr;
 	rec->dst_addr = fle->f.r.r_dst.s_addr;
 	rec->s_port   = fle->f.r.r_sport;
 	rec->d_port   = fle->f.r.r_dport;
 	rec->prot     = fle->f.r.r_ip_p;
 	rec->tos      = fle->f.r.r_tos;
 
 	/* Routing data gathered when the entry was created. */
 	rec->next_hop = fle->f.next_hop.s_addr;
 	rec->i_ifx    = htons(fle->f.fle_i_ifx);
 	rec->o_ifx    = htons(fle->f.fle_o_ifx);
 	rec->dst_mask = fle->f.dst_mask;
 	rec->src_mask = fle->f.src_mask;
 
 	/* Counters and timestamps, converted to network byte order. */
 	rec->packets  = htonl(fle->f.packets);
 	rec->octets   = htonl(fle->f.bytes);
 	rec->first    = htonl(MILLIUPTIME(fle->f.first));
 	rec->last     = htonl(MILLIUPTIME(fle->f.last));
 	rec->flags    = fle->f.tcp_flags;
 
 	/* Padding and not supported fields. */
 	rec->pad1     = 0;
 	rec->pad2     = 0;
 	rec->src_as = rec->dst_as = 0;
 
 	/* 1 tells the caller the datagram is now full; 0 means room left. */
 	return (header->count == NETFLOW_V5_MAX_RECORDS);
 }
 
 /*
  * Periodic flow expiry run.
  *
  * Callout handler: arg is the node's priv_p. Walks the IPv4 (and, with
  * INET6, the IPv6) hash, expires entries that qualify, and re-arms itself
  * to run again after one second (1*hz ticks).
  */
 void
 ng_netflow_expire(void *arg)
 {
 	struct flow_entry	*fle, *fle1;
 	struct flow_hash_entry	*hsh;
 	priv_p			priv = (priv_p )arg;
 	int			used, i;
 
 	/*
 	 * Going through all the cache.
 	 */
 	/* Local estimate of zone usage, decremented as entries are expired. */
 	used = uma_zone_get_cur(priv->zone);
 	for (hsh = priv->hash, i = 0; i < NBUCKETS; hsh++, i++) {
 		/*
 		 * Skip entries, that are already being worked on.
 		 */
 		if (mtx_trylock(&hsh->mtx) == 0)
 			continue;
 
 		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
 			/*
 			 * Interrupt thread wants this entry!
 			 * Quick! Quick! Bail out!
 			 */
 			if (hsh->mtx.mtx_lock & MTX_CONTESTED)
 				break;
 
 			/*
 			 * Don't expire aggressively while hash collision
 			 * ratio is predicted small.
 			 */
 			if (used <= (NBUCKETS*2) && !INACTIVE(fle))
 				break;
 
 			/*
 			 * Expire when aged, or when inactive and either
 			 * small or the cache is over NBUCKETS*2 entries.
 			 */
 			if ((INACTIVE(fle) && (SMALL(fle) ||
 			    (used > (NBUCKETS*2)))) || AGED(fle)) {
 				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 				expire_flow(priv, priv_to_fib(priv,
 				    fle->f.r.fib), fle, NG_NOFLAGS);
 				used--;
 				counter_u64_add(priv->nfinfo_inact_exp, 1);
 			}
 		}
 		mtx_unlock(&hsh->mtx);
 	}
 
 #ifdef INET6
 	/* Same pass over the IPv6 hash, using the IPv6 zone's usage. */
 	used = uma_zone_get_cur(priv->zone6);
 	for (hsh = priv->hash6, i = 0; i < NBUCKETS; hsh++, i++) {
 		struct flow6_entry	*fle6;
 
 		/*
 		 * Skip entries, that are already being worked on.
 		 */
 		if (mtx_trylock(&hsh->mtx) == 0)
 			continue;
 
 		TAILQ_FOREACH_SAFE(fle, &hsh->head, fle_hash, fle1) {
 			/* List nodes are typed flow_entry; view as flow6_entry. */
 			fle6 = (struct flow6_entry *)fle;
 			/*
 			 * Interrupt thread wants this entry!
 			 * Quick! Quick! Bail out!
 			 */
 			if (hsh->mtx.mtx_lock & MTX_CONTESTED)
 				break;
 
 			/*
 			 * Don't expire aggressively while hash collision
 			 * ratio is predicted small.
 			 */
 			if (used <= (NBUCKETS*2) && !INACTIVE(fle6))
 				break;
 
 			if ((INACTIVE(fle6) && (SMALL(fle6) ||
 			    (used > (NBUCKETS*2)))) || AGED(fle6)) {
 				TAILQ_REMOVE(&hsh->head, fle, fle_hash);
 				expire_flow(priv, priv_to_fib(priv,
 				    fle->f.r.fib), fle, NG_NOFLAGS);
 				used--;
 				counter_u64_add(priv->nfinfo_inact_exp, 1);
 			}
 		}
 		mtx_unlock(&hsh->mtx);
 	}
 #endif
 
 	/* Schedule next expire. */
 	callout_reset(&priv->exp_callout, (1*hz), &ng_netflow_expire,
 	    (void *)priv);
 }
diff --git a/sys/netgraph/netflow/ng_netflow.c b/sys/netgraph/netflow/ng_netflow.c
index ab0aff5cf7fc..fbb232b327d8 100644
--- a/sys/netgraph/netflow/ng_netflow.c
+++ b/sys/netgraph/netflow/ng_netflow.c
@@ -1,1065 +1,1066 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010-2011 Alexander V. Chernikov <melifaro@ipfw.ru>
  * Copyright (c) 2004-2005 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 2001-2003 Roman V. Palagin <romanp@unshadow.net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $SourceForge: ng_netflow.c,v 1.30 2004/09/05 11:37:43 glebius Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 #include <sys/ctype.h>
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/if_arp.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 #include <net/bpf.h>
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/sctp.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/netflow/netflow.h>
 #include <netgraph/netflow/netflow_v9.h>
 #include <netgraph/netflow/ng_netflow.h>
 
 /*
  * Netgraph methods.
  *
  * Forward declarations of the node-type callbacks implemented in this
  * file.  They are referenced by ng_netflow_typestruct, the node type
  * descriptor defined further down.
  */
 static ng_constructor_t	ng_netflow_constructor;
 static ng_rcvmsg_t	ng_netflow_rcvmsg;
 static ng_close_t	ng_netflow_close;
 static ng_shutdown_t	ng_netflow_rmnode;
 static ng_newhook_t	ng_netflow_newhook;
 static ng_rcvdata_t	ng_netflow_rcvdata;
 static ng_disconnect_t	ng_netflow_disconnect;
 
 /*
  * Parse types.
  *
  * Each control-message structure gets a field table (expanded from the
  * corresponding NG_NETFLOW_*_TYPE macro) and a struct ng_parse_type that
  * pairs the generic ng_parse_struct_type with that field table.  These
  * are referenced from the command list ng_netflow_cmds.
  */
 
 /* Parse type for struct ng_netflow_info */
 static const struct ng_parse_struct_field ng_netflow_info_type_fields[]
 	= NG_NETFLOW_INFO_TYPE;
 static const struct ng_parse_type ng_netflow_info_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_info_type_fields
 };
 
 /* Parse type for struct ng_netflow_ifinfo */
 static const struct ng_parse_struct_field ng_netflow_ifinfo_type_fields[]
 	= NG_NETFLOW_IFINFO_TYPE;
 static const struct ng_parse_type ng_netflow_ifinfo_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_ifinfo_type_fields
 };
 
 /* Parse type for struct ng_netflow_setdlt */
 static const struct ng_parse_struct_field ng_netflow_setdlt_type_fields[]
 	= NG_NETFLOW_SETDLT_TYPE;
 static const struct ng_parse_type ng_netflow_setdlt_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_setdlt_type_fields
 };
 
 /* Parse type for struct ng_netflow_setifindex */
 static const struct ng_parse_struct_field ng_netflow_setifindex_type_fields[]
 	= NG_NETFLOW_SETIFINDEX_TYPE;
 static const struct ng_parse_type ng_netflow_setifindex_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_setifindex_type_fields
 };
 
 /* Parse type for struct ng_netflow_settimeouts */
 static const struct ng_parse_struct_field ng_netflow_settimeouts_type_fields[]
 	= NG_NETFLOW_SETTIMEOUTS_TYPE;
 static const struct ng_parse_type ng_netflow_settimeouts_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_settimeouts_type_fields
 };
 
 /* Parse type for struct ng_netflow_setconfig */
 static const struct ng_parse_struct_field ng_netflow_setconfig_type_fields[]
 	= NG_NETFLOW_SETCONFIG_TYPE;
 static const struct ng_parse_type ng_netflow_setconfig_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_setconfig_type_fields
 };
 
 /* Parse type for struct ng_netflow_settemplate */
 static const struct ng_parse_struct_field ng_netflow_settemplate_type_fields[]
 	= NG_NETFLOW_SETTEMPLATE_TYPE;
 static const struct ng_parse_type ng_netflow_settemplate_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_settemplate_type_fields
 };
 
 /* Parse type for struct ng_netflow_setmtu */
 static const struct ng_parse_struct_field ng_netflow_setmtu_type_fields[]
 	= NG_NETFLOW_SETMTU_TYPE;
 static const struct ng_parse_type ng_netflow_setmtu_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_setmtu_type_fields
 };
 
 /* Parse type for struct ng_netflow_v9info */
 static const struct ng_parse_struct_field ng_netflow_v9info_type_fields[]
 	= NG_NETFLOW_V9INFO_TYPE;
 static const struct ng_parse_type ng_netflow_v9info_type = {
 	&ng_parse_struct_type,
 	&ng_netflow_v9info_type_fields
 };
 
 /*
  * List of commands and how to convert arguments to/from ASCII.
  *
  * Every entry lists, in order: the type cookie, the command number, the
  * ASCII command name, the parse type of the command argument and the
  * parse type of the response.  A NULL parse type marks a direction that
  * carries no convertible payload for that command (e.g. "info" takes no
  * argument, the set* commands produce no response data).  The list is
  * terminated by an all-zero entry.
  *
  * Note that NGM_NETFLOW_SHOW is handled in ng_netflow_rcvmsg() but has
  * no entry here, so no ASCII conversion is described for it.
  */
 static const struct ng_cmdlist ng_netflow_cmds[] = {
        {
 	 NGM_NETFLOW_COOKIE,
 	 NGM_NETFLOW_INFO,
 	 "info",
 	 NULL,
 	 &ng_netflow_info_type
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_IFINFO,
 	"ifinfo",
 	&ng_parse_uint16_type,
 	&ng_netflow_ifinfo_type
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_SETDLT,
 	"setdlt",
 	&ng_netflow_setdlt_type,
 	NULL
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_SETIFINDEX,
 	"setifindex",
 	&ng_netflow_setifindex_type,
 	NULL
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_SETTIMEOUTS,
 	"settimeouts",
 	&ng_netflow_settimeouts_type,
 	NULL
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_SETCONFIG,
 	"setconfig",
 	&ng_netflow_setconfig_type,
 	NULL
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_SETTEMPLATE,
 	"settemplate",
 	&ng_netflow_settemplate_type,
 	NULL
        },
        {
 	NGM_NETFLOW_COOKIE,
 	NGM_NETFLOW_SETMTU,
 	"setmtu",
 	&ng_netflow_setmtu_type,
 	NULL
        },
        {
 	 NGM_NETFLOW_COOKIE,
 	 NGM_NETFLOW_V9INFO,
 	 "v9info",
 	 NULL,
 	 &ng_netflow_v9info_type
        },
        { 0 }
 };
 
 /*
  * Netgraph node type descriptor.
  *
  * Binds the node type name to the callbacks implemented in this file and
  * to the command list above.  NETGRAPH_INIT() registers the type under
  * the module name "netflow".
  */
 static struct ng_type ng_netflow_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_NETFLOW_NODE_TYPE,
 	.constructor =	ng_netflow_constructor,
 	.rcvmsg =	ng_netflow_rcvmsg,
 	.close =	ng_netflow_close,
 	.shutdown =	ng_netflow_rmnode,
 	.newhook =	ng_netflow_newhook,
 	.rcvdata =	ng_netflow_rcvdata,
 	.disconnect =	ng_netflow_disconnect,
 	.cmdlist =	ng_netflow_cmds,
 };
 NETGRAPH_INIT(netflow, &ng_netflow_typestruct);
 
 /*
  * Node constructor, called at node creation.
  *
  * Allocates the zeroed per-node private state and the per-FIB export
  * table, links the node and its private data together, applies default
  * timeouts and per-interface configuration, prepares the expiry callout
  * and sets up the flow cache.  Always returns 0; both allocations use
  * M_WAITOK.
  */
 static int
 ng_netflow_constructor(node_p node)
 {
 	priv_p nf;
 	int ifidx;
 
 	/* Private state starts out fully zeroed. */
 	nf = malloc(sizeof(*nf), M_NETGRAPH, M_WAITOK | M_ZERO);
 
 	/* The fib table holds rt_numfibs (initially NULL) entries. */
 	nf->maxfibs = rt_numfibs;
 	nf->fib_data = malloc(sizeof(fib_export_p) * nf->maxfibs,
 	    M_NETGRAPH, M_WAITOK | M_ZERO);
 
 	/* Cross-link the node and its private data. */
 	NG_NODE_SET_PRIVATE(node, nf);
 	nf->node = node;
 
 	/* Default flow timeouts. */
 	nf->nfinfo_inact_t = INACTIVE_TIMEOUT;
 	nf->nfinfo_act_t = ACTIVE_TIMEOUT;
 
 	/* Every interface slot starts with the ingress-only configuration. */
 	ifidx = 0;
 	while (ifidx < NG_NETFLOW_MAXIFACES) {
 		nf->ifaces[ifidx].info.conf = NG_NETFLOW_CONF_INGRESS;
 		ifidx++;
 	}
 
 	/* Prepare the expiry callout; it is armed once an export hook appears. */
 	callout_init(&nf->exp_callout, 1);
 
 	/* Allocate memory and set up the flow cache. */
 	ng_netflow_cache_init(nf);
 
 	return (0);
 }
 
 /*
  * ng_netflow supports two hooks: data and export.
  * Incoming traffic is expected on data, and expired
  * netflow datagrams are sent to export.
  */
 static int
 ng_netflow_newhook(node_p node, hook_p hook, const char *name)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if (strncmp(name, NG_NETFLOW_HOOK_DATA,	/* an iface hook? */
 	    strlen(NG_NETFLOW_HOOK_DATA)) == 0) {
 		iface_p iface;
 		int ifnum = -1;
 		const char *cp;
 		char *eptr;
 
 		cp = name + strlen(NG_NETFLOW_HOOK_DATA);
 		if (!isdigit(*cp) || (cp[0] == '0' && cp[1] != '\0'))
 			return (EINVAL);
 
 		ifnum = (int)strtoul(cp, &eptr, 10);
 		if (*eptr != '\0' || ifnum < 0 || ifnum >= NG_NETFLOW_MAXIFACES)
 			return (EINVAL);
 
 		/* See if hook is already connected */
 		if (priv->ifaces[ifnum].hook != NULL)
 			return (EISCONN);
 
 		iface = &priv->ifaces[ifnum];
 
 		/* Link private info and hook together */
 		NG_HOOK_SET_PRIVATE(hook, iface);
 		iface->hook = hook;
 
 		/*
 		 * In most cases traffic accounting is done on an
 		 * Ethernet interface, so default data link type
 		 * will be DLT_EN10MB.
 		 */
 		iface->info.ifinfo_dlt = DLT_EN10MB;
 
 	} else if (strncmp(name, NG_NETFLOW_HOOK_OUT,
 	    strlen(NG_NETFLOW_HOOK_OUT)) == 0) {
 		iface_p iface;
 		int ifnum = -1;
 		const char *cp;
 		char *eptr;
 
 		cp = name + strlen(NG_NETFLOW_HOOK_OUT);
 		if (!isdigit(*cp) || (cp[0] == '0' && cp[1] != '\0'))
 			return (EINVAL);
 
 		ifnum = (int)strtoul(cp, &eptr, 10);
 		if (*eptr != '\0' || ifnum < 0 || ifnum >= NG_NETFLOW_MAXIFACES)
 			return (EINVAL);
 
 		/* See if hook is already connected */
 		if (priv->ifaces[ifnum].out != NULL)
 			return (EISCONN);
 
 		iface = &priv->ifaces[ifnum];
 
 		/* Link private info and hook together */
 		NG_HOOK_SET_PRIVATE(hook, iface);
 		iface->out = hook;
 
 	} else if (strcmp(name, NG_NETFLOW_HOOK_EXPORT) == 0) {
 		if (priv->export != NULL)
 			return (EISCONN);
 
 		/* Netflow version 5 supports 32-bit counters only */
 		if (CNTR_MAX == UINT64_MAX)
 			return (EINVAL);
 
 		priv->export = hook;
 
 		/* Exporter is ready. Let's schedule expiry. */
 		callout_reset(&priv->exp_callout, (1*hz), &ng_netflow_expire,
 		    (void *)priv);
 	} else if (strcmp(name, NG_NETFLOW_HOOK_EXPORT9) == 0) {
 		if (priv->export9 != NULL)
 			return (EISCONN);
 
 		priv->export9 = hook;
 
 		/* Exporter is ready. Let's schedule expiry. */
 		callout_reset(&priv->exp_callout, (1*hz), &ng_netflow_expire,
 		    (void *)priv);
 	} else
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Get a netgraph control message.
  *
  * Only messages carrying NGM_NETFLOW_COOKIE are understood.  Commands
  * that carry an argument validate msg->header.arglen against the size
  * of the expected structure before touching msg->data.  Commands that
  * produce a reply allocate it with M_NOWAIT, so the allocation may fail;
  * in that case ENOMEM is returned.
  *
  * On all paths the response (if any) is handed to NG_RESPOND_MSG() and
  * the request message is freed.  Returns 0 on success, EINVAL for an
  * unknown cookie/command or a malformed argument, ENOMEM if a response
  * could not be allocated, or the error from ng_netflow_flow_show().
  */
 static int
 ng_netflow_rcvmsg (node_p node, item_p item, hook_p lasthook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 
 	/* Deal with message according to cookie and command */
 	switch (msg->header.typecookie) {
 	case NGM_NETFLOW_COOKIE:
 		switch (msg->header.cmd) {
 		case NGM_NETFLOW_INFO:
 		    {
 			struct ng_netflow_info *i;
 
 			NG_MKRESPONSE(resp, msg, sizeof(struct ng_netflow_info),
 			    M_NOWAIT);
 			/* M_NOWAIT allocation may fail. */
 			if (resp == NULL)
 				ERROUT(ENOMEM);
 
 			i = (struct ng_netflow_info *)resp->data;
 			ng_netflow_copyinfo(priv, i);
 
 			break;
 		    }
 		case NGM_NETFLOW_IFINFO:
 		    {
 			struct ng_netflow_ifinfo *i;
 			const uint16_t *index;
 
 			if (msg->header.arglen != sizeof(uint16_t))
 				 ERROUT(EINVAL);
 
 			index  = (uint16_t *)msg->data;
 			if (*index >= NG_NETFLOW_MAXIFACES)
 				ERROUT(EINVAL);
 
 			/* connected iface? */
 			if (priv->ifaces[*index].hook == NULL)
 				 ERROUT(EINVAL);
 
 			NG_MKRESPONSE(resp, msg,
 			     sizeof(struct ng_netflow_ifinfo), M_NOWAIT);
 			/* M_NOWAIT allocation may fail. */
 			if (resp == NULL)
 				ERROUT(ENOMEM);
 
 			i = (struct ng_netflow_ifinfo *)resp->data;
 			memcpy((void *)i, (void *)&priv->ifaces[*index].info,
 			    sizeof(priv->ifaces[*index].info));
 
 			break;
 		    }
 		case NGM_NETFLOW_SETDLT:
 		    {
 			struct ng_netflow_setdlt *set;
 			struct ng_netflow_iface *iface;
 
 			if (msg->header.arglen !=
 			    sizeof(struct ng_netflow_setdlt))
 				ERROUT(EINVAL);
 
 			set = (struct ng_netflow_setdlt *)msg->data;
 			if (set->iface >= NG_NETFLOW_MAXIFACES)
 				ERROUT(EINVAL);
 			iface = &priv->ifaces[set->iface];
 
 			/* connected iface? */
 			if (iface->hook == NULL)
 				ERROUT(EINVAL);
 
 			/* Only the link types ng_netflow_rcvdata() parses. */
 			switch (set->dlt) {
 			case	DLT_EN10MB:
 				iface->info.ifinfo_dlt = DLT_EN10MB;
 				break;
 			case	DLT_RAW:
 				iface->info.ifinfo_dlt = DLT_RAW;
 				break;
 			default:
 				ERROUT(EINVAL);
 			}
 			break;
 		    }
 		case NGM_NETFLOW_SETIFINDEX:
 		    {
 			struct ng_netflow_setifindex *set;
 			struct ng_netflow_iface *iface;
 
 			if (msg->header.arglen !=
 			    sizeof(struct ng_netflow_setifindex))
 				ERROUT(EINVAL);
 
 			set = (struct ng_netflow_setifindex *)msg->data;
 			if (set->iface >= NG_NETFLOW_MAXIFACES)
 				ERROUT(EINVAL);
 			iface = &priv->ifaces[set->iface];
 
 			/* connected iface? */
 			if (iface->hook == NULL)
 				ERROUT(EINVAL);
 
 			iface->info.ifinfo_index = set->index;
 
 			break;
 		    }
 		case NGM_NETFLOW_SETTIMEOUTS:
 		    {
 			struct ng_netflow_settimeouts *set;
 
 			if (msg->header.arglen !=
 			    sizeof(struct ng_netflow_settimeouts))
 				ERROUT(EINVAL);
 
 			set = (struct ng_netflow_settimeouts *)msg->data;
 
 			priv->nfinfo_inact_t = set->inactive_timeout;
 			priv->nfinfo_act_t = set->active_timeout;
 
 			break;
 		    }
 		case NGM_NETFLOW_SETCONFIG:
 		    {
 			struct ng_netflow_setconfig *set;
 
 			if (msg->header.arglen !=
 			    sizeof(struct ng_netflow_setconfig))
 				ERROUT(EINVAL);
 
 			set = (struct ng_netflow_setconfig *)msg->data;
 
 			if (set->iface >= NG_NETFLOW_MAXIFACES)
 				ERROUT(EINVAL);
 
 			/* Unlike the other setters, no connected hook is required. */
 			priv->ifaces[set->iface].info.conf = set->conf;
 
 			break;
 		    }
 		case NGM_NETFLOW_SETTEMPLATE:
 		    {
 			struct ng_netflow_settemplate *set;
 
 			if (msg->header.arglen !=
 			    sizeof(struct ng_netflow_settemplate))
 				ERROUT(EINVAL);
 
 			set = (struct ng_netflow_settemplate *)msg->data;
 
 			priv->templ_packets = set->packets;
 			priv->templ_time = set->time;
 
 			break;
 		    }
 		case NGM_NETFLOW_SETMTU:
 		    {
 			struct ng_netflow_setmtu *set;
 
 			if (msg->header.arglen !=
 			    sizeof(struct ng_netflow_setmtu))
 				ERROUT(EINVAL);
 
 			set = (struct ng_netflow_setmtu *)msg->data;
 			if ((set->mtu < MIN_MTU) || (set->mtu > MAX_MTU))
 				ERROUT(EINVAL);
 
 			priv->mtu = set->mtu;
 
 			break;
 		    }
 		case NGM_NETFLOW_SHOW:
 			if (msg->header.arglen !=
 			    sizeof(struct ngnf_show_header))
 				ERROUT(EINVAL);
 
 			NG_MKRESPONSE(resp, msg, NGRESP_SIZE, M_NOWAIT);
 
 			if (!resp)
 				ERROUT(ENOMEM);
 
 			error = ng_netflow_flow_show(priv,
 			    (struct ngnf_show_header *)msg->data,
 			    (struct ngnf_show_header *)resp->data);
 
 			/* Do not send a reply for a failed request. */
 			if (error)
 				NG_FREE_MSG(resp);
 
 			break;
 		case NGM_NETFLOW_V9INFO:
 		    {
 			struct ng_netflow_v9info *i;
 
 			NG_MKRESPONSE(resp, msg,
 			    sizeof(struct ng_netflow_v9info), M_NOWAIT);
 			/* M_NOWAIT allocation may fail. */
 			if (resp == NULL)
 				ERROUT(ENOMEM);
 
 			i = (struct ng_netflow_v9info *)resp->data;
 			ng_netflow_copyv9info(priv, i);
 
 			break;
 		    }
 		default:
 			ERROUT(EINVAL);		/* unknown command */
 			break;
 		}
 		break;
 	default:
 		ERROUT(EINVAL);		/* incorrect cookie */
 		break;
 	}
 
 	/*
 	 * Take care of synchronous response, if any.
 	 * Free memory and return.
 	 */
 done:
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 
 	return (error);
 }
 
 /*
  * Receive data on hook.
  *
  * Data arriving on an export hook is rejected with EINVAL.  For an
  * interface hook pair (iface->hook / iface->out) the packet is accounted
  * if the direction is enabled in iface->info.conf and the packet has not
  * already been tagged as accounted (ONCE / THISONCE modes), and is then
  * forwarded to the opposite hook of the pair, if that hook is connected.
  *
  * Accounting requires the L2/L3/L4 headers to be contiguous in the first
  * mbuf; M_CHECK() below pulls them up step by step.  Since m_pullup()
  * may replace the head of the chain, every pointer into packet data must
  * be re-derived from mtod(m, ...) after each M_CHECK() before it is
  * dereferenced.
  *
  * Packets that cannot be parsed are not dropped: they take the "bypass"
  * path and are forwarded unaccounted.  The item and mbuf are freed here
  * only if there is no outgoing hook or m_pullup() failed.
  */
 static int
 ng_netflow_rcvdata (hook_p hook, item_p item)
 {
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	const iface_p iface = NG_HOOK_PRIVATE(hook);
 	hook_p out;
 	struct mbuf *m = NULL, *m_old = NULL;
 	struct ip *ip = NULL;
 	struct ip6_hdr *ip6 = NULL;
 	struct m_tag *mtag;
 	int pullup_len = 0;
 	uint8_t acct = 0, bypass = 0;
 	int error = 0, l3_off = 0;
 #if defined(INET) || defined(INET6)
 	int off;
 	uint8_t flags = 0, upper_proto = 0;
 	unsigned int src_if_index;
 	caddr_t upper_ptr = NULL;
 #endif
 	fib_export_p fe;
 	uint32_t fib;
 
 	if ((hook == priv->export) || (hook == priv->export9)) {
 		/*
 		 * Data arrived on export hook.
 		 * This must not happen.
 		 */
 		log(LOG_ERR, "ng_netflow: incoming data on export hook!\n");
 		ERROUT(EINVAL);
 	}
 
 	/* Pick the opposite hook and check whether this direction counts. */
 	if (hook == iface->hook) {
 		if ((iface->info.conf & NG_NETFLOW_CONF_INGRESS) == 0)
 			bypass = 1;
 		out = iface->out;
 	} else if (hook == iface->out) {
 		if ((iface->info.conf & NG_NETFLOW_CONF_EGRESS) == 0)
 			bypass = 1;
 		out = iface->hook;
 	} else
 		ERROUT(EINVAL);
 
 	/*
 	 * In ONCE mode any "already accounted" tag causes a bypass; in
 	 * THISONCE mode only a tag written by this very node does.
 	 */
 	if ((!bypass) && (iface->info.conf &
 	    (NG_NETFLOW_CONF_ONCE | NG_NETFLOW_CONF_THISONCE))) {
 		mtag = m_tag_locate(NGI_M(item), MTAG_NETFLOW,
 		    MTAG_NETFLOW_CALLED, NULL);
 		while (mtag != NULL) {
 			if ((iface->info.conf & NG_NETFLOW_CONF_ONCE) ||
 			    ((ng_ID_t *)(mtag + 1))[0] == NG_NODE_ID(node)) {
 				bypass = 1;
 				break;
 			}
 			mtag = m_tag_locate(NGI_M(item), MTAG_NETFLOW,
 			    MTAG_NETFLOW_CALLED, mtag);
 		}
 	}
 
 	if (bypass) {
 		if (out == NULL)
 			ERROUT(ENOTCONN);
 
 		NG_FWD_ITEM_HOOK(error, item, out);
 		return (error);
 	}
 
 	/* Tag the packet with our node ID; a failed allocation is tolerated. */
 	if (iface->info.conf &
 	    (NG_NETFLOW_CONF_ONCE | NG_NETFLOW_CONF_THISONCE)) {
 		mtag = m_tag_alloc(MTAG_NETFLOW, MTAG_NETFLOW_CALLED,
 		    sizeof(ng_ID_t), M_NOWAIT);
 		if (mtag) {
 			((ng_ID_t *)(mtag + 1))[0] = NG_NODE_ID(node);
 			m_tag_prepend(NGI_M(item), mtag);
 		}
 	}
 
 #if defined(INET) || defined(INET6)
 	/* Import configuration flags related to flow creation */
 	flags = iface->info.conf & NG_NETFLOW_FLOW_FLAGS;
 #endif
 
 	NGI_GET_M(item, m);
 	m_old = m;
 
 	/* Increase counters. */
 	iface->info.ifinfo_packets++;
 
 	/*
 	 * Depending on interface data link type and packet contents
 	 * we pullup enough data, so that ng_netflow_flow_add() does not
 	 * need to know about mbuf at all. We keep current length of data
 	 * needed to be contiguous in pullup_len. mtod() is done at the
 	 * very end one more time, since m may have changed after a pullup.
 	 *
 	 * In case of unrecognized data we don't return error, but just
 	 * pass data to downstream hook, if it is available.
 	 */
 
 #define	M_CHECK(length)	do {					\
 	pullup_len += (length);					\
 	if (((m)->m_pkthdr.len < (pullup_len)) ||		\
 	   ((pullup_len) > MHLEN)) {				\
 		error = EINVAL;					\
 		goto bypass;					\
 	} 							\
 	if ((m)->m_len < (pullup_len) &&			\
 	   (((m) = m_pullup((m),(pullup_len))) == NULL)) {	\
 		error = ENOBUFS;				\
 		goto done;					\
 	}							\
 } while (0)
 
 	switch (iface->info.ifinfo_dlt) {
 	case DLT_EN10MB:	/* Ethernet */
 	    {
 		struct ether_header *eh;
 		uint16_t etype;
 
 		M_CHECK(sizeof(struct ether_header));
 		eh = mtod(m, struct ether_header *);
 
 		/* Make sure this is IP frame. */
 		etype = ntohs(eh->ether_type);
 		switch (etype) {
 #ifdef INET
 		case ETHERTYPE_IP:
 			M_CHECK(sizeof(struct ip));
 			eh = mtod(m, struct ether_header *);
 			ip = (struct ip *)(eh + 1);
 			l3_off = sizeof(struct ether_header);
 			break;
 #endif
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 			/*
 			 * m_pullup() called by M_CHECK() pullups
 			 * kern.ipc.max_protohdr (default 60 bytes)
 			 * which is enough.
 			 */
 			M_CHECK(sizeof(struct ip6_hdr));
 			eh = mtod(m, struct ether_header *);
 			ip6 = (struct ip6_hdr *)(eh + 1);
 			l3_off = sizeof(struct ether_header);
 			break;
 #endif
 		case ETHERTYPE_VLAN:
 		    {
 			struct ether_vlan_header *evh;
 
 			M_CHECK(sizeof(struct ether_vlan_header) -
 			    sizeof(struct ether_header));
 			evh = mtod(m, struct ether_vlan_header *);
 			etype = ntohs(evh->evl_proto);
 			l3_off = sizeof(struct ether_vlan_header);
 
 			/*
 			 * As in the untagged cases above, evh must be
 			 * re-fetched after M_CHECK(): m_pullup() may have
 			 * replaced the mbuf the old pointer referred to.
 			 */
 			if (etype == ETHERTYPE_IP) {
 #ifdef INET
 				M_CHECK(sizeof(struct ip));
 				evh = mtod(m, struct ether_vlan_header *);
 				ip = (struct ip *)(evh + 1);
 				break;
 #endif
 #ifdef INET6
 			} else if (etype == ETHERTYPE_IPV6) {
 				M_CHECK(sizeof(struct ip6_hdr));
 				evh = mtod(m, struct ether_vlan_header *);
 				ip6 = (struct ip6_hdr *)(evh + 1);
 				break;
 #endif
 			}
 		    }
 			/* FALLTHROUGH: unsupported encapsulated protocol */
 		default:
 			goto bypass;	/* pass this frame */
 		}
 		break;
 	    }
 	case DLT_RAW:		/* IP packets */
 		M_CHECK(sizeof(struct ip));
 		ip = mtod(m, struct ip *);
 		/* l3_off is already zero */
 #ifdef INET6
 		/*
 		 * If INET6 is not defined IPv6 packets
 		 * will be discarded in ng_netflow_flow_add().
 		 */
 		if (ip->ip_v == IP6VERSION) {
 			ip = NULL;
 			M_CHECK(sizeof(struct ip6_hdr) - sizeof(struct ip));
 			ip6 = mtod(m, struct ip6_hdr *);
 		}
 #endif
 #ifndef INET
 		ip = NULL;
 #endif
 		break;
 	default:
 		goto bypass;
 		break;
 	}
 
 #if defined(INET) || defined(INET6)
 	off = pullup_len;
 #endif
 
 	if ((ip != NULL) && ((ip->ip_off & htons(IP_OFFMASK)) == 0)) {
 		if ((ip->ip_v != IPVERSION) ||
 		    ((ip->ip_hl << 2) < sizeof(struct ip)))
 			goto bypass;
 		/*
 		 * In case of IPv4 header with options, we haven't pulled
 		 * up enough, yet.
 		 */
 		M_CHECK((ip->ip_hl << 2) - sizeof(struct ip));
 
 		/* The pullup may have moved the header; refresh before use. */
 		ip = (struct ip *)(mtod(m, caddr_t) + l3_off);
 
 #if defined(INET) || defined(INET6)
 		/* Save upper layer offset and proto */
 		off = pullup_len;
 		upper_proto = ip->ip_p;
 #endif
 
 		/*
 		 * XXX: in case of wrong upper layer header we will
 		 * forward this packet but skip this record in netflow.
 		 */
 		switch (ip->ip_p) {
 		case IPPROTO_TCP:
 			M_CHECK(sizeof(struct tcphdr));
 			break;
 		case IPPROTO_UDP:
 			M_CHECK(sizeof(struct udphdr));
 			break;
 		case IPPROTO_SCTP:
 			M_CHECK(sizeof(struct sctphdr));
 			break;
 		}
 	} else if (ip != NULL) {
 		/*
 		 * Nothing to save except upper layer proto,
 		 * since this is a packet fragment.
 		 */
 #if defined(INET) || defined(INET6)
 		flags |= NG_NETFLOW_IS_FRAG;
 		upper_proto = ip->ip_p;
 #endif
 		if ((ip->ip_v != IPVERSION) ||
 		    ((ip->ip_hl << 2) < sizeof(struct ip)))
 			goto bypass;
 #ifdef INET6
 	} else if (ip6 != NULL) {
 		int cur = ip6->ip6_nxt, hdr_off = 0;
 		struct ip6_ext *ip6e;
 		struct ip6_frag *ip6f;
 
 		/* IPv6 flows are only accounted when export9 is connected. */
 		if (priv->export9 == NULL)
 			goto bypass;
 
 		/* Save upper layer info. */
 		off = pullup_len;
 		upper_proto = cur;
 
 		if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
 			goto bypass;
 
 		/*
 		 * Loop through IPv6 extended headers to get upper
 		 * layer header / frag.
 		 *
 		 * NOTE(review): off advances by the length taken from the
 		 * extension header, while pullup_len only grows by the
 		 * fixed structure sizes passed to M_CHECK().  Whether
 		 * mtod(m) + off is always inside the pulled-up data for
 		 * long extension headers should be confirmed.
 		 */
 		for (;;) {
 			switch (cur) {
 			/*
 			 * Same as in IPv4, we can forward a 'bad'
 			 * packet without accounting.
 			 */
 			case IPPROTO_TCP:
 				M_CHECK(sizeof(struct tcphdr));
 				goto loopend;
 			case IPPROTO_UDP:
 				M_CHECK(sizeof(struct udphdr));
 				goto loopend;
 			case IPPROTO_SCTP:
 				M_CHECK(sizeof(struct sctphdr));
 				goto loopend;
 
 			/* Loop until 'real' upper layer headers */
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_ROUTING:
 			case IPPROTO_DSTOPTS:
 				M_CHECK(sizeof(struct ip6_ext));
 				ip6e = (struct ip6_ext *)(mtod(m, caddr_t) +
 				    off);
 				upper_proto = ip6e->ip6e_nxt;
 				hdr_off = (ip6e->ip6e_len + 1) << 3;
 				break;
 
 			/* RFC4302, can be before DSTOPTS */
 			case IPPROTO_AH:
 				M_CHECK(sizeof(struct ip6_ext));
 				ip6e = (struct ip6_ext *)(mtod(m, caddr_t) +
 				    off);
 				upper_proto = ip6e->ip6e_nxt;
 				hdr_off = (ip6e->ip6e_len + 2) << 2;
 				break;
 
 			case IPPROTO_FRAGMENT:
 				M_CHECK(sizeof(struct ip6_frag));
 				ip6f = (struct ip6_frag *)(mtod(m, caddr_t) +
 				    off);
 				upper_proto = ip6f->ip6f_nxt;
 				hdr_off = sizeof(struct ip6_frag);
 				off += hdr_off;
 				flags |= NG_NETFLOW_IS_FRAG;
 				goto loopend;
 
 #if 0
 			case IPPROTO_NONE:
 				goto loopend;
 #endif
 			/*
 			 * Any unknown header (new extension or IPv6/IPv4
 			 * header for tunnels) ends loop.
 			 */
 			default:
 				goto loopend;
 			}
 
 			off += hdr_off;
 			cur = upper_proto;
 		}
 #endif
 	}
 #undef	M_CHECK
 
 #ifdef INET6
 loopend:
 #endif
 	/* Count real reallocations done by M_CHECK() / m_pullup(). */
 	if (m != m_old)
 		priv->nfinfo_realloc_mbuf++;
 
 	/*
 	 * Restore ip/ipv6 pointer.  This is done unconditionally: comparing
 	 * m against m_old cannot detect every reallocation (a later
 	 * allocation may reuse the old address), and recomputing the
 	 * pointer is harmless when nothing moved.
 	 */
 	if (ip != NULL)
 		ip = (struct ip *)(mtod(m, caddr_t) + l3_off);
 	else if (ip6 != NULL)
 		ip6 = (struct ip6_hdr *)(mtod(m, caddr_t) + l3_off);
 
 #if defined(INET) || defined(INET6)
 	upper_ptr = (caddr_t)(mtod(m, caddr_t) + off);
 
 	/* Determine packet input interface. Prefer configured. */
 	src_if_index = 0;
 	if (hook == iface->out || iface->info.ifinfo_index == 0) {
 		if (m->m_pkthdr.rcvif != NULL)
 			src_if_index = m->m_pkthdr.rcvif->if_index;
 	} else
 		src_if_index = iface->info.ifinfo_index;
 #endif
 
 	/* Check packet FIB */
 	fib = M_GETFIB(m);
 	if (fib >= priv->maxfibs) {
 		CTR2(KTR_NET, "ng_netflow_rcvdata(): packet fib %d is out of "
 		    "range of available fibs: 0 .. %d",
 		    fib, priv->maxfibs);
 		goto bypass;
 	}
 
 	if ((fe = priv_to_fib(priv, fib)) == NULL) {
 		/* Setup new FIB */
 		if (ng_netflow_fib_init(priv, fib) != 0) {
 			/* malloc() failed */
 			goto bypass;
 		}
 
 		fe = priv_to_fib(priv, fib);
 	}
 
 #ifdef INET
 	if (ip != NULL)
 		error = ng_netflow_flow_add(priv, fe, ip, upper_ptr,
 		    upper_proto, flags, src_if_index);
 #endif
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET6
 	if (ip6 != NULL)
 		error = ng_netflow_flow6_add(priv, fe, ip6, upper_ptr,
 		    upper_proto, flags, src_if_index);
 #endif
 	else
 		goto bypass;
 
 	acct = 1;
 bypass:
 	if (out != NULL) {
 		if (acct == 0) {
 			/* Accounting failure */
 			if (ip != NULL) {
 				counter_u64_add(priv->nfinfo_spackets, 1);
 				counter_u64_add(priv->nfinfo_sbytes,
 				    m->m_pkthdr.len);
 			} else if (ip6 != NULL) {
 				counter_u64_add(priv->nfinfo_spackets6, 1);
 				counter_u64_add(priv->nfinfo_sbytes6,
 				    m->m_pkthdr.len);
 			}
 		}
 
 		/* XXX: error gets overwritten here */
 		NG_FWD_NEW_DATA(error, item, out, m);
 		return (error);
 	}
 done:
 	if (item)
 		NG_FREE_ITEM(item);
 	if (m)
 		NG_FREE_M(m);
 
 	return (error);
 }
 
 /*
  * The node is about to be shut down: drain the expiry callout, then
  * flush the flow cache.
  */
 static int
 ng_netflow_close(node_p node)
 {
 	priv_p nf = NG_NODE_PRIVATE(node);
 
 	/* Wait for the expiry callout to finish before touching the cache. */
 	callout_drain(&nf->exp_callout);
 
 	ng_netflow_cache_flush(nf);
 
 	return (0);
 }
 
 /*
  * Local shutdown processing: detach the private data from the node,
  * drop the node reference and release the private memory.
  */
 static int
 ng_netflow_rmnode(node_p node)
 {
 	priv_p nf = NG_NODE_PRIVATE(node);
 
 	/* Unlink node and private data. */
 	NG_NODE_SET_PRIVATE(node, NULL);
 	NG_NODE_UNREF(nf->node);
 
 	/* The fib table was allocated separately and goes first. */
 	free(nf->fib_data, M_NETGRAPH);
 	free(nf, M_NETGRAPH);
 
 	return (0);
 }
 
 /*
  * Hook disconnection.
  *
  * Clears whichever reference (interface side or export side) points at
  * the departing hook.  The expiry callout is drained when the departing
  * export hook leaves no other export hook connected.  When the node has
  * no hooks left it removes itself.
  */
 static int
 ng_netflow_disconnect(hook_p hook)
 {
 	node_p nd = NG_HOOK_NODE(hook);
 	priv_p nf = NG_NODE_PRIVATE(nd);
 	iface_p ifc = NG_HOOK_PRIVATE(hook);
 
 	/* Interface hooks carry their iface as hook private data. */
 	if (ifc != NULL && ifc->hook == hook)
 		ifc->hook = NULL;
 	if (ifc != NULL && ifc->out == hook)
 		ifc->out = NULL;
 
 	/* If the last export hook disconnects, stop running expire(). */
 	if (nf->export == hook) {
 		if (nf->export9 == NULL)
 			callout_drain(&nf->exp_callout);
 		nf->export = NULL;
 	}
 
 	if (nf->export9 == hook) {
 		if (nf->export == NULL)
 			callout_drain(&nf->exp_callout);
 		nf->export9 = NULL;
 	}
 
 	/* Removal of the last link destroys the node. */
 	if (NG_NODE_NUMHOOKS(nd) == 0)
 		ng_rmnode_self(nd);
 
 	return (0);
 }
diff --git a/sys/netgraph/ng_eiface.c b/sys/netgraph/ng_eiface.c
index 5b12ecc70c11..1f4699d71fcb 100644
--- a/sys/netgraph/ng_eiface.c
+++ b/sys/netgraph/ng_eiface.c
@@ -1,685 +1,686 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  *
  * Copyright (c) 1999-2001, Vitaly V Belekhov
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/sockio.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_eiface.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if_arp.h>
 
 /*
  * Control messages of this node type that have an ASCII form.
  * Each entry gives: type cookie, command, ASCII command name, parse type
  * of the argument and parse type of the response (NULL when absent).
  */
 static const struct ng_cmdlist ng_eiface_cmdlist[] = {
 	/* Query the interface name; the reply is a string. */
 	{
 	  NGM_EIFACE_COOKIE,
 	  NGM_EIFACE_GET_IFNAME,
 	  "getifname",
 	  NULL,
 	  &ng_parse_string_type
 	},
 	/* Set the link-level address; the argument is an Ethernet address. */
 	{
 	  NGM_EIFACE_COOKIE,
 	  NGM_EIFACE_SET,
 	  "set",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	/* End-of-list marker. */
 	{ 0 }
 };
 
 /*
  * Node private data.
  * One instance is allocated per node by ng_eiface_constructor() and is
  * reachable both from the node (NG_NODE_PRIVATE) and from the ifnet
  * (if_softc).
  */
 struct ng_eiface_private {
 	struct ifnet	*ifp;		/* per-interface network data */
 	struct ifmedia	media;		/* (fake) media information */
 	int		link_status;	/* fake link status (IFM_AVALID/IFM_ACTIVE bits) */
 	int		unit;		/* Interface unit number */
 	node_p		node;		/* Our netgraph node */
 	hook_p		ether;		/* Hook for ethernet stream */
 };
 typedef struct ng_eiface_private *priv_p;
 
 /* Interface methods (installed into the ifnet by the constructor) */
 static void	ng_eiface_init(void *xsc);
 static void	ng_eiface_start(struct ifnet *ifp);
 static int	ng_eiface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
 #ifdef DEBUG
 /* Only built for DEBUG kernels; logs every ioctl seen by the interface. */
 static void	ng_eiface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data);
 #endif
 
 /* Netgraph methods */
 static int		ng_eiface_mod_event(module_t, int, void *);
 static ng_constructor_t	ng_eiface_constructor;
 static ng_rcvmsg_t	ng_eiface_rcvmsg;
 static ng_shutdown_t	ng_eiface_rmnode;
 static ng_newhook_t	ng_eiface_newhook;
 static ng_rcvdata_t	ng_eiface_rcvdata;
 static ng_disconnect_t	ng_eiface_disconnect;
 
 /* Node type descriptor: binds the methods above to the "eiface" type */
 static struct ng_type typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_EIFACE_NODE_TYPE,
 	.mod_event =	ng_eiface_mod_event,
 	.constructor =	ng_eiface_constructor,
 	.rcvmsg =	ng_eiface_rcvmsg,
 	.shutdown =	ng_eiface_rmnode,
 	.newhook =	ng_eiface_newhook,
 	.rcvdata =	ng_eiface_rcvdata,
 	.disconnect =	ng_eiface_disconnect,
 	.cmdlist =	ng_eiface_cmdlist
 };
 NETGRAPH_INIT(eiface, &typestruct);
 
 /*
  * Per-vnet allocator of interface unit numbers; created in
  * vnet_ng_eiface_init() and destroyed in vnet_ng_eiface_uninit().
  */
 VNET_DEFINE_STATIC(struct unrhdr *, ng_eiface_unit);
 #define	V_ng_eiface_unit		VNET(ng_eiface_unit)
 
 /************************************************************************
 			INTERFACE STUFF
  ************************************************************************/
 
 /*
  * ioctl handler of the virtual interface.
  *
  * Returns 0 on success or an errno value.  Requests that are not listed
  * in the switch are rejected with EINVAL.
  */
 static int
 ng_eiface_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifreq *const req = (struct ifreq *)data;
 	const priv_p sc = (priv_p)ifp->if_softc;
 
 #ifdef DEBUG
 	ng_eiface_print_ioctl(ifp, command, data);
 #endif
 	switch (command) {
 	/* Address handling is mostly done at a higher layer. */
 	case SIOCSIFADDR:
 		return (ether_ioctl(ifp, command, data));
 
 	case SIOCGIFADDR:
 		return (0);
 
 	/* Bring the driver state in line with the administrative flag. */
 	case SIOCSIFFLAGS:
 	    {
 		const int up = (ifp->if_flags & IFF_UP) != 0;
 		const int running =
 		    (ifp->if_drv_flags & IFF_DRV_RUNNING) != 0;
 
 		if (up && !running) {
 			/* Marked up but stopped: start it. */
 			ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE);
 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		} else if (!up && running) {
 			/* Marked down but running: stop it. */
 			ifp->if_drv_flags &= ~(IFF_DRV_RUNNING |
 			    IFF_DRV_OACTIVE);
 		}
 		return (0);
 	    }
 
 	/* MTU changes are accepted only inside the supported range. */
 	case SIOCSIFMTU:
 		if (req->ifr_mtu < NG_EIFACE_MTU_MIN ||
 		    req->ifr_mtu > NG_EIFACE_MTU_MAX)
 			return (EINVAL);
 		ifp->if_mtu = req->ifr_mtu;
 		return (0);
 
 	/* (Fake) media handling is delegated to ifmedia. */
 	case SIOCSIFMEDIA:
 	case SIOCGIFMEDIA:
 		return (ifmedia_ioctl(ifp, req, &sc->media, command));
 
 	/* Multicast changes are accepted but nothing is done for them. */
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		return (0);
 
 	case SIOCSIFPHYS:
 		return (EOPNOTSUPP);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 static void
 ng_eiface_init(void *xsc)
 {
 	priv_p sc = xsc;
 	struct ifnet *ifp = sc->ifp;
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 }
 
 /*
  * Drain the interface send queue into the "ether" hook.
  *
  * Invoked through ng_send_fn() from ng_eiface_start(), i.e. after passing
  * through the netgraph locking.  arg1 is the ifnet; node, hook and arg2
  * are not used.  IFF_DRV_OACTIVE is cleared once the queue is empty.
  */
 static void
 ng_eiface_start2(node_p node, hook_p hook, void *arg1, int arg2)
 {
 	struct ifnet *const ifn = arg1;
 	const priv_p sc = (priv_p)ifn->if_softc;
 	struct mbuf *pkt;
 	int rc = 0;
 
 	/* Nothing is sent unless the interface is both up and running. */
 	if ((ifn->if_flags & IFF_UP) == 0 ||
 	    (ifn->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	while (1) {
 		/* Take the next queued packet; stop when the queue is empty. */
 		IF_DEQUEUE(&ifn->if_snd, pkt);
 		if (pkt == NULL)
 			break;
 
 		/* Drop any tags still attached to the mbuf. */
 		m_tag_delete_chain(pkt, NULL);
 
 		/*
 		 * Hand a copy to bpf if somebody is listening.
 		 * XXX is this safe? locking?
 		 */
 		BPF_MTAP(ifn, pkt);
 
 		if ((ifn->if_flags & IFF_MONITOR) != 0) {
 			/* Monitor mode: count the packet and discard it. */
 			if_inc_counter(ifn, IFCOUNTER_IPACKETS, 1);
 			m_freem(pkt);
 		} else {
 			/*
 			 * Send it out.  The mbuf is freed for us when the
 			 * hook is not connected.
 			 */
 			NG_OUTBOUND_THREAD_REF();
 			NG_SEND_DATA_ONLY(rc, sc->ether, pkt);
 			NG_OUTBOUND_THREAD_UNREF();
 
 			/* Account for the outcome. */
 			if_inc_counter(ifn, rc == 0 ?
 			    IFCOUNTER_OPACKETS : IFCOUNTER_OERRORS, 1);
 		}
 	}
 
 	ifn->if_drv_flags &= ~IFF_DRV_OACTIVE;
 }
 
 /*
  * This routine is called to deliver a packet out the interface.
  * We simply queue the netgraph version to be called when netgraph locking
  * allows it to happen.
  * Until we know what the rest of the networking code is doing for
  * locking, we don't know how we will interact with it.
  * Take comfort from the fact that the ifnet struct is part of our
  * private info and can't go away while we are queued.
  * [Though we don't know it is still there now....]
  * it is possible we don't gain anything from this because
  * we would like to get the mbuf and queue it as data
  * somehow, but we can't and if we did would we solve anything?
  */
 static void
 ng_eiface_start(struct ifnet *ifp)
 {
 	const priv_p priv = (priv_p)ifp->if_softc;
 
 	/* Don't do anything if output is active */
 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE)
 		return;
 
 	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	if (ng_send_fn(priv->node, NULL, &ng_eiface_start2, ifp, 0) != 0)
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 }
 
 #ifdef DEBUG
 /*
  * Log an ioctl issued to the virtual interface: transfer direction,
  * group character, command number and parameter length.
  */
 
 static void
 ng_eiface_print_ioctl(struct ifnet *ifp, int command, caddr_t data)
 {
 	/* Kept unsigned long so that IOC_IN compares correctly. */
 	const u_long dir = command & IOC_DIRMASK;
 	const char *label;
 
 	if (dir == IOC_VOID)
 		label = "IO";
 	else if (dir == IOC_OUT)
 		label = "IOR";
 	else if (dir == IOC_IN)
 		label = "IOW";
 	else if (dir == IOC_INOUT)
 		label = "IORW";
 	else
 		label = "IO??";
 
 	log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n",
 	    ifp->if_xname, label, IOCGROUP(command), command & 0xff,
 	    IOCPARM_LEN(command));
 }
 #endif /* DEBUG */
 
 /*
  * ifmedia stuff
  */
 static int
 ng_eiface_mediachange(struct ifnet *ifp)
 {
 	const priv_p priv = (priv_p)ifp->if_softc;
 	struct ifmedia *ifm = &priv->media;
 
 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
 		return (EINVAL);
 	if (IFM_SUBTYPE(ifm->ifm_media) == IFM_AUTO)
 		ifp->if_baudrate = ifmedia_baudrate(IFM_ETHER | IFM_1000_T);
 	else
 		ifp->if_baudrate = ifmedia_baudrate(ifm->ifm_media);
 
 	return (0);
 }
 
 static void
 ng_eiface_mediastatus(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	const priv_p priv = (priv_p)ifp->if_softc;
 	struct ifmedia *ifm = &priv->media;
 
 	if (ifm->ifm_cur->ifm_media == (IFM_ETHER | IFM_AUTO) &&
 	    (priv->link_status & IFM_ACTIVE))
 		ifmr->ifm_active = IFM_ETHER | IFM_1000_T | IFM_FDX;
 	else
 		ifmr->ifm_active = ifm->ifm_cur->ifm_media;
 	ifmr->ifm_status = priv->link_status;
 
 	return;
 }
 
 /************************************************************************
 			NETGRAPH NODE STUFF
  ************************************************************************/
 
 /*
  * Constructor for a node.
  *
  * Allocates the private data and an Ethernet ifnet, obtains a unit
  * number, fills in the interface methods and the (fake) media list,
  * tries to name the node after the interface and finally attaches the
  * interface with a generated Ethernet address.
  *
  * Returns 0 on success, or ENOSPC when no ifnet or no unit number could
  * be obtained.  Nothing stays allocated on failure.
  */
 static int
 ng_eiface_constructor(node_p node)
 {
 	struct ifnet *ifp;
 	priv_p priv;
 	struct ether_addr eaddr;
 	int unit;
 
 	/* Allocate node and interface private structures */
 	priv = malloc(sizeof(*priv), M_NETGRAPH, M_WAITOK | M_ZERO);
 
 	ifp = priv->ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		free(priv, M_NETGRAPH);
 		return (ENOSPC);
 	}
 
 	/*
 	 * Get an interface unit number.  alloc_unr() returns -1 when the
 	 * unit space is exhausted; fail here instead of attaching an
 	 * interface whose unit could not be released by free_unr() later.
 	 */
 	unit = alloc_unr(V_ng_eiface_unit);
 	if (unit == -1) {
 		if_free(ifp);
 		free(priv, M_NETGRAPH);
 		return (ENOSPC);
 	}
 	priv->unit = unit;
 
 	/* Link them together */
 	ifp->if_softc = priv;
 
 	/* Link together node and private info */
 	NG_NODE_SET_PRIVATE(node, priv);
 	priv->node = node;
 
 	/* Initialize interface structure */
 	if_initname(ifp, NG_EIFACE_EIFACE_NAME, priv->unit);
 	ifp->if_init = ng_eiface_init;
 	ifp->if_output = ether_output;
 	ifp->if_start = ng_eiface_start;
 	ifp->if_ioctl = ng_eiface_ioctl;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_flags = (IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST);
 	ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU;
 	ifp->if_capenable = IFCAP_VLAN_MTU | IFCAP_JUMBO_MTU;
 
 	/* Offer a fixed list of (fake) media, with "auto" preselected. */
 	ifmedia_init(&priv->media, 0, ng_eiface_mediachange,
 	    ng_eiface_mediastatus);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_10_T, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_10_T | IFM_FDX, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_100_TX, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_100_TX | IFM_FDX, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_1000_T, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_10G_T | IFM_FDX, 0, NULL);
 	ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL);
 	ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO);
 	/* Status is valid but not active until a hook gets connected. */
 	priv->link_status = IFM_AVALID;
 
 	/* Give this node the same name as the interface (if possible) */
 	if (ng_name_node(node, ifp->if_xname) != 0)
 		log(LOG_WARNING, "%s: can't acquire netgraph name\n",
 		    ifp->if_xname);
 
 	/* Attach the interface */
 	ether_gen_addr(ifp, &eaddr);
 	ether_ifattach(ifp, eaddr.octet);
 	ifp->if_baudrate = ifmedia_baudrate(IFM_ETHER | IFM_1000_T);
 
 	/* Done */
 	return (0);
 }
 
 /*
  * Decide whether a new hook may be added.
  *
  * Only one hook, named NG_EIFACE_HOOK_ETHER, is supported: any other name
  * yields EPFNOSUPPORT and a second connection yields EISCONN.  Connecting
  * the hook marks the (fake) link active and reports link up.
  */
 static int
 ng_eiface_newhook(node_p node, hook_p hook, const char *name)
 {
 	priv_p const sc = NG_NODE_PRIVATE(node);
 	struct ifnet *const ifn = sc->ifp;
 
 	if (strcmp(name, NG_EIFACE_HOOK_ETHER) != 0)
 		return (EPFNOSUPPORT);
 	if (sc->ether != NULL)
 		return (EISCONN);
 
 	/* Remember the hook and set it up. */
 	sc->ether = hook;
 	NG_HOOK_SET_PRIVATE(hook, &sc->ether);
 	NG_HOOK_SET_TO_INBOUND(hook);
 
 	/* Report link up, in the vnet the interface lives in. */
 	sc->link_status |= IFM_ACTIVE;
 	CURVNET_SET_QUIET(ifn->if_vnet);
 	if_link_state_change(ifn, LINK_STATE_UP);
 	CURVNET_RESTORE();
 
 	return (0);
 }
 
 /*
  * Receive a control message
  *
  * Handles the node's own NGM_EIFACE_COOKIE commands and the generic
  * NGM_FLOW_COOKIE link up/down notifications.  Unknown cookies and unknown
  * eiface commands yield EINVAL.  A response, if one was built, is sent
  * back with NG_RESPOND_MSG() and the request message is freed.
  */
 static int
 ng_eiface_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ifnet *const ifp = priv->ifp;
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 	switch (msg->header.typecookie) {
 	case NGM_EIFACE_COOKIE:
 		switch (msg->header.cmd) {
 		/* Set the link-level address; payload is exactly one MAC. */
 		case NGM_EIFACE_SET:
 		    {
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			error = if_setlladdr(priv->ifp,
 			    (u_char *)msg->data, ETHER_ADDR_LEN);
 			break;
 		    }
 
 		/* Reply with the interface name in an IFNAMSIZ buffer. */
 		case NGM_EIFACE_GET_IFNAME:
 			NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			strlcpy(resp->data, ifp->if_xname, IFNAMSIZ);
 			break;
 
 		/* Reply with all interface addresses, packed back to back. */
 		case NGM_EIFACE_GET_IFADDRS:
 		    {
 			struct epoch_tracker et;
 			struct ifaddr *ifa;
 			caddr_t ptr;
 			int buflen;
 
 			/* Determine size of response and allocate it */
 			buflen = 0;
 			/* The address list is walked inside the net epoch. */
 			NET_EPOCH_ENTER(et);
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 				buflen += SA_SIZE(ifa->ifa_addr);
 			NG_MKRESPONSE(resp, msg, buflen, M_NOWAIT);
 			if (resp == NULL) {
 				NET_EPOCH_EXIT(et);
 				error = ENOMEM;
 				break;
 			}
 
 			/* Add addresses */
 			ptr = resp->data;
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				const int len = SA_SIZE(ifa->ifa_addr);
 
 				/*
 				 * Never write past the space sized by the
 				 * first pass; stop copying if the list no
 				 * longer fits.
 				 */
 				if (buflen < len) {
 					log(LOG_ERR, "%s: len changed?\n",
 					    ifp->if_xname);
 					break;
 				}
 				bcopy(ifa->ifa_addr, ptr, len);
 				ptr += len;
 				buflen -= len;
 			}
 			NET_EPOCH_EXIT(et);
 			break;
 		    }
 
 		default:
 			error = EINVAL;
 			break;
 		} /* end of inner switch() */
 		break;
 	case NGM_FLOW_COOKIE:
 		/*
 		 * Link notifications: mirror them into the fake link status
 		 * and the interface link state, in the interface's vnet.
 		 * Other flow commands are silently ignored.
 		 */
 		CURVNET_SET_QUIET(ifp->if_vnet);
 		switch (msg->header.cmd) {
 		case NGM_LINK_IS_UP:
 			priv->link_status |= IFM_ACTIVE;
 			if_link_state_change(ifp, LINK_STATE_UP);
 			break;
 		case NGM_LINK_IS_DOWN:
 			priv->link_status &= ~IFM_ACTIVE;
 			if_link_state_change(ifp, LINK_STATE_DOWN);
 			break;
 		default:
 			break;
 		}
 		CURVNET_RESTORE();
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive data from a hook and feed it to the interface input routine.
  *
  * Returns ENETDOWN (packet dropped) unless the interface is up and
  * running, EINVAL when the Ethernet header cannot be made contiguous,
  * and 0 once the packet has been handed to if_input.
  */
 static int
 ng_eiface_rcvdata(hook_p hook, item_p item)
 {
 	priv_p const sc = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	struct ifnet *const ifn = sc->ifp;
 	struct mbuf *pkt;
 
 	/* Only the mbuf is of interest; the item itself is released. */
 	NGI_GET_M(item, pkt);
 	NG_FREE_ITEM(item);
 
 	if ((ifn->if_flags & IFF_UP) == 0 ||
 	    (ifn->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		NG_FREE_M(pkt);
 		return (ENETDOWN);
 	}
 
 	/* The Ethernet header must sit in the first mbuf. */
 	if (pkt->m_len < ETHER_HDR_LEN &&
 	    (pkt = m_pullup(pkt, ETHER_HDR_LEN)) == NULL)
 		return (EINVAL);
 
 	/* Record the receiving interface and count the packet. */
 	pkt->m_pkthdr.rcvif = ifn;
 	if_inc_counter(ifn, IFCOUNTER_IPACKETS, 1);
 
 	(*ifn->if_input)(ifn, pkt);
 
 	return (0);
 }
 
 /*
  * Shutdown processing: detach and free the interface, give the unit
  * number back and release the private data.  Always returns 0.
  */
 static int
 ng_eiface_rmnode(node_p node)
 {
 	priv_p const sc = NG_NODE_PRIVATE(node);
 	struct ifnet *const ifn = sc->ifp;
 
 	/*
 	 * The ifnet may live in a different vnet than the netgraph node,
 	 * so the interface teardown runs in the ifnet's vnet context.
 	 */
 	CURVNET_SET_QUIET(ifn->if_vnet);
 	ether_ifdetach(ifn);
 	ifmedia_removeall(&sc->media);
 	if_free(ifn);
 	CURVNET_RESTORE();
 
 	/* Back in the original vnet context: release what is left. */
 	free_unr(V_ng_eiface_unit, sc->unit);
 	free(sc, M_NETGRAPH);
 	NG_NODE_SET_PRIVATE(node, NULL);
 	NG_NODE_UNREF(node);
 	return (0);
 }
 
 /*
  * Hook disconnection: forget the "ether" hook, mark the (fake) link as
  * inactive and report link down.  Always returns 0.
  */
 static int
 ng_eiface_disconnect(hook_p hook)
 {
 	priv_p const sc = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 
 	sc->ether = NULL;
 	sc->link_status &= ~IFM_ACTIVE;
 
 	/* The link state change is reported in the interface's vnet. */
 	CURVNET_SET_QUIET(sc->ifp->if_vnet);
 	if_link_state_change(sc->ifp, LINK_STATE_DOWN);
 	CURVNET_RESTORE();
 
 	return (0);
 }
 
 /*
  * Module event handler for this node type.
  * Load and unload need no work here; any other event is refused with
  * EOPNOTSUPP.
  */
 static int
 ng_eiface_mod_event(module_t mod, int event, void *data)
 {
 
 	if (event == MOD_LOAD || event == MOD_UNLOAD)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Per-vnet initialisation: create the unit number allocator used by
  * ng_eiface_constructor(), covering units 0 through 0xffff.
  */
 static void
 vnet_ng_eiface_init(const void *unused)
 {
 
 	V_ng_eiface_unit = new_unrhdr(0, 0xffff, NULL);
 }
 /* Run the initialiser for every vnet at SI_SUB_PSEUDO. */
 VNET_SYSINIT(vnet_ng_eiface_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ng_eiface_init, NULL);
 
 /*
  * Per-vnet teardown: destroy the unit number allocator created by
  * vnet_ng_eiface_init().
  */
 static void
 vnet_ng_eiface_uninit(const void *unused)
 {
 
 	delete_unrhdr(V_ng_eiface_unit);
 }
 /*
  * NOTE(review): registered at SI_SUB_INIT_IF while the initialiser uses
  * SI_SUB_PSEUDO; presumably deliberate, confirm before changing.
  */
 VNET_SYSUNINIT(vnet_ng_eiface_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
    vnet_ng_eiface_uninit, NULL);
diff --git a/sys/netgraph/ng_ether.c b/sys/netgraph/ng_ether.c
index 40e06604b8bb..b3f1e8762b3d 100644
--- a/sys/netgraph/ng_ether.c
+++ b/sys/netgraph/ng_ether.c
@@ -1,881 +1,882 @@
 
 /*
  * ng_ether.c
  */
 
 /*-
  * Copyright (c) 1996-2000 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Authors: Archie Cobbs <archie@freebsd.org>
  *	    Julian Elischer <julian@freebsd.org>
  *
  * $FreeBSD$
  */
 
 /*
  * ng_ether(4) netgraph node type
  */
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/syslog.h>
 #include <sys/socket.h>
 #include <sys/taskqueue.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_arp.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/ethernet.h>
 #include <net/if_bridgevar.h>
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_ether.h>
 
 MODULE_VERSION(ng_ether, 1);
 
 /* The netgraph node of an interface is stored in the ifnet's if_l2com. */
 #define IFP2NG(ifp)  ((ifp)->if_l2com)
 
 /*
  * Per-node private data.
  * Allocated in ng_ether_attach(), one per attached Ethernet interface.
  */
 struct private {
 	struct ifnet	*ifp;		/* associated interface */
 	hook_p		upper;		/* upper hook connection */
 	hook_p		lower;		/* lower hook connection */
 	hook_p		orphan;		/* orphan hook connection */
 	u_char		autoSrcAddr;	/* always overwrite source address */
 	u_char		promisc;	/* promiscuous mode enabled */
 	u_long		hwassist;	/* hardware checksum capabilities
 					 * (if_hwassist saved at attach) */
 	u_int		flags;		/* flags e.g. really die */
 };
 typedef struct private *priv_p;
 
 /*
  * Hook pointers used by if_ethersubr.c to callback to netgraph.
  * They are only declared here; the functions listed below are the
  * implementations this file provides for them.
  */
 extern	void	(*ng_ether_input_p)(struct ifnet *ifp, struct mbuf **mp);
 extern	void	(*ng_ether_input_orphan_p)(struct ifnet *ifp, struct mbuf *m);
 extern	int	(*ng_ether_output_p)(struct ifnet *ifp, struct mbuf **mp);
 extern	void	(*ng_ether_attach_p)(struct ifnet *ifp);
 extern	void	(*ng_ether_detach_p)(struct ifnet *ifp);
 extern	void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 
 /* Functional hooks called from if_ethersubr.c */
 static void	ng_ether_input(struct ifnet *ifp, struct mbuf **mp);
 static void	ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m);
 static int	ng_ether_output(struct ifnet *ifp, struct mbuf **mp);
 static void	ng_ether_attach(struct ifnet *ifp);
 static void	ng_ether_detach(struct ifnet *ifp); 
 static void	ng_ether_link_state(struct ifnet *ifp, int state); 
 
 /* Other functions: per-hook receive handlers set in ng_ether_newhook() */
 static int	ng_ether_rcv_lower(hook_p node, item_p item);
 static int	ng_ether_rcv_upper(hook_p node, item_p item);
 
 /* Netgraph node methods */
 static ng_constructor_t	ng_ether_constructor;
 static ng_rcvmsg_t	ng_ether_rcvmsg;
 static ng_shutdown_t	ng_ether_shutdown;
 static ng_newhook_t	ng_ether_newhook;
 static ng_rcvdata_t	ng_ether_rcvdata;
 static ng_disconnect_t	ng_ether_disconnect;
 static int		ng_ether_mod_event(module_t mod, int event, void *data);
 
 /* Registration tag of the interface arrival event handler */
 static eventhandler_tag	ng_ether_ifnet_arrival_cookie;
 
 /*
  * List of commands and how to convert arguments to/from ASCII.
  * Each entry gives: type cookie, command, ASCII command name, parse type
  * of the argument and parse type of the response (NULL when absent).
  * The "get" commands only have a response, the "set"/"add"/"del"
  * commands only have an argument, and "detach" has neither.
  */
 static const struct ng_cmdlist ng_ether_cmdlist[] = {
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_IFNAME,
 	  "getifname",
 	  NULL,
 	  &ng_parse_string_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_IFINDEX,
 	  "getifindex",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_ENADDR,
 	  "getenaddr",
 	  NULL,
 	  &ng_parse_enaddr_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_SET_ENADDR,
 	  "setenaddr",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_PROMISC,
 	  "getpromisc",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_SET_PROMISC,
 	  "setpromisc",
 	  &ng_parse_int32_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_GET_AUTOSRC,
 	  "getautosrc",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_SET_AUTOSRC,
 	  "setautosrc",
 	  &ng_parse_int32_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_ADD_MULTI,
 	  "addmulti",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_DEL_MULTI,
 	  "delmulti",
 	  &ng_parse_enaddr_type,
 	  NULL
 	},
 	{
 	  NGM_ETHER_COOKIE,
 	  NGM_ETHER_DETACH,
 	  "detach",
 	  NULL,
 	  NULL
 	},
 	/* End-of-list marker. */
 	{ 0 }
 };
 
 /*
  * Node type descriptor: binds the node methods and the command list to
  * the "ether" node type.  Nodes of this type are created directly from
  * ng_ether_attach() via ng_make_node_common().
  */
 static struct ng_type ng_ether_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_ETHER_NODE_TYPE,
 	.mod_event =	ng_ether_mod_event,
 	.constructor =	ng_ether_constructor,
 	.rcvmsg =	ng_ether_rcvmsg,
 	.shutdown =	ng_ether_shutdown,
 	.newhook =	ng_ether_newhook,
 	.rcvdata =	ng_ether_rcvdata,
 	.disconnect =	ng_ether_disconnect,
 	.cmdlist =	ng_ether_cmdlist,
 };
 NETGRAPH_INIT(ether, &ng_ether_typestruct);
 
 /******************************************************************
 		    UTILITY FUNCTIONS
 ******************************************************************/
 /*
  * Derive a netgraph node name from an interface name.
  *
  * Copies ifname into name, replacing every '.' and ':' with '_'.
  * name must have room for IFNAMSIZ bytes.  The result is always
  * NUL-terminated: at most IFNAMSIZ - 1 characters are copied, so an input
  * without a terminator in its first IFNAMSIZ bytes is truncated instead
  * of leaving name unterminated.
  */
 static void
 ng_ether_sanitize_ifname(const char *ifname, char *name)
 {
 	int i;
 
 	for (i = 0; i < IFNAMSIZ - 1 && ifname[i] != '\0'; i++) {
 		if (ifname[i] == '.' || ifname[i] == ':')
 			name[i] = '_';
 		else
 			name[i] = ifname[i];
 	}
 	name[i] = '\0';
 }
 
 /******************************************************************
 		    ETHERNET FUNCTION HOOKS
 ******************************************************************/
 
 /*
  * Handle a packet that has come in on an interface. We get to
  * look at it here before any upper layer protocols do.
  */
 static void
 ng_ether_input(struct ifnet *ifp, struct mbuf **mp)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error;
 
 	/* If "lower" hook not connected, let packet continue */
 	if (priv->lower == NULL)
 		return;
 	NG_SEND_DATA_ONLY(error, priv->lower, *mp);	/* sets *mp = NULL */
 }
 
 /*
  * Handle a packet that has come in on an interface, and which
  * does not match any of our known protocols (an ``orphan'').
  */
 static void
 ng_ether_input_orphan(struct ifnet *ifp, struct mbuf *m)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error;
 
 	/* If "orphan" hook not connected, discard packet */
 	if (priv->orphan == NULL) {
 		m_freem(m);
 		return;
 	}
 	NG_SEND_DATA_ONLY(error, priv->orphan, m);
 }
 
 /*
  * Handle a packet that is going out on an interface.
  * The Ethernet header is already attached to the mbuf.
  */
 static int
 ng_ether_output(struct ifnet *ifp, struct mbuf **mp)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error = 0;
 
 	/* If "upper" hook not connected, let packet continue */
 	if (priv->upper == NULL)
 		return (0);
 
 	/* Send it out "upper" hook */
 	NG_OUTBOUND_THREAD_REF();
 	NG_SEND_DATA_ONLY(error, priv->upper, *mp);
 	NG_OUTBOUND_THREAD_UNREF();
 	return (error);
 }
 
 /*
  * Attach hook: a new Ethernet interface has appeared.
  * Create a node for it, tie node and interface together and try to name
  * the node after the interface.  Failures are logged; the interface then
  * simply has no companion node.
  */
 static void
 ng_ether_attach(struct ifnet *ifp)
 {
 	char nodename[IFNAMSIZ];
 	node_p nd;
 	priv_p sc;
 
 	/*
 	 * If a netgraph node with the same name already exists, do not
 	 * create an ether node for this ifnet.  This should keep ether
 	 * nodes from being attached to eiface nodes, which may be
 	 * problematic because of the name clash.
 	 */
 	ng_ether_sanitize_ifname(ifp->if_xname, nodename);
 	nd = ng_name2noderef(NULL, nodename);
 	if (nd != NULL) {
 		NG_NODE_UNREF(nd);
 		return;
 	}
 
 	/* Make the node. */
 	KASSERT(!IFP2NG(ifp), ("%s: node already exists?", __func__));
 	if (ng_make_node_common(&ng_ether_typestruct, &nd) != 0) {
 		log(LOG_ERR, "%s: can't %s for %s\n",
 		    __func__, "create node", ifp->if_xname);
 		return;
 	}
 
 	/* Get zeroed private data; give the node back if that fails. */
 	sc = malloc(sizeof(*sc), M_NETGRAPH, M_NOWAIT | M_ZERO);
 	if (sc == NULL) {
 		log(LOG_ERR, "%s: can't %s for %s\n",
 		    __func__, "allocate memory", ifp->if_xname);
 		NG_NODE_UNREF(nd);
 		return;
 	}
 
 	/* Cross-link node, private data and interface. */
 	NG_NODE_SET_PRIVATE(nd, sc);
 	sc->ifp = ifp;
 	sc->hwassist = ifp->if_hwassist;
 	IFP2NG(ifp) = nd;
 
 	/* The node should carry the (sanitized) interface name. */
 	if (ng_name_node(nd, nodename) != 0)
 		log(LOG_WARNING, "%s: can't name node %s\n", __func__, nodename);
 }
 
 /*
  * An Ethernet interface is being detached.
  * REALLY Destroy its node.
  */
 static void
 ng_ether_detach(struct ifnet *ifp)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 	NG_NODE_REALLY_DIE(node);	/* Force real removal of node */
 	/*
 	 * We can't assume the ifnet is still around when we run shutdown
 	 * So zap it now. XXX We HOPE that anything running at this time
 	 * handles it (as it should in the non netgraph case).
 	 */
 	IFP2NG(ifp) = NULL;
 	priv->ifp = NULL;	/* XXX race if interrupted an output packet */
 	ng_rmnode_self(node);		/* remove all netgraph parts */
 }
 
 /*
  * Notify graph about link event.
  * if_link_state_change() has already checked that the state has changed.
  */
 static void
 ng_ether_link_state(struct ifnet *ifp, int state)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *msg;
 	int cmd, dummy_error = 0;
 
 	if (state == LINK_STATE_UP)
 		cmd = NGM_LINK_IS_UP;
 	else if (state == LINK_STATE_DOWN)
 		cmd = NGM_LINK_IS_DOWN;
 	else
 		return;
 
 	if (priv->lower != NULL) {
 		NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT);
 		if (msg != NULL)
 			NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->lower, 0);
 	}
 	if (priv->orphan != NULL) {
 		NG_MKMESSAGE(msg, NGM_FLOW_COOKIE, cmd, 0, M_NOWAIT);
 		if (msg != NULL)
 			NG_SEND_MSG_HOOK(dummy_error, node, msg, priv->orphan, 0);
 	}
 }
 
 /*
  * Interface arrival notification handler.
  * The notification is raised in two cases:
  *  o a new interface arrives
  *  o an existing interface got renamed
  * The first case is currently handled by ng_ether_attach via the special
  * hook ng_ether_attach_p, so only the rename case does any work here.
  */
 static void
 ng_ether_ifnet_arrival_event(void *arg __unused, struct ifnet *ifp)
 {
 	char nodename[IFNAMSIZ];
 	node_p nd;
 
 	/* Only Ethernet-like interface types are of interest. */
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 		break;
 	default:
 		return;
 	}
 
 	/* A new interface without an ng_ether companion: nothing to do. */
 	nd = IFP2NG(ifp);
 	if (nd == NULL)
 		return;
 
 	/* Try to rename the node after the interface's new name. */
 	ng_ether_sanitize_ifname(ifp->if_xname, nodename);
 	if (ng_name_node(nd, nodename) != 0)
 		log(LOG_WARNING, "%s: can't re-name node %s\n", __func__, nodename);
 }
 
 /******************************************************************
 		    NETGRAPH NODE METHODS
 ******************************************************************/
 
 /*
  * It is not possible or allowable to create a node of this type.
  * Nodes get created when the interface is attached (or, when
  * this node type's KLD is loaded).
  *
  * Always returns EINVAL; the node argument is not used.
  */
 static int
 ng_ether_constructor(node_p node)
 {
 	return (EINVAL);
 }
 
 /*
  * Check whether a new hook may be attached.
  *
  * Accepted names are "upper", "lower" and "orphan", with "divert" taken
  * as an alias for "lower".  Returns EINVAL for any other name, EISCONN
  * when the requested hook is already connected and 0 on success.
  */
 static int
 ng_ether_newhook(node_p node, hook_p hook, const char *name)
 {
 	priv_p const sc = NG_NODE_PRIVATE(node);
 	hook_p *slot;
 	int is_upper;
 
 	/* Map the alias onto the real hook name. */
 	if (strcmp(name, NG_ETHER_HOOK_DIVERT) == 0)
 		name = NG_ETHER_HOOK_LOWER;
 
 	/* Pick the slot and the receive handler that go with the name. */
 	is_upper = (strcmp(name, NG_ETHER_HOOK_UPPER) == 0);
 	if (is_upper) {
 		slot = &sc->upper;
 		NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_upper);
 		NG_HOOK_SET_TO_INBOUND(hook);
 	} else {
 		if (strcmp(name, NG_ETHER_HOOK_LOWER) == 0)
 			slot = &sc->lower;
 		else if (strcmp(name, NG_ETHER_HOOK_ORPHAN) == 0)
 			slot = &sc->orphan;
 		else
 			return (EINVAL);
 		/* "lower" and "orphan" share one receive handler. */
 		NG_HOOK_SET_RCVDATA(hook, ng_ether_rcv_lower);
 	}
 
 	/* Should not happen, but refuse a second connection anyway. */
 	if (*slot != NULL)
 		return (EISCONN);
 
 	/* Hardware checksums stay off while "upper" is connected. */
 	if (is_upper)
 		sc->ifp->if_hwassist = 0;
 	NG_HOOK_HI_STACK(hook);
 
 	/* Accept the hook. */
 	*slot = hook;
 	return (0);
 }
 
 /*
  * Receive an incoming control message.
  *
  * The message is taken out of the item and dispatched on its type cookie
  * and command.  Only NGM_ETHER_COOKIE commands are handled; an unknown
  * cookie or command yields EINVAL.  The GET commands build a response
  * with NG_MKRESPONSE() and yield ENOMEM when that leaves resp NULL.  The
  * SET, ADD_MULTI and DEL_MULTI commands check header.arglen against the
  * expected argument size and yield EINVAL on a mismatch.
  *
  * Every path ends with NG_RESPOND_MSG() and NG_FREE_MSG(); the function
  * returns the resulting error value (0 when nothing failed).
  */
 static int
 ng_ether_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 	switch (msg->header.typecookie) {
 	case NGM_ETHER_COOKIE:
 		switch (msg->header.cmd) {
 		case NGM_ETHER_GET_IFNAME:
 			/* Reply with the interface name (if_xname) */
 			NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			strlcpy(resp->data, priv->ifp->if_xname, IFNAMSIZ);
 			break;
 		case NGM_ETHER_GET_IFINDEX:
 			/* Reply with the interface index as a 32-bit value */
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((u_int32_t *)resp->data) = priv->ifp->if_index;
 			break;
 		case NGM_ETHER_GET_ENADDR:
 			/* Reply with the interface's link-level address */
 			NG_MKRESPONSE(resp, msg, ETHER_ADDR_LEN, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			bcopy(IF_LLADDR(priv->ifp),
 			    resp->data, ETHER_ADDR_LEN);
 			break;
 		case NGM_ETHER_SET_ENADDR:
 		    {
 			/* Argument must be exactly one Ethernet address */
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			error = if_setlladdr(priv->ifp,
 			    (u_char *)msg->data, ETHER_ADDR_LEN);
 			break;
 		    }
 		case NGM_ETHER_GET_PROMISC:
 			/* Reply with the node's recorded promiscuous flag */
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((u_int32_t *)resp->data) = priv->promisc;
 			break;
 		case NGM_ETHER_SET_PROMISC:
 		    {
 			u_char want;
 
 			if (msg->header.arglen != sizeof(u_int32_t)) {
 				error = EINVAL;
 				break;
 			}
 			/* Normalize the argument to 0 or 1 */
 			want = !!*((u_int32_t *)msg->data);
 			/* Only call ifpromisc() when the state changes */
 			if (want ^ priv->promisc) {
 				if ((error = ifpromisc(priv->ifp, want)) != 0)
 					break;
 				priv->promisc = want;
 			}
 			break;
 		    }
 		case NGM_ETHER_GET_AUTOSRC:
 			/* Reply with the source-address override flag */
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((u_int32_t *)resp->data) = priv->autoSrcAddr;
 			break;
 		case NGM_ETHER_SET_AUTOSRC:
 			if (msg->header.arglen != sizeof(u_int32_t)) {
 				error = EINVAL;
 				break;
 			}
 			/* Store the argument normalized to 0 or 1 */
 			priv->autoSrcAddr = !!*((u_int32_t *)msg->data);
 			break;
 		case NGM_ETHER_ADD_MULTI:
 		    {
 			struct sockaddr_dl sa_dl;
 			struct epoch_tracker et;
 			struct ifmultiaddr *ifma;
 
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			/* Wrap the raw address in an AF_LINK sockaddr */
 			bzero(&sa_dl, sizeof(struct sockaddr_dl));
 			sa_dl.sdl_len = sizeof(struct sockaddr_dl);
 			sa_dl.sdl_family = AF_LINK;
 			sa_dl.sdl_alen = ETHER_ADDR_LEN;
 			bcopy((void *)msg->data, LLADDR(&sa_dl),
 			    ETHER_ADDR_LEN);
 			/*
 			 * Netgraph is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			NET_EPOCH_ENTER(et);
 			ifma = if_findmulti(priv->ifp,
 			    (struct sockaddr *)&sa_dl);
 			NET_EPOCH_EXIT(et);
 			/* An existing membership is reported as EADDRINUSE */
 			if (ifma != NULL) {
 				error = EADDRINUSE;
 			} else {
 				error = if_addmulti(priv->ifp,
 				    (struct sockaddr *)&sa_dl, &ifma);
 			}
 			break;
 		    }
 		case NGM_ETHER_DEL_MULTI:
 		    {
 			struct sockaddr_dl sa_dl;
 
 			if (msg->header.arglen != ETHER_ADDR_LEN) {
 				error = EINVAL;
 				break;
 			}
 			/* Wrap the raw address in an AF_LINK sockaddr */
 			bzero(&sa_dl, sizeof(struct sockaddr_dl));
 			sa_dl.sdl_len = sizeof(struct sockaddr_dl);
 			sa_dl.sdl_family = AF_LINK;
 			sa_dl.sdl_alen = ETHER_ADDR_LEN;
 			bcopy((void *)msg->data, LLADDR(&sa_dl),
 			    ETHER_ADDR_LEN);
 			error = if_delmulti(priv->ifp,
 			    (struct sockaddr *)&sa_dl);
 			break;
 		    }
 		case NGM_ETHER_DETACH:
 			/* Run the detach routine for this node's interface */
 			ng_ether_detach(priv->ifp);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive data on a hook.
  * Since we use per-hook receive methods this should never be called;
  * if it is, the item is freed and the kernel panics.
  */
 static int
 ng_ether_rcvdata(hook_p hook, item_p item)
 {
 	NG_FREE_ITEM(item);
 
 	panic("%s: weird hook", __func__);
 }
 
 /*
  * Receive method for the "lower" and "orphan" hooks.
  *
  * Takes the mbuf out of the item and hands it to ether_output_frame().
  * Returns ENETDOWN when the interface is not both up and running, EINVAL
  * when the packet is shorter than an Ethernet header, ENOBUFS when
  * m_pullup() fails, and otherwise whatever ether_output_frame() returns.
  */
 static int
 ng_ether_rcv_lower(hook_p hook, item_p item)
 {
 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	struct ifnet *const ifp = priv->ifp;
 	struct ether_header *eh;
 	struct mbuf *m;
 
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 
 	/* The interface must be both up and running */
 	if ((ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		NG_FREE_M(m);
 		return (ENETDOWN);
 	}
 
 	/* The packet must hold at least a full Ethernet header ... */
 	if (m->m_pkthdr.len < sizeof(struct ether_header)) {
 		NG_FREE_M(m);
 		return (EINVAL);
 	}
 	/* ... and that header must sit in the first mbuf */
 	if (m->m_len < sizeof(struct ether_header)) {
 		m = m_pullup(m, sizeof(struct ether_header));
 		if (m == NULL)
 			return (ENOBUFS);
 	}
 
 	/* Optionally stamp the interface's address as the source MAC */
 	if (priv->autoSrcAddr) {
 		/* Make the mbuf writable if it's not already */
 		if (!M_WRITABLE(m)) {
 			m = m_pullup(m, sizeof(struct ether_header));
 			if (m == NULL)
 				return (ENOBUFS);
 		}
 		eh = mtod(m, struct ether_header *);
 		bcopy(IF_LLADDR(ifp), eh->ether_shost, ETHER_ADDR_LEN);
 	}
 
 	/* Hand the frame to the interface for transmission */
 	return (ether_output_frame(ifp, m));
 }
 
 /*
  * Receive method for the "upper" hook.
  *
  * Takes the mbuf out of the item, marks it as received on this node's
  * interface, offers it to the bridge when one is set on the interface,
  * and then passes it to ether_demux().  Returns EINVAL for a packet
  * shorter than an Ethernet header, ENOBUFS when m_pullup() fails, and
  * 0 otherwise.
  */
 static int
 ng_ether_rcv_upper(hook_p hook, item_p item)
 {
 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	struct ifnet *ifp = priv->ifp;
 	struct mbuf *m;
 
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 
 	/* Need a complete Ethernet header, contiguous in the first mbuf */
 	if (m->m_pkthdr.len < sizeof(struct ether_header)) {
 		NG_FREE_M(m);
 		return (EINVAL);
 	}
 	if (m->m_len < sizeof(struct ether_header)) {
 		m = m_pullup(m, sizeof(struct ether_header));
 		if (m == NULL)
 			return (ENOBUFS);
 	}
 
 	m->m_pkthdr.rcvif = ifp;
 
 	/* Pass the packet to the bridge, it may come back to us */
 	if (ifp->if_bridge != NULL) {
 		BRIDGE_INPUT(ifp, m);
 		if (m == NULL)
 			return (0);
 	}
 
 	/* Feed the packet back into the Ethernet input path */
 	ether_demux(ifp, m);
 	return (0);
 }
 
 /*
  * Shutdown method.
  *
  * Without the NGF_REALLY_DIE flag the node is only reset: promiscuous
  * mode is switched off if this node had enabled it, and the node is
  * revived.  With the flag set the private data is released and the
  * node reference is dropped.  Always returns 0.
  */
 static int
 ng_ether_shutdown(node_p node)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if ((node->nd_flags & NGF_REALLY_DIE) == 0) {
 		/* Plain reset: undo promiscuous mode and stay around */
 		if (priv->promisc) {
 			(void)ifpromisc(priv->ifp, 0);
 			priv->promisc = 0;
 		}
 		/* Signal ng_rmnode we are persistent */
 		NG_NODE_REVIVE(node);
 
 		return (0);
 	}
 
 	/*
 	 * The ifnet is going away, perhaps because the driver was
 	 * unloaded or its vnet is being torn down.
 	 */
 	NG_NODE_SET_PRIVATE(node, NULL);
 	if (priv->ifp != NULL)
 		IFP2NG(priv->ifp) = NULL;
 	free(priv, M_NETGRAPH);
 	NG_NODE_UNREF(node);	/* free node itself */
 	return (0);
 }
 
 /*
  * Hook disconnection method.
  *
  * Clears the private slot that held the hook (panicking on a hook that
  * matches none of them) and, once the last hook is gone from a still
  * valid node, asks for the node to be reset.  Always returns 0.
  */
 static int
 ng_ether_disconnect(hook_p hook)
 {
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if (hook == priv->upper) {
 		priv->upper = NULL;
 		/* restore h/w csum */
 		if (priv->ifp != NULL)
 			priv->ifp->if_hwassist = priv->hwassist;
 	} else if (hook == priv->lower) {
 		priv->lower = NULL;
 	} else if (hook == priv->orphan) {
 		priv->orphan = NULL;
 	} else {
 		panic("%s: weird hook", __func__);
 	}
 
 	/* reset node */
 	if (NG_NODE_NUMHOOKS(node) == 0 && NG_NODE_IS_VALID(node))
 		ng_rmnode_self(node);
 	return (0);
 }
 
 /******************************************************************
 		    	INITIALIZATION
 ******************************************************************/
 
 /*
  * Module event handler for this node type.
  *
  * MOD_LOAD installs the ng_ether function hooks and registers the ifnet
  * arrival event handler; it returns EEXIST when the attach hook is
  * already set.  MOD_UNLOAD deregisters the handler and clears the hooks.
  * Any other event returns EOPNOTSUPP.
  */
 static int
 ng_ether_mod_event(module_t mod, int event, void *data)
 {
 	switch (event) {
 	case MOD_LOAD:
 		/* Somebody else already owns the function hooks */
 		if (ng_ether_attach_p != NULL)
 			return (EEXIST);
 
 		/* Register function hooks */
 		ng_ether_attach_p = ng_ether_attach;
 		ng_ether_detach_p = ng_ether_detach;
 		ng_ether_output_p = ng_ether_output;
 		ng_ether_input_p = ng_ether_input;
 		ng_ether_input_orphan_p = ng_ether_input_orphan;
 		ng_ether_link_state_p = ng_ether_link_state;
 
 		ng_ether_ifnet_arrival_cookie =
 		    EVENTHANDLER_REGISTER(ifnet_arrival_event,
 		    ng_ether_ifnet_arrival_event, NULL, EVENTHANDLER_PRI_ANY);
 		return (0);
 
 	case MOD_UNLOAD:
 		/*
 		 * Note that the base code won't try to unload us until
 		 * all nodes have been removed, and that can't happen
 		 * until all Ethernet interfaces are removed. In any
 		 * case, we know there are no nodes left if the action
 		 * is MOD_UNLOAD, so there's no need to detach any nodes.
 		 */
 		EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
 		    ng_ether_ifnet_arrival_cookie);
 
 		/* Unregister function hooks */
 		ng_ether_attach_p = NULL;
 		ng_ether_detach_p = NULL;
 		ng_ether_output_p = NULL;
 		ng_ether_input_p = NULL;
 		ng_ether_input_orphan_p = NULL;
 		ng_ether_link_state_p = NULL;
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Per-vnet initialization: walk the vnet's interface list under the
  * ifnet read lock and attach a node to every interface of type
  * IFT_ETHER, IFT_L2VLAN or IFT_BRIDGE.  Does nothing when this module's
  * attach hook is not the installed one.
  */
 static void
 vnet_ng_ether_init(const void *unused)
 {
 	struct ifnet *ifp;
 
 	/* If module load was rejected, don't attach to vnets. */
 	if (ng_ether_attach_p != ng_ether_attach)
 		return;
 
 	/* Create nodes for any already-existing Ethernet interfaces. */
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 		case IFT_L2VLAN:
 		case IFT_BRIDGE:
 			ng_ether_attach(ifp);
 			break;
 		default:
 			break;
 		}
 	}
 	IFNET_RUNLOCK();
 }
 VNET_SYSINIT(vnet_ng_ether_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ng_ether_init, NULL);
diff --git a/sys/netgraph/ng_gif.c b/sys/netgraph/ng_gif.c
index d4cb8922d1d6..aa2e57384b1e 100644
--- a/sys/netgraph/ng_gif.c
+++ b/sys/netgraph/ng_gif.c
@@ -1,595 +1,596 @@
 /*
  * ng_gif.c
  */
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause AND BSD-2-Clause
  *
  * Copyright 2001 The Aerospace Corporation.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions, and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of The Aerospace Corporation may not be used to endorse or
  *    promote products derived from this software.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AEROSPACE CORPORATION ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AEROSPACE CORPORATION BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *
  * Copyright (c) 1996-2000 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * ng_gif(4) netgraph node type
  */
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_gif.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_gif.h>
 
 /*
  * Accessors for the netgraph node associated with a gif interface.
  * The node pointer is kept in the gif_netgraph field of the
  * struct gif_softc reached through ifp->if_softc: IFP2NG() reads it
  * (cast to struct ng_node *), IFP2NG_SET() stores a new value.
  */
 #define IFP2NG(ifp)  ((struct ng_node *)((struct gif_softc *)(ifp->if_softc))->gif_netgraph)
 #define IFP2NG_SET(ifp, val)  (((struct gif_softc *)(ifp->if_softc))->gif_netgraph = (val))
 
 /*
  * Per-node private data.
  *
  * A node has a single hook slot: the "lower" and "orphan" hook names
  * both land in `lower', and `lowerOrphan' records which of the two the
  * connected hook was created as.
  */
 struct private {
 	struct ifnet	*ifp;		/* associated interface */
 	hook_p		lower;		/* lower OR orphan hook connection */
 	u_char		lowerOrphan;	/* whether lower is lower or orphan */
 };
 typedef struct private *priv_p;
 
 /*
  * Functional hooks called from if_gif.c.
  * They are installed into the ng_gif_*_p function pointers by
  * ng_gif_mod_event() on MOD_LOAD and cleared again on MOD_UNLOAD.
  */
 static void	ng_gif_input(struct ifnet *ifp, struct mbuf **mp, int af);
 static void	ng_gif_input_orphan(struct ifnet *ifp, struct mbuf *m, int af);
 static void	ng_gif_attach(struct ifnet *ifp);
 static void	ng_gif_detach(struct ifnet *ifp); 
 
 /* Other functions (internal helpers used by the hooks and methods) */
 static void	ng_gif_input2(node_p node, struct mbuf **mp, int af);
 static int	ng_gif_glue_af(struct mbuf **mp, int af);
 static int	ng_gif_rcv_lower(node_p node, struct mbuf *m);
 
 /* Netgraph node methods (referenced from ng_gif_typestruct) */
 static ng_constructor_t	ng_gif_constructor;
 static ng_rcvmsg_t	ng_gif_rcvmsg;
 static ng_shutdown_t	ng_gif_shutdown;
 static ng_newhook_t	ng_gif_newhook;
 static ng_connect_t	ng_gif_connect;
 static ng_rcvdata_t	ng_gif_rcvdata;
 static ng_disconnect_t	ng_gif_disconnect;
 static int		ng_gif_mod_event(module_t mod, int event, void *data);
 
 /*
  * List of commands and how to convert arguments to/from ASCII.
  *
  * Two commands are listed, both under NGM_GIF_COOKIE: "getifname"
  * (no argument type, string response type) and "getifindex" (no
  * argument type, int32 response type).  The list ends with a zeroed
  * entry.
  */
 static const struct ng_cmdlist ng_gif_cmdlist[] = {
 	{
 	  NGM_GIF_COOKIE,
 	  NGM_GIF_GET_IFNAME,
 	  "getifname",
 	  NULL,
 	  &ng_parse_string_type
 	},
 	{
 	  NGM_GIF_COOKIE,
 	  NGM_GIF_GET_IFINDEX,
 	  "getifindex",
 	  NULL,
 	  &ng_parse_int32_type
 	},
 	{ 0 }
 };
 
 /*
  * Node type descriptor: binds the node type name to the module event
  * handler, the node methods and the command list defined in this file.
  * It is passed to NETGRAPH_INIT() below; MODULE_DEPEND() declares the
  * dependency of ng_gif on if_gif.
  */
 static struct ng_type ng_gif_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_GIF_NODE_TYPE,
 	.mod_event =	ng_gif_mod_event,
 	.constructor =	ng_gif_constructor,
 	.rcvmsg =	ng_gif_rcvmsg,
 	.shutdown =	ng_gif_shutdown,
 	.newhook =	ng_gif_newhook,
 	.connect =	ng_gif_connect,
 	.rcvdata =	ng_gif_rcvdata,
 	.disconnect =	ng_gif_disconnect,
 	.cmdlist =	ng_gif_cmdlist,
 };
 MODULE_DEPEND(ng_gif, if_gif, 1,1,1);
 NETGRAPH_INIT(gif, &ng_gif_typestruct);
 
 /******************************************************************
 		       GIF FUNCTION HOOKS
 ******************************************************************/
 
 /*
  * Input hook: a packet has arrived on the interface and we get to look
  * at it before any upper layer protocol does.  The packet is diverted
  * into netgraph only when a hook is connected and it was created as
  * "lower" (not "orphan"); otherwise *mp is left untouched so the packet
  * continues on its normal path.
  */
 static void
 ng_gif_input(struct ifnet *ifp, struct mbuf **mp, int af)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if (priv->lower != NULL && !priv->lowerOrphan)
 		ng_gif_input2(node, mp, af);
 }
 
 /*
  * Orphan input hook: a packet has arrived that matches none of the
  * known protocols (an ``orphan'').  It is sent into netgraph when the
  * connected hook was created as "orphan"; in every other case, and
  * whenever the mbuf is still ours afterwards, it is freed here.
  */
 static void
 ng_gif_input_orphan(struct ifnet *ifp, struct mbuf *m, int af)
 {
 	const node_p node = IFP2NG(ifp);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	const int orphan_hooked = (priv->lower != NULL && priv->lowerOrphan);
 
 	/* No "orphan" hook connected: nobody wants the packet */
 	if (!orphan_hooked) {
 		m_freem(m);
 		return;
 	}
 
 	ng_gif_input2(node, &m, af);
 	if (m != NULL)
 		m_freem(m);
 }
 
 /*
  * Common input path for packets that came in on a gif interface.
  * The address family is prepended to the mbuf for later use and the
  * result is sent out the lower/orphan hook.  When prepending fails the
  * function returns without sending; otherwise *mp is NULL on return.
  */
 static void
 ng_gif_input2(node_p node, struct mbuf **mp, int af)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int error;
 
 	/* Glue address family on */
 	error = ng_gif_glue_af(mp, af);
 	if (error != 0)
 		return;
 
 	/* Send out lower/orphan hook; the mbuf is no longer the caller's */
 	NG_SEND_DATA_ONLY(error, priv->lower, *mp);
 	*mp = NULL;
 }
 
 /*
  * Attach hook: a new gif interface has appeared.
  * Creates a node for it, allocates the node's private data, links node
  * and interface together and tries to name the node after the
  * interface.  Failures are logged; the function returns nothing.
  */
 static void
 ng_gif_attach(struct ifnet *ifp)
 {
 	node_p newnode;
 	priv_p state;
 
 	/* Create node */
 	KASSERT(IFP2NG(ifp) == NULL, ("%s: node already exists?", __func__));
 	if (ng_make_node_common(&ng_gif_typestruct, &newnode) != 0) {
 		log(LOG_ERR, "%s: can't %s for %s\n",
 		    __func__, "create node", ifp->if_xname);
 		return;
 	}
 
 	/* Allocate private data; give the node back if that fails */
 	state = malloc(sizeof(*state), M_NETGRAPH, M_NOWAIT | M_ZERO);
 	if (state == NULL) {
 		log(LOG_ERR, "%s: can't %s for %s\n",
 		    __func__, "allocate memory", ifp->if_xname);
 		NG_NODE_UNREF(newnode);
 		return;
 	}
 
 	/* Cross-link interface, node and private data */
 	state->ifp = ifp;
 	NG_NODE_SET_PRIVATE(newnode, state);
 	IFP2NG_SET(ifp, newnode);
 
 	/* Try to give the node the same name as the interface */
 	if (ng_name_node(newnode, ifp->if_xname) == 0)
 		return;
 	log(LOG_WARNING, "%s: can't name node %s\n",
 	    __func__, ifp->if_xname);
 }
 
 /*
  * An interface is being detached.
  * REALLY Destroy its node.
  *
  * Does nothing when the interface has no node.  Otherwise the node is
  * flagged for real removal, the links between interface and node are
  * cleared on both sides, and removal of the node is requested with
  * ng_rmnode_self().
  */
 static void
 ng_gif_detach(struct ifnet *ifp)
 {
 	const node_p node = IFP2NG(ifp);
 	priv_p priv;
 
 	if (node == NULL)		/* no node (why not?), ignore */
 		return;
 	priv = NG_NODE_PRIVATE(node);
 	NG_NODE_REALLY_DIE(node);	/* Force real removal of node */
 	/*
 	 * We can't assume the ifnet is still around when we run shutdown
 	 * So zap it now. XXX We HOPE that anything running at this time
 	 * handles it (as it should in the non netgraph case).
 	 */
 	IFP2NG_SET(ifp, NULL);
 	priv->ifp = NULL;	/* XXX race if interrupted an output packet */
 	ng_rmnode_self(node);		/* remove all netgraph parts */
 }
 
 /*
  * Optimization for gluing the address family onto
  * the front of an incoming packet.
  */
 static int
 ng_gif_glue_af(struct mbuf **mp, int af)
 {
 	struct mbuf *m = *mp;
 	int error = 0;
 	sa_family_t tmp_af;
 
 	tmp_af = (sa_family_t) af;
 
 	/*
 	 * XXX: should try to bring back some of the optimizations from
 	 * ng_ether.c
 	 */
 
 	/*
 	 * Doing anything more is likely to get more
 	 * expensive than it's worth..
 	 * it's probable that everything else is in one
 	 * big lump. The next node will do an m_pullup()
 	 * for exactly the amount of data it needs and
 	 * hopefully everything after that will not
 	 * need one. So let's just use M_PREPEND.
 	 */
 	M_PREPEND(m, sizeof (tmp_af), M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto done;
 	}
 
 #if 0
 copy:
 #endif
 	/* Copy header and return (possibly new) mbuf */
 	*mtod(m, sa_family_t *) = tmp_af;
 #if 0
 	bcopy((caddr_t)&tmp_af, mtod(m, sa_family_t *), sizeof(tmp_af));
 #endif
 done:
 	*mp = m;
 	return error;
 }
 
 /******************************************************************
 		    NETGRAPH NODE METHODS
 ******************************************************************/
 
 /*
  * Constructor method for this node type.
  *
  * It is not possible or allowable to create a node of this type through
  * the constructor: the method unconditionally returns EINVAL.
  * Nodes get created when the interface is attached (or, when
  * this node type's KLD is loaded).
  */
 static int
 ng_gif_constructor(node_p node)
 {
 	return (EINVAL);
 }
 
 /*
  * Validate and record a new hook on the node.
  *
  * The node has one hook slot shared by the "lower" and "orphan" names
  * ("divert" is an alias for "lower").  Returns EINVAL for any other
  * name, EISCONN when the slot is already occupied, and 0 after storing
  * the hook together with its lower/orphan flavour.
  */
 static	int
 ng_gif_newhook(node_p node, hook_p hook, const char *name)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	u_char as_orphan;
 
 	/* Which hook? */
 	if (strcmp(name, NG_GIF_HOOK_DIVERT) == 0 ||
 	    strcmp(name, NG_GIF_HOOK_LOWER) == 0)
 		as_orphan = 0;
 	else if (strcmp(name, NG_GIF_HOOK_ORPHAN) == 0)
 		as_orphan = 1;
 	else
 		return (EINVAL);
 
 	/* Check if already connected (shouldn't be, but doesn't hurt) */
 	if (priv->lower != NULL)
 		return (EISCONN);
 
 	/* OK */
 	priv->lower = hook;
 	priv->lowerOrphan = as_orphan;
 	return (0);
 }
 
 /*
  * Hooks are attached, adjust to force queueing.
  * We don't really care which hook it is.
  * they should all be queuing for outgoing data.
  *
  * The force-queue setting is applied to the peer of the given hook;
  * the method always returns 0.
  */
 static	int
 ng_gif_connect(hook_p hook)
 {
 	NG_HOOK_FORCE_QUEUE(NG_HOOK_PEER(hook));
 	return (0);
 }
 
 /*
  * Control message method.
  *
  * Handles NGM_GIF_GET_IFNAME and NGM_GIF_GET_IFINDEX under
  * NGM_GIF_COOKIE; any other cookie or command yields EINVAL, and a
  * failed response allocation yields ENOMEM.  The response (if any) is
  * delivered with NG_RESPOND_MSG(), the request is freed, and the error
  * value is returned.
  */
 static int
 ng_gif_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *msg;
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 
 	NGI_GET_MSG(item, msg);
 	if (msg->header.typecookie != NGM_GIF_COOKIE) {
 		error = EINVAL;
 	} else {
 		switch (msg->header.cmd) {
 		case NGM_GIF_GET_IFNAME:
 			/* Reply with the interface name */
 			NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
 			if (resp != NULL) {
 				strlcpy(resp->data, priv->ifp->if_xname,
 				    IFNAMSIZ);
 			} else {
 				error = ENOMEM;
 			}
 			break;
 		case NGM_GIF_GET_IFINDEX:
 			/* Reply with the interface index */
 			NG_MKRESPONSE(resp, msg, sizeof(u_int32_t), M_NOWAIT);
 			if (resp != NULL) {
 				*((u_int32_t *)resp->data) =
 				    priv->ifp->if_index;
 			} else {
 				error = ENOMEM;
 			}
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 	}
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Data method: take the mbuf out of the item and pass it to
  * ng_gif_rcv_lower().  Data arriving on any hook other than the
  * recorded one causes a panic.
  */
 static int
 ng_gif_rcvdata(hook_p hook, item_p item)
 {
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct mbuf *m;
 
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 
 	if (hook != priv->lower)
 		panic("%s: weird hook", __func__);
 	return (ng_gif_rcv_lower(node, m));
 }
 
 /*
  * Handle an mbuf received on the "lower" hook.
  *
  * The packet starts with an sa_family_t; it is read into a zeroed
  * sockaddr, trimmed off the mbuf, and the rest is handed to
  * gif_output().  Returns EINVAL when the packet is too short to hold
  * the family, ENOBUFS when m_pullup() fails, and otherwise the result
  * of gif_output().
  */
 static int
 ng_gif_rcv_lower(node_p node, struct mbuf *m)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct sockaddr	dst;
 
 	bzero(&dst, sizeof(dst));
 
 	/* The leading address family must be present and contiguous */
 	if (m->m_pkthdr.len < sizeof(sa_family_t)) {
 		NG_FREE_M(m);
 		return (EINVAL);
 	}
 	if (m->m_len < sizeof(sa_family_t)) {
 		m = m_pullup(m, sizeof(sa_family_t));
 		if (m == NULL)
 			return (ENOBUFS);
 	}
 
 	/* Lift the family out of the packet and strip it */
 	dst.sa_family = *mtod(m, sa_family_t *);
 	m_adj(m, sizeof(sa_family_t));
 
 	/*
 	 * Send it on its way.
 	 * XXX: gif_output only uses dst for the family and passes the
 	 * fourth argument (rt) to in{,6}_gif_output which ignore it.
 	 * If this changes ng_gif will probably break.
 	 */
 	return (gif_output(priv->ifp, m, &dst, NULL));
 }
 
 /*
  * Shutdown method.
  *
  * Without the NGF_REALLY_DIE flag the node is simply revived.  With the
  * flag set, the private data is freed and the node reference dropped.
  * Always returns 0.
  */
 static int
 ng_gif_shutdown(node_p node)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if ((node->nd_flags & NGF_REALLY_DIE) == 0) {
 		/* Signal ng_rmnode we are persistent */
 		NG_NODE_REVIVE(node);
 		return (0);
 	}
 
 	/*
 	 * We came here because the gif interface is being destroyed,
 	 * so stop being persistent.
 	 * Actually undo all the things we did on creation.
 	 * Assume the ifp has already been freed.
 	 */
 	NG_NODE_SET_PRIVATE(node, NULL);
 	free(priv, M_NETGRAPH);
 	NG_NODE_UNREF(node);	/* free node itself */
 	return (0);
 }
 
 /*
  * Hook disconnection method.
  *
  * Clears the hook slot and its orphan flag (panicking when the hook is
  * not the recorded one) and, once the last hook is gone from a still
  * valid node, asks for the node to be reset.  Always returns 0.
  */
 static int
 ng_gif_disconnect(hook_p hook)
 {
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 
 	if (hook != priv->lower)
 		panic("%s: weird hook", __func__);
 	priv->lower = NULL;
 	priv->lowerOrphan = 0;
 
 	/* reset node */
 	if (NG_NODE_NUMHOOKS(node) == 0 && NG_NODE_IS_VALID(node))
 		ng_rmnode_self(node);
 
 	return (0);
 }
 
 /******************************************************************
 		    	INITIALIZATION
 ******************************************************************/
 
 /*
  * Module event handler for this node type.
  *
  * MOD_LOAD installs the ng_gif function hooks and attaches a node to
  * every IFT_GIF interface found in any vnet; it returns EEXIST when
  * the attach hook is already set.  MOD_UNLOAD clears the hooks.  Any
  * other event returns EOPNOTSUPP.
  */
 static int
 ng_gif_mod_event(module_t mod, int event, void *data)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct ifnet *ifp;
 
 	switch (event) {
 	case MOD_LOAD:
 		/* Somebody else already owns the function hooks */
 		if (ng_gif_attach_p != NULL)
 			return (EEXIST);
 
 		/* Register function hooks */
 		ng_gif_attach_p = ng_gif_attach;
 		ng_gif_detach_p = ng_gif_detach;
 		ng_gif_input_p = ng_gif_input;
 		ng_gif_input_orphan_p = ng_gif_input_orphan;
 
 		/* Create nodes for any already-existing gif interfaces */
 		VNET_LIST_RLOCK();
 		IFNET_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET_QUIET(vnet_iter); /* XXX revisit quiet */
 			CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 				if (ifp->if_type != IFT_GIF)
 					continue;
 				ng_gif_attach(ifp);
 			}
 			CURVNET_RESTORE();
 		}
 		IFNET_RUNLOCK();
 		VNET_LIST_RUNLOCK();
 		return (0);
 
 	case MOD_UNLOAD:
 		/*
 		 * Note that the base code won't try to unload us until
 		 * all nodes have been removed, and that can't happen
 		 * until all gif interfaces are destroyed. In any
 		 * case, we know there are no nodes left if the action
 		 * is MOD_UNLOAD, so there's no need to detach any nodes.
 		 *
 		 * XXX: what about manual unloads?!?
 		 */
 
 		/* Unregister function hooks */
 		ng_gif_attach_p = NULL;
 		ng_gif_detach_p = NULL;
 		ng_gif_input_p = NULL;
 		ng_gif_input_orphan_p = NULL;
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
diff --git a/sys/netgraph/ng_iface.c b/sys/netgraph/ng_iface.c
index e6871435fa88..2ba7a788633f 100644
--- a/sys/netgraph/ng_iface.c
+++ b/sys/netgraph/ng_iface.c
@@ -1,818 +1,819 @@
 /*
  * ng_iface.c
  */
 
 /*-
  * Copyright (c) 1996-1999 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Author: Archie Cobbs <archie@freebsd.org>
  *
  * $FreeBSD$
  * $Whistle: ng_iface.c,v 1.33 1999/11/01 09:24:51 julian Exp $
  */
 
 /*
  * This node is also a system networking interface. It has
  * a hook for each protocol (IP, AppleTalk, etc). Packets
  * are simply relayed between the interface and the hooks.
  *
  * Interfaces are named ng0, ng1, etc.  New nodes take the
  * first available interface name.
  *
  * This node also includes Berkeley packet filter support.
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/rmlock.h>
 #include <sys/sockio.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/libkern.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_iface.h>
 
 /*
  * With NG_SEPARATE_MALLOC this node type gets its own malloc type;
  * otherwise M_NETGRAPH_IFACE is simply an alias for M_NETGRAPH.
  */
 #ifdef NG_SEPARATE_MALLOC
 static MALLOC_DEFINE(M_NETGRAPH_IFACE, "netgraph_iface", "netgraph iface node");
 #else
 #define M_NETGRAPH_IFACE M_NETGRAPH
 #endif
 
 /* Sysctl subtree for this node type */
 static SYSCTL_NODE(_net_graph, OID_AUTO, iface, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Point to point netgraph interface");
 /*
  * Per-vnet nesting limit (default 2); ng_iface_output() hands it to
  * if_tunnel_check_nesting().  Exported read/write as "max_nesting".
  */
 VNET_DEFINE_STATIC(int, ng_iface_max_nest) = 2;
 #define	V_ng_iface_max_nest	VNET(ng_iface_max_nest)
 SYSCTL_INT(_net_graph_iface, OID_AUTO, max_nesting, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ng_iface_max_nest), 0, "Max nested tunnels");
 
 /*
  * This struct describes one address family: it pairs the address
  * family ID with the name of the hook that carries its packets.
  */
 struct iffam {
 	sa_family_t	family;		/* Address family */
 	const char	*hookname;	/* Name for hook */
 };
 /* Pointer to a read-only family descriptor */
 typedef const struct iffam *iffam_p;
 
 /*
  * List of address families supported by our interface.
  * The per-node hooks[] array is indexed in parallel with this table
  * (see get_hook_from_iffam() and get_iffam_from_hook()).
  */
 const static struct iffam gFamilies[] = {
 	{ AF_INET,	NG_IFACE_HOOK_INET	},
 	{ AF_INET6,	NG_IFACE_HOOK_INET6	},
 };
 /* Number of entries in gFamilies[] */
 #define	NUM_FAMILIES		nitems(gFamilies)
 
 /*
  * Node private data.  One instance is allocated per node by
  * ng_iface_constructor() and released by ng_iface_shutdown().
  */
 struct ng_iface_private {
 	struct	ifnet *ifp;		/* Our interface */
 	int	unit;			/* Interface unit number */
 	node_p	node;			/* Our netgraph node */
 	hook_p	hooks[NUM_FAMILIES];	/* Hook for each address family */
 	struct rmlock	lock;		/* Protect private data changes */
 };
 /* Handle to the node private data */
 typedef struct ng_iface_private *priv_p;
 
 /*
  * Wrappers around the per-node rmlock.  For the read variants "t" is
  * the tracker argument passed through to rm_rlock()/rm_runlock()
  * (callers in this file pass a pointer to a struct rm_priotracker).
  */
 #define	PRIV_RLOCK(priv, t)	rm_rlock(&priv->lock, t)
 #define	PRIV_RUNLOCK(priv, t)	rm_runlock(&priv->lock, t)
 #define	PRIV_WLOCK(priv)	rm_wlock(&priv->lock)
 #define	PRIV_WUNLOCK(priv)	rm_wunlock(&priv->lock)
 
 /* Interface methods (installed on, or called on behalf of, the ifnet) */
 static void	ng_iface_start(struct ifnet *ifp);
 static int	ng_iface_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data);
 static int	ng_iface_output(struct ifnet *ifp, struct mbuf *m0,
     			const struct sockaddr *dst, struct route *ro);
 static void	ng_iface_bpftap(struct ifnet *ifp,
 			struct mbuf *m, sa_family_t family);
 static int	ng_iface_send(struct ifnet *ifp, struct mbuf *m,
 			sa_family_t sa);
 #ifdef DEBUG
 /* Only built with DEBUG: logs each ioctl seen by ng_iface_ioctl() */
 static void	ng_iface_print_ioctl(struct ifnet *ifp, int cmd, caddr_t data);
 #endif
 
 /* Netgraph methods (referenced from the node type descriptor) */
 static int		ng_iface_mod_event(module_t, int, void *);
 static ng_constructor_t	ng_iface_constructor;
 static ng_rcvmsg_t	ng_iface_rcvmsg;
 static ng_shutdown_t	ng_iface_shutdown;
 static ng_newhook_t	ng_iface_newhook;
 static ng_rcvdata_t	ng_iface_rcvdata;
 static ng_disconnect_t	ng_iface_disconnect;
 
 /* Helper stuff: lookups between family IDs, hook names and hook slots */
 static iffam_p	get_iffam_from_af(sa_family_t family);
 static iffam_p	get_iffam_from_hook(priv_p priv, hook_p hook);
 static iffam_p	get_iffam_from_name(const char *name);
 static hook_p  *get_hook_from_iffam(priv_p priv, iffam_p iffam);
 
 /*
  * List of commands and how to convert arguments to/from ASCII.
  * Each entry names the type cookie, the command, its ASCII name and two
  * parse types; NOTE(review): the two parse types look like "argument"
  * then "response" -- confirm against struct ng_cmdlist.
  * The list is terminated by an all-zero entry.
  */
 static const struct ng_cmdlist ng_iface_cmds[] = {
 	{
 	  NGM_IFACE_COOKIE,
 	  NGM_IFACE_GET_IFNAME,
 	  "getifname",
 	  NULL,
 	  &ng_parse_string_type
 	},
 	{
 	  NGM_IFACE_COOKIE,
 	  NGM_IFACE_POINT2POINT,
 	  "point2point",
 	  NULL,
 	  NULL
 	},
 	{
 	  NGM_IFACE_COOKIE,
 	  NGM_IFACE_BROADCAST,
 	  "broadcast",
 	  NULL,
 	  NULL
 	},
 	{
 	  NGM_IFACE_COOKIE,
 	  NGM_IFACE_GET_IFINDEX,
 	  "getifindex",
 	  NULL,
 	  &ng_parse_uint32_type
 	},
 	{ 0 }
 };
 
 /*
  * Node type descriptor: binds the node type name to the method
  * implementations and the command list defined in this file, and
  * registers the type via NETGRAPH_INIT().
  */
 static struct ng_type typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_IFACE_NODE_TYPE,
 	.mod_event =	ng_iface_mod_event,
 	.constructor =	ng_iface_constructor,
 	.rcvmsg =	ng_iface_rcvmsg,
 	.shutdown =	ng_iface_shutdown,
 	.newhook =	ng_iface_newhook,
 	.rcvdata =	ng_iface_rcvdata,
 	.disconnect =	ng_iface_disconnect,
 	.cmdlist =	ng_iface_cmds,
 };
 NETGRAPH_INIT(iface, &typestruct);
 
 /*
  * Per-vnet unit number allocator for interface units; created in
  * vnet_ng_iface_init() and destroyed in vnet_ng_iface_uninit().
  */
 VNET_DEFINE_STATIC(struct unrhdr *, ng_iface_unit);
 #define	V_ng_iface_unit			VNET(ng_iface_unit)
 
 /************************************************************************
 			HELPER STUFF
  ************************************************************************/
 
 /*
  * Look up the family descriptor that matches an address family ID.
  * Returns NULL when the family is not listed in gFamilies[].
  */
 static __inline iffam_p
 get_iffam_from_af(sa_family_t family)
 {
 	iffam_p cur;
 
 	for (cur = gFamilies; cur < gFamilies + NUM_FAMILIES; cur++) {
 		if (cur->family == family)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Find which family a hook is registered for by scanning the node's
  * hook slots.  Returns NULL when the hook occupies none of them.
  */
 static __inline iffam_p
 get_iffam_from_hook(priv_p priv, hook_p hook)
 {
 	int slot = 0;
 
 	while (slot < NUM_FAMILIES) {
 		if (priv->hooks[slot] == hook)
 			return (&gFamilies[slot]);
 		slot++;
 	}
 	return (NULL);
 }
 
 /*
  * Return the address of the hook slot that belongs to a family
  * descriptor.  The descriptor's position within gFamilies[] selects
  * the slot, so iffam must point into that table.
  */
 
 static __inline hook_p *
 get_hook_from_iffam(priv_p priv, iffam_p iffam)
 {
 	hook_p *slot = priv->hooks;
 
 	slot += iffam - gFamilies;
 	return (slot);
 }
 
 /*
  * Look up the family descriptor whose hook name equals "name".
  * Returns NULL when no family uses that hook name.
  */
 static __inline iffam_p
 get_iffam_from_name(const char *name)
 {
 	iffam_p cur;
 
 	for (cur = gFamilies; cur < gFamilies + NUM_FAMILIES; cur++) {
 		if (strcmp(cur->hookname, name) == 0)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /************************************************************************
 			INTERFACE STUFF
  ************************************************************************/
 
 /*
  * Process an ioctl for the virtual interface.
  *
  * Returns 0 on success, EINVAL for an out-of-range MTU or an unknown
  * command, and EOPNOTSUPP for SIOCSIFPHYS.
  */
 static int
 ng_iface_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct ifreq *const ifr = (struct ifreq *) data;
 
 #ifdef DEBUG
 	ng_iface_print_ioctl(ifp, command, data);
 #endif
 	switch (command) {
 	/* Address changes are mostly handled at a higher layer */
 	case SIOCSIFADDR:
 		ifp->if_flags |= IFF_UP;
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE);
 		return (0);
 
 	case SIOCGIFADDR:
 		return (0);
 
 	/*
 	 * Set flags: an interface marked up but stopped is started,
 	 * one marked down but running is stopped.
 	 */
 	case SIOCSIFFLAGS:
 		if ((ifr->ifr_flags & IFF_UP) != 0) {
 			if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 				ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE);
 				ifp->if_drv_flags |= IFF_DRV_RUNNING;
 			}
 		} else if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 			ifp->if_drv_flags &= ~(IFF_DRV_RUNNING |
 			    IFF_DRV_OACTIVE);
 		}
 		return (0);
 
 	/* Set the interface MTU, rejecting values outside the limits */
 	case SIOCSIFMTU:
 		if (ifr->ifr_mtu < NG_IFACE_MTU_MIN ||
 		    ifr->ifr_mtu > NG_IFACE_MTU_MAX)
 			return (EINVAL);
 		ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 
 	/* Multicast membership changes are accepted without any action */
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		return (0);
 
 	/* Stuff that's not supported */
 	case SIOCSIFPHYS:
 		return (EOPNOTSUPP);
 
 	default:
 		return (EINVAL);
 	}
 }
 
 /*
  * Output routine of the interface.  The address family of the packet
  * selects the hook it is relayed to, provided that hook exists and is
  * connected.  The mbuf is consumed on every path.
  */
 
 static int
 ng_iface_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	uint32_t af;
 	int error;
 
 	/* The interface must be both up and running */
 	if ((ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/* Protect from deadly infinite recursion. */
 	error = if_tunnel_check_nesting(ifp, m, NGM_IFACE_COOKIE,
 	    V_ng_iface_max_nest);
 	if (error != 0) {
 		m_freem(m);
 		return (error);
 	}
 
 	/* BPF writes carry the real family inside sa_data. */
 	if (dst->sa_family != AF_UNSPEC)
 		af = RO_GET_FAMILY(ro, dst);
 	else
 		bcopy(dst->sa_data, &af, sizeof(af));
 
 	/* Berkeley packet filter */
 	ng_iface_bpftap(ifp, m, af);
 
 	/* Without ALTQ the packet goes straight into netgraph */
 	if (!ALTQ_IS_ENABLED(&ifp->if_snd))
 		return (ng_iface_send(ifp, m, af));
 
 	/*
 	 * With ALTQ the family is stashed in front of the packet so that
 	 * ng_iface_start() can recover it after dequeueing.
 	 */
 	M_PREPEND(m, sizeof(sa_family_t), M_NOWAIT);
 	if (m == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		return (ENOBUFS);
 	}
 	*(sa_family_t *)m->m_data = af;
 	return ((ifp->if_transmit)(ifp, m));
 }
 
 /*
  * Start method, used only when ALTQ is enabled.  Drains the send
  * queue: each packet carries its address family in a leading
  * sa_family_t (prepended by ng_iface_output()), which is stripped
  * before the packet is handed to ng_iface_send().
  */
 static void
 ng_iface_start(struct ifnet *ifp)
 {
 	struct mbuf *pkt;
 	sa_family_t family;
 
 	KASSERT(ALTQ_IS_ENABLED(&ifp->if_snd), ("%s without ALTQ", __func__));
 
 	IFQ_DRV_DEQUEUE(&ifp->if_snd, pkt);
 	while (pkt != NULL) {
 		family = *mtod(pkt, sa_family_t *);
 		m_adj(pkt, sizeof(sa_family_t));
 		ng_iface_send(ifp, pkt, family);
 		IFQ_DRV_DEQUEUE(&ifp->if_snd, pkt);
 	}
 }
 
 /*
  * Show a packet to BPF listeners, if there are any.  BPF expects a
  * 4 byte address family header in front of the data; it is supplied
  * out of band via bpf_mtap2(), so the mbuf itself is left untouched.
  */
 static void
 ng_iface_bpftap(struct ifnet *ifp, struct mbuf *m, sa_family_t family)
 {
 	int32_t af_hdr;
 
 	KASSERT(family != AF_UNSPEC, ("%s: family=AF_UNSPEC", __func__));
 
 	/* Nobody is listening */
 	if (!bpf_peers_present(ifp->if_bpf))
 		return;
 
 	af_hdr = (int32_t)family;
 	bpf_mtap2(ifp->if_bpf, &af_hdr, sizeof(af_hdr), m);
 }
 
 /*
  * Hand a packet to netgraph(4) through the hook that serves its
  * address family.  Called from ng_iface_start() and ng_iface_output().
  *
  * Returns EAFNOSUPPORT for an unknown family, ENETDOWN when no hook is
  * connected for the family, otherwise the result of the send.  The
  * mbuf is consumed in all cases.
  */
 static int
 ng_iface_send(struct ifnet *ifp, struct mbuf *m, sa_family_t sa)
 {
 	struct rm_priotracker tracker;
 	const priv_p priv = (priv_p) ifp->if_softc;
 	const iffam_p iffam = get_iffam_from_af(sa);
 	hook_p out;
 	int error;
 	int pktlen;
 
 	/* Check address family to determine hook (if known) */
 	if (iffam == NULL) {
 		m_freem(m);
 		log(LOG_WARNING, "%s: can't handle af%d\n", ifp->if_xname, sa);
 		return (EAFNOSUPPORT);
 	}
 
 	/* Remember the length now; the mbuf is gone after the send. */
 	pktlen = m->m_pkthdr.len;
 
 	/* Take a reference on the hook while holding the read lock */
 	PRIV_RLOCK(priv, &tracker);
 	out = *get_hook_from_iffam(priv, iffam);
 	if (out == NULL) {
 		NG_FREE_M(m);
 		PRIV_RUNLOCK(priv, &tracker);
 		return (ENETDOWN);
 	}
 	NG_HOOK_REF(out);
 	PRIV_RUNLOCK(priv, &tracker);
 
 	NG_OUTBOUND_THREAD_REF();
 	NG_SEND_DATA_ONLY(error, out, m);
 	NG_OUTBOUND_THREAD_UNREF();
 	NG_HOOK_UNREF(out);
 
 	if (error != 0)
 		return (error);
 
 	/* Update stats for successfully sent packets only. */
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, pktlen);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	return (0);
 }
 
 #ifdef DEBUG
 /*
  * Display an ioctl to the virtual interface.
  *
  * Logs, at LOG_DEBUG, the interface name, the direction encoded in the
  * command ("IO", "IOR", "IOW", "IORW" or "IO??"), the ioctl group
  * character, the command number and the parameter length.  "data" is
  * not inspected.
  */
 
 static void
 ng_iface_print_ioctl(struct ifnet *ifp, int command, caddr_t data)
 {
 	/* Only ever points at string literals, hence const */
 	const char *str;
 
 	switch (command & IOC_DIRMASK) {
 	case IOC_VOID:
 		str = "IO";
 		break;
 	case IOC_OUT:
 		str = "IOR";
 		break;
 	case IOC_IN:
 		str = "IOW";
 		break;
 	case IOC_INOUT:
 		str = "IORW";
 		break;
 	default:
 		str = "IO??";
 		break;
 	}
 	log(LOG_DEBUG, "%s: %s('%c', %d, char[%d])\n",
 	       ifp->if_xname,
 	       str,
 	       IOCGROUP(command),
 	       command & 0xff,
 	       IOCPARM_LEN(command));
 }
 #endif /* DEBUG */
 
 /************************************************************************
 			NETGRAPH NODE STUFF
  ************************************************************************/
 
 /*
  * Constructor for a node.
  *
  * Allocates the private data and the ifnet, reserves an interface unit
  * number, links node, private data and interface together, then
  * attaches the interface and its BPF tap.
  *
  * Returns 0 on success, ENOMEM if the ifnet cannot be allocated, or
  * ENOSPC if no interface unit number is available.  Nothing is left
  * allocated on failure.
  */
 static int
 ng_iface_constructor(node_p node)
 {
 	struct ifnet *ifp;
 	priv_p priv;
 	int unit;
 
 	/* Allocate node and interface private structures */
 	priv = malloc(sizeof(*priv), M_NETGRAPH_IFACE, M_WAITOK | M_ZERO);
 	ifp = if_alloc(IFT_PROPVIRTUAL);
 	if (ifp == NULL) {
 		free(priv, M_NETGRAPH_IFACE);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Get an interface unit number.  alloc_unr() reports exhaustion
 	 * with -1; using that as a unit would yield a bogus interface
 	 * name and a bad free_unr() at shutdown, so bail out here while
 	 * there is still nothing else to undo.
 	 */
 	unit = alloc_unr(V_ng_iface_unit);
 	if (unit == -1) {
 		if_free(ifp);
 		free(priv, M_NETGRAPH_IFACE);
 		return (ENOSPC);
 	}
 	priv->unit = unit;
 
 	rm_init(&priv->lock, "ng_iface private rmlock");
 
 	/* Link them together */
 	ifp->if_softc = priv;
 	priv->ifp = ifp;
 
 	/* Link together node and private info */
 	NG_NODE_SET_PRIVATE(node, priv);
 	priv->node = node;
 
 	/* Initialize interface structure */
 	if_initname(ifp, NG_IFACE_IFACE_NAME, priv->unit);
 	ifp->if_output = ng_iface_output;
 	ifp->if_start = ng_iface_start;
 	ifp->if_ioctl = ng_iface_ioctl;
 	ifp->if_mtu = NG_IFACE_MTU_DEFAULT;
 	ifp->if_flags = (IFF_SIMPLEX|IFF_POINTOPOINT|IFF_NOARP|IFF_MULTICAST);
 	ifp->if_type = IFT_PROPVIRTUAL;		/* XXX */
 	ifp->if_addrlen = 0;			/* XXX */
 	ifp->if_hdrlen = 0;			/* XXX */
 	ifp->if_baudrate = 64000;		/* XXX */
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	ifp->if_snd.ifq_drv_maxlen = ifqmaxlen;
 	IFQ_SET_READY(&ifp->if_snd);
 
 	/* Give this node the same name as the interface (if possible) */
 	if (ng_name_node(node, ifp->if_xname) != 0)
 		log(LOG_WARNING, "%s: can't acquire netgraph name\n",
 		    ifp->if_xname);
 
 	/* Attach the interface */
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 
 	/* Done */
 	return (0);
 }
 
 /*
  * Decide whether a new hook may be added.  The hook name must match
  * one of the supported families (EPFNOSUPPORT otherwise) and that
  * family's slot must still be free (EISCONN otherwise).
  */
 static int
 ng_iface_newhook(node_p node, hook_p hook, const char *name)
 {
 	const iffam_p iffam = get_iffam_from_name(name);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	hook_p *slot;
 	int error = 0;
 
 	if (iffam == NULL)
 		return (EPFNOSUPPORT);
 
 	PRIV_WLOCK(priv);
 	slot = get_hook_from_iffam(priv, iffam);
 	if (*slot == NULL) {
 		/* Slot is free: claim it and configure the hook */
 		*slot = hook;
 		NG_HOOK_HI_STACK(hook);
 		NG_HOOK_SET_TO_INBOUND(hook);
 	} else {
 		error = EISCONN;
 	}
 	PRIV_WUNLOCK(priv);
 	return (error);
 }
 
 /*
  * Receive a control message.
  *
  * Handles the NGM_IFACE_COOKIE commands (get interface name, switch
  * between point-to-point and broadcast mode, get interface index) and
  * the NGM_FLOW_COOKIE link up/down notifications.  Unknown cookies and
  * unknown NGM_IFACE_COOKIE commands yield EINVAL.
  *
  * Every path leaves through the common exit at the bottom, which sends
  * the response (if any) and frees the message.
  */
 static int
 ng_iface_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ifnet *const ifp = priv->ifp;
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 	switch (msg->header.typecookie) {
 	case NGM_IFACE_COOKIE:
 		switch (msg->header.cmd) {
 		case NGM_IFACE_GET_IFNAME:
 			NG_MKRESPONSE(resp, msg, IFNAMSIZ, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			strlcpy(resp->data, ifp->if_xname, IFNAMSIZ);
 			break;
 
 		case NGM_IFACE_POINT2POINT:
 		case NGM_IFACE_BROADCAST:
 		    {
 			/*
 			 * Deny request if interface is UP.  Do not return
 			 * directly: the item and message still have to be
 			 * released by the common exit below.
 			 */
 			if ((ifp->if_flags & IFF_UP) != 0) {
 				error = EBUSY;
 				break;
 			}
 
 			/* Change flags */
 			switch (msg->header.cmd) {
 			case NGM_IFACE_POINT2POINT:
 				ifp->if_flags |= IFF_POINTOPOINT;
 				ifp->if_flags &= ~IFF_BROADCAST;
 				break;
 			case NGM_IFACE_BROADCAST:
 				ifp->if_flags &= ~IFF_POINTOPOINT;
 				ifp->if_flags |= IFF_BROADCAST;
 				break;
 			}
 			break;
 		    }
 
 		case NGM_IFACE_GET_IFINDEX:
 			NG_MKRESPONSE(resp, msg, sizeof(uint32_t), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				break;
 			}
 			*((uint32_t *)resp->data) = ifp->if_index;
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	case NGM_FLOW_COOKIE:
 		/* Mirror link state notifications onto the interface */
 		switch (msg->header.cmd) {
 		case NGM_LINK_IS_UP:
 			if_link_state_change(ifp, LINK_STATE_UP);
 			break;
 		case NGM_LINK_IS_DOWN:
 			if_link_state_change(ifp, LINK_STATE_DOWN);
 			break;
 		default:
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive data from a hook. Pass the packet to the correct input routine.
  *
  * The hook the data arrived on determines the address family.  Returns
  * ENETDOWN if the interface is not up and EAFNOSUPPORT if the family
  * has no netisr compiled in; the mbuf is freed in both cases.
  */
 static int
 ng_iface_rcvdata(hook_p hook, item_p item)
 {
 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	const iffam_p iffam = get_iffam_from_hook(priv, hook);
 	struct ifnet *const ifp = priv->ifp;
 	struct epoch_tracker et;
 	struct mbuf *m;
 	int isr;
 
 	/* Take the mbuf out of the item; the item is not needed afterwards */
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 	/* Sanity checks */
 	KASSERT(iffam != NULL, ("%s: iffam", __func__));
 	M_ASSERTPKTHDR(m);
 	if ((ifp->if_flags & IFF_UP) == 0) {
 		NG_FREE_M(m);
 		return (ENETDOWN);
 	}
 
 	/* Update interface stats */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 
 	/* Note receiving interface */
 	m->m_pkthdr.rcvif = ifp;
 
 	/* Berkeley packet filter */
 	ng_iface_bpftap(ifp, m, iffam->family);
 
 	/* Send packet: pick the netisr queue matching the family */
 	switch (iffam->family) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 	random_harvest_queue(m, sizeof(*m), RANDOM_NET_NG);
 	M_SETFIB(m, ifp->if_fib);
 	/* Dispatch in the interface's vnet, inside the network epoch */
 	CURVNET_SET(ifp->if_vnet);
 	NET_EPOCH_ENTER(et);
 	netisr_dispatch(isr, m);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Tear down the node together with its interface: detach BPF and the
  * ifnet, free the ifnet, give the unit number back, then destroy the
  * lock and the private data and drop the node reference.
  */
 static int
 ng_iface_shutdown(node_p node)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ifnet *const ifp = priv->ifp;
 
 	/*
 	 * The ifnet may be in a different vnet than the netgraph node, 
 	 * hence we have to change the current vnet context here.
 	 */
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 	CURVNET_RESTORE();
 
 	/* Release everything that hung off the private data */
 	priv->ifp = NULL;
 	free_unr(V_ng_iface_unit, priv->unit);
 	rm_destroy(&priv->lock);
 	free(priv, M_NETGRAPH_IFACE);
 
 	NG_NODE_SET_PRIVATE(node, NULL);
 	NG_NODE_UNREF(node);
 	return (0);
 }
 
 /*
  * Hook disconnection: clear the slot the hook occupied.  The node is
  * deliberately *not* shut down when its last hook goes away.
  * Panics if the hook is not registered for any family.
  */
 static int
 ng_iface_disconnect(hook_p hook)
 {
 	const priv_p priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	const iffam_p iffam = get_iffam_from_hook(priv, hook);
 	hook_p *slot;
 
 	if (iffam == NULL)
 		panic("%s", __func__);
 
 	PRIV_WLOCK(priv);
 	slot = get_hook_from_iffam(priv, iffam);
 	*slot = NULL;
 	PRIV_WUNLOCK(priv);
 	return (0);
 }
 
 /*
  * Module event handler for this node type.  Loading and unloading need
  * no work here; every other event is rejected with EOPNOTSUPP.
  */
 static int
 ng_iface_mod_event(module_t mod, int event, void *data)
 {
 
 	if (event == MOD_LOAD || event == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /*
  * Per-vnet initialisation: create the unit number allocator covering
  * units 0 through 0xffff.  The argument is unused.
  */
 static void
 vnet_ng_iface_init(const void *unused)
 {
 
 	V_ng_iface_unit = new_unrhdr(0, 0xffff, NULL);
 }
 VNET_SYSINIT(vnet_ng_iface_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ng_iface_init, NULL);
 
 /*
  * Per-vnet teardown: destroy the unit number allocator created by
  * vnet_ng_iface_init().  The argument is unused.
  */
 static void
 vnet_ng_iface_uninit(const void *unused)
 {
 
 	delete_unrhdr(V_ng_iface_unit);
 }
 VNET_SYSUNINIT(vnet_ng_iface_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
     vnet_ng_iface_uninit, NULL);
diff --git a/sys/netgraph/ng_source.c b/sys/netgraph/ng_source.c
index 0eee9ceb25c5..c04f477cd7f2 100644
--- a/sys/netgraph/ng_source.c
+++ b/sys/netgraph/ng_source.c
@@ -1,932 +1,933 @@
 /*
  * ng_source.c
  */
 
 /*-
  * Copyright (c) 2005 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright 2002 Sandvine Inc.
  * All rights reserved.
  *
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Sandvine Inc.; provided,
  * however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Sandvine Inc.
  *    trademarks, including the mark "SANDVINE" on advertising, endorsements,
  *    or otherwise except as such appears in the above copyright notice or in
  *    the software.
  *
  * THIS SOFTWARE IS BEING PROVIDED BY SANDVINE "AS IS", AND TO THE MAXIMUM
  * EXTENT PERMITTED BY LAW, SANDVINE MAKES NO REPRESENTATIONS OR WARRANTIES,
  * EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, INCLUDING WITHOUT LIMITATION,
  * ANY AND ALL IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
  * PURPOSE, OR NON-INFRINGEMENT.  SANDVINE DOES NOT WARRANT, GUARANTEE, OR
  * MAKE ANY REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE
  * USE OF THIS SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY
  * OR OTHERWISE.  IN NO EVENT SHALL SANDVINE BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF SANDVINE IS ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  *
  * Author: Dave Chapeskie
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * This node is used for high speed packet generation.  It queues
  * all data received on its 'input' hook and when told to start via
  * a control message it sends the packets out its 'output' hook.  In
  * this way this node can be preloaded with a packet stream which it
  * can then send continuously as fast as possible.
  *
  * Currently it just copies the mbufs as required.  It could do various
  * tricks to try and avoid this.  Probably the best performance would
  * be achieved by modifying the appropriate drivers to be told to
  * self-re-enqueue packets (e.g. the if_bge driver could reuse the same
  * transmit descriptors) under control of this node; perhaps via some
  * flag in the mbuf or some such.  The node could peek at an appropriate
  * ifnet flag to see if such support is available for the connected
  * interface.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_ether.h>
 #include <netgraph/ng_source.h>
 
 #define NG_SOURCE_INTR_TICKS		1
 #define NG_SOURCE_DRIVER_IFQ_MAXLEN	(4*1024)
 
 #define	mtod_off(m,off,t)	((t)(mtod((m),caddr_t)+(off)))
 
 /* Per node info; allocated zeroed by ng_source_constructor() */
 struct privdata {
 	node_p				node;		/* back pointer to our node */
 	hook_p				input;		/* "input" hook, set in newhook */
 	hook_p				output;		/* "output" hook, set in newhook */
 	struct ng_source_stats		stats;		/* reported by the *_STATS commands */
 	struct mbufq			snd_queue;	/* packets to send */
 	struct mbuf			*last_packet;	/* last pkt in queue */
 	struct ifnet			*output_ifp;	/* cleared when "output" is added */
 	struct callout			intr_ch;	/* initialised via ng_callout_init() */
 	uint64_t			packets;	/* packets to send */
 	uint32_t			queueOctets;	/* copied into stats.queueOctets */
 	struct ng_source_embed_info	embed_timestamp; /* set/get via *_TIMESTAMP commands */
 	struct ng_source_embed_cnt_info	embed_counter[NG_SOURCE_COUNTERS];
 };
 /* Handle to the per node info */
 typedef struct privdata *sc_p;
 
 /* Node flags: kept in the node's nd_flags, using the type-private bit */
 #define NG_SOURCE_ACTIVE	(NGF_TYPE1)
 
 /* Netgraph methods (referenced from the node type descriptor) */
 static ng_constructor_t	ng_source_constructor;
 static ng_rcvmsg_t	ng_source_rcvmsg;
 static ng_shutdown_t	ng_source_rmnode;
 static ng_newhook_t	ng_source_newhook;
 static ng_connect_t	ng_source_connect;
 static ng_rcvdata_t	ng_source_rcvdata;
 static ng_disconnect_t	ng_source_disconnect;
 
 /* Other functions: internal helpers of this node type */
 static void		ng_source_intr(node_p, hook_p, void *, int);
 static void		ng_source_clr_data (sc_p);
 static int		ng_source_start (sc_p, uint64_t);
 static void		ng_source_stop (sc_p);
 static int		ng_source_send (sc_p, int, int *);
 static int		ng_source_store_output_ifp(sc_p, char *);
 static void		ng_source_packet_mod(sc_p, struct mbuf *,
 			    int, int, caddr_t, int);
 static void		ng_source_mod_counter(sc_p sc,
 			    struct ng_source_embed_cnt_info *cnt,
 			    struct mbuf *m, int increment);
 static int		ng_source_dup_mod(sc_p, struct mbuf *,
 			    struct mbuf **);
 
 /*
  * Parse type for timeval.  The field widths are chosen at compile
  * time: tv_sec is 32 bits on i386 and 64 bits elsewhere; tv_usec is
  * 64 bits on LP64 platforms and 32 bits elsewhere.
  */
 static const struct ng_parse_struct_field ng_source_timeval_type_fields[] = {
 #ifdef __i386__
 	{ "tv_sec",		&ng_parse_int32_type	},
 #else
 	{ "tv_sec",		&ng_parse_int64_type	},
 #endif
 #ifdef __LP64__
 	{ "tv_usec",		&ng_parse_int64_type	},
 #else
 	{ "tv_usec",		&ng_parse_int32_type	},
 #endif
 	{ NULL }
 };
 /* Not static: visible outside this file */
 const struct ng_parse_type ng_source_timeval_type = {
 	&ng_parse_struct_type,
 	&ng_source_timeval_type_fields
 };
 
 /*
  * Parse type for struct ng_source_stats; the field list comes from
  * the NG_SOURCE_STATS_TYPE_INFO macro.
  */
 static const struct ng_parse_struct_field ng_source_stats_type_fields[]
 	= NG_SOURCE_STATS_TYPE_INFO;
 static const struct ng_parse_type ng_source_stats_type = {
 	&ng_parse_struct_type,
 	&ng_source_stats_type_fields
 };
 
 /*
  * Parse type for struct ng_source_embed_info; the field list comes
  * from the NG_SOURCE_EMBED_TYPE_INFO macro.
  */
 static const struct ng_parse_struct_field ng_source_embed_type_fields[] =
 	NG_SOURCE_EMBED_TYPE_INFO;
 static const struct ng_parse_type ng_source_embed_type = {
 	&ng_parse_struct_type,
 	&ng_source_embed_type_fields
 };
 
 /*
  * Parse type for struct ng_source_embed_cnt_info; the field list comes
  * from the NG_SOURCE_EMBED_CNT_TYPE_INFO macro.
  */
 static const struct ng_parse_struct_field ng_source_embed_cnt_type_fields[] =
 	NG_SOURCE_EMBED_CNT_TYPE_INFO;
 static const struct ng_parse_type ng_source_embed_cnt_type = {
 	&ng_parse_struct_type,
 	&ng_source_embed_cnt_type_fields
 };
 
 /*
  * List of commands and how to convert arguments to/from ASCII.
  * Each entry names the type cookie, the command, its ASCII name and two
  * parse types; NOTE(review): the two parse types look like "argument"
  * then "response" -- confirm against struct ng_cmdlist.
  * The list is terminated by an all-zero entry.
  */
 static const struct ng_cmdlist ng_source_cmds[] = {
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_GET_STATS,
 	  "getstats",
 	  NULL,
 	  &ng_source_stats_type
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_CLR_STATS,
 	  "clrstats",
 	  NULL,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_GETCLR_STATS,
 	  "getclrstats",
 	  NULL,
 	  &ng_source_stats_type
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_START,
 	  "start",
 	  &ng_parse_uint64_type,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_STOP,
 	  "stop",
 	  NULL,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_CLR_DATA,
 	  "clrdata",
 	  NULL,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_SETIFACE,
 	  "setiface",
 	  &ng_parse_string_type,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_SETPPS,
 	  "setpps",
 	  &ng_parse_uint32_type,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_SET_TIMESTAMP,
 	  "settimestamp",
 	  &ng_source_embed_type,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_GET_TIMESTAMP,
 	  "gettimestamp",
 	  NULL,
 	  &ng_source_embed_type
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_SET_COUNTER,
 	  "setcounter",
 	  &ng_source_embed_cnt_type,
 	  NULL
 	},
 	{
 	  NGM_SOURCE_COOKIE,
 	  NGM_SOURCE_GET_COUNTER,
 	  "getcounter",
 	  &ng_parse_uint8_type,
 	  &ng_source_embed_cnt_type
 	},
 	{ 0 }
 };
 
 /*
  * Netgraph type descriptor: binds the node type name to the method
  * implementations and the command list defined in this file, and
  * registers the type via NETGRAPH_INIT().  No mod_event handler is set.
  */
 static struct ng_type ng_source_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_SOURCE_NODE_TYPE,
 	.constructor =	ng_source_constructor,
 	.rcvmsg =	ng_source_rcvmsg,
 	.shutdown =	ng_source_rmnode,
 	.newhook =	ng_source_newhook,
 	.connect =	ng_source_connect,
 	.rcvdata =	ng_source_rcvdata,
 	.disconnect =	ng_source_disconnect,
 	.cmdlist =	ng_source_cmds,
 };
 NETGRAPH_INIT(source, &ng_source_typestruct);
 
 /* NOTE(review): no definition or caller is visible nearby -- confirm this prototype is still needed */
 static int ng_source_set_autosrc(sc_p, uint32_t);
 
 /*
  * Node constructor: allocate zeroed per node info, set up the send
  * queue and the callout, and attach the info to the node.
  */
 static int
 ng_source_constructor(node_p node)
 {
 	sc_p sc = malloc(sizeof(*sc), M_NETGRAPH, M_WAITOK | M_ZERO);
 
 	sc->node = node;
 	mbufq_init(&sc->snd_queue, 2048);
 	ng_callout_init(&sc->intr_ch);
 	NG_NODE_SET_PRIVATE(node, sc);
 
 	return (0);
 }
 
 /*
  * Add a hook.  Only the "input" and "output" hook names are accepted;
  * anything else is EINVAL.  Adding the output hook also forgets the
  * output interface and resets the statistics.
  */
 static int
 ng_source_newhook(node_p node, hook_p hook, const char *name)
 {
 	sc_p sc = NG_NODE_PRIVATE(node);
 
 	if (strcmp(name, NG_SOURCE_HOOK_INPUT) == 0) {
 		sc->input = hook;
 		return (0);
 	}
 
 	if (strcmp(name, NG_SOURCE_HOOK_OUTPUT) != 0)
 		return (EINVAL);
 
 	sc->output = hook;
 	sc->output_ifp = NULL;
 	bzero(&sc->stats, sizeof(sc->stats));
 	return (0);
 }
 
 /*
  * Hook has been added.  For the "output" hook, ask the downstream
  * node for its interface name; nothing is done for other hooks.
  * Returns ENOBUFS if the query message cannot be allocated.  The
  * result of sending the query is not reported to the caller.
  */
 static int
 ng_source_connect(hook_p hook)
 {
 	sc_p sc = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	struct ng_mesg *query;
 	int dummy_error = 0;
 
 	/* Only the "output" hook needs information from downstream. */
 	if (hook != sc->output)
 		return (0);
 
 	NG_MKMESSAGE(query, NGM_ETHER_COOKIE, NGM_ETHER_GET_IFNAME,
 	    0, M_NOWAIT);
 	if (query == NULL)
 		return (ENOBUFS);
 
 	/*
 	 * Our hook and peer hook have HK_INVALID flag set,
 	 * so we can't use NG_SEND_MSG_HOOK() macro here.
 	 */
 	NG_SEND_MSG_ID(dummy_error, sc->node, query,
 	    NG_NODE_ID(NG_PEER_NODE(sc->output)), NG_NODE_ID(sc->node));
 
 	return (0);
 }
 
 /*
  * Receive a control message.
  *
  * Two type cookies are handled:
  *  - NGM_SOURCE_COOKIE: commands addressed to this node (statistics,
  *    start/stop, clearing queued data, interface/pps configuration and
  *    the embedded timestamp/counter settings).  Messages flagged as
  *    responses are rejected with EINVAL.
  *  - NGM_ETHER_COOKIE: only the response to NGM_ETHER_GET_IFNAME is
  *    accepted; it is used to record the output interface.
  * Anything else yields EINVAL.
  *
  * Any response built here is handed to NG_RESPOND_MSG() and the incoming
  * message is freed before returning.  Returns 0 or an errno value.
  */
 static int
 ng_source_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	sc_p sc = NG_NODE_PRIVATE(node);
 	struct ng_mesg *msg, *resp = NULL;
 	int error = 0;
 
 	NGI_GET_MSG(item, msg);
 
 	switch (msg->header.typecookie) {
 	case NGM_SOURCE_COOKIE:
 		/* We only accept commands for our own cookie, not replies. */
 		if (msg->header.flags & NGF_RESP) {
 			error = EINVAL;
 			break;
 		}
 		switch (msg->header.cmd) {
 		case NGM_SOURCE_GET_STATS:
 		case NGM_SOURCE_CLR_STATS:
 		case NGM_SOURCE_GETCLR_STATS:
 		    {
 			struct ng_source_stats *stats;
 
 			/* GET and GETCLR return a snapshot of the stats. */
 			if (msg->header.cmd != NGM_SOURCE_CLR_STATS) {
 				NG_MKRESPONSE(resp, msg,
 				    sizeof(*stats), M_NOWAIT);
 				if (resp == NULL) {
 					error = ENOMEM;
 					goto done;
 				}
 				sc->stats.queueOctets = sc->queueOctets;
 				sc->stats.queueFrames = mbufq_len(&sc->snd_queue);
 				/*
 				 * While still running (no end time recorded)
 				 * report the time elapsed so far.
 				 */
 				if ((sc->node->nd_flags & NG_SOURCE_ACTIVE)
 				    && !timevalisset(&sc->stats.endTime)) {
 					getmicrotime(&sc->stats.elapsedTime);
 					timevalsub(&sc->stats.elapsedTime,
 					    &sc->stats.startTime);
 				}
 				stats = (struct ng_source_stats *)resp->data;
 				bcopy(&sc->stats, stats, sizeof(*stats));
 			}
 			/* CLR and GETCLR reset the stats afterwards. */
 			if (msg->header.cmd != NGM_SOURCE_GET_STATS)
 				bzero(&sc->stats, sizeof(sc->stats));
 		    }
 		    break;
 		case NGM_SOURCE_START:
 		    {
 			uint64_t packets;
 
 			if (msg->header.arglen != sizeof(uint64_t)) {
 				error = EINVAL;
 				break;
 			}
 
 			packets = *(uint64_t *)msg->data;
 
 			error = ng_source_start(sc, packets);
 
 			break;
 		    }
 		case NGM_SOURCE_STOP:
 			ng_source_stop(sc);
 			break;
 		case NGM_SOURCE_CLR_DATA:
 			ng_source_clr_data(sc);
 			break;
 		case NGM_SOURCE_SETIFACE:
 		    {
 			char *ifname = (char *)msg->data;
 
 			if (msg->header.arglen < 2) {
 				error = EINVAL;
 				break;
 			}
 
 			/*
 			 * The lookup result is deliberately not propagated.
 			 * NOTE(review): ifname is not checked for NUL
 			 * termination within arglen - confirm senders
 			 * always terminate it.
 			 */
 			ng_source_store_output_ifp(sc, ifname);
 			break;
 		    }
 		case NGM_SOURCE_SETPPS:
 		    {
 			uint32_t pps;
 
 			if (msg->header.arglen != sizeof(uint32_t)) {
 				error = EINVAL;
 				break;
 			}
 
 			pps = *(uint32_t *)msg->data;
 
 			sc->stats.maxPps = pps;
 
 			break;
 		    }
 		case NGM_SOURCE_SET_TIMESTAMP:
 		    {
 			struct ng_source_embed_info *embed;
 
 			if (msg->header.arglen != sizeof(*embed)) {
 				error = EINVAL;
 				goto done;
 			}
 			embed = (struct ng_source_embed_info *)msg->data;
 			bcopy(embed, &sc->embed_timestamp, sizeof(*embed));
 
 			break;
 		    }
 		case NGM_SOURCE_GET_TIMESTAMP:
 		    {
 			struct ng_source_embed_info *embed;
 
 			NG_MKRESPONSE(resp, msg, sizeof(*embed), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				goto done;
 			}
 			embed = (struct ng_source_embed_info *)resp->data;
 			bcopy(&sc->embed_timestamp, embed, sizeof(*embed));
 
 			break;
 		    }
 		case NGM_SOURCE_SET_COUNTER:
 		    {
 			struct ng_source_embed_cnt_info *embed;
 
 			if (msg->header.arglen != sizeof(*embed)) {
 				error = EINVAL;
 				goto done;
 			}
 			embed = (struct ng_source_embed_cnt_info *)msg->data;
 			/* Only 1, 2 and 4 byte wide counters are supported. */
 			if (embed->index >= NG_SOURCE_COUNTERS ||
 			    !(embed->width == 1 || embed->width == 2 ||
 			    embed->width == 4)) {
 				error = EINVAL;
 				goto done;
 			}
 			bcopy(embed, &sc->embed_counter[embed->index],
 			    sizeof(*embed));
 
 			break;
 		    }
 		case NGM_SOURCE_GET_COUNTER:
 		    {
 			struct ng_source_embed_cnt_info *embed;
 			uint8_t index;
 
 			/*
 			 * The request must carry at least the index byte;
 			 * without this check an empty message would make us
 			 * read past the end of the payload.
 			 */
 			if (msg->header.arglen < sizeof(index)) {
 				error = EINVAL;
 				goto done;
 			}
 			index = *(uint8_t *)msg->data;
 			if (index >= NG_SOURCE_COUNTERS) {
 				error = EINVAL;
 				goto done;
 			}
 			NG_MKRESPONSE(resp, msg, sizeof(*embed), M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				goto done;
 			}
 			embed = (struct ng_source_embed_cnt_info *)resp->data;
 			bcopy(&sc->embed_counter[index], embed, sizeof(*embed));
 
 			break;
 		    }
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	case NGM_ETHER_COOKIE:
 		/* From the ether node we only expect replies to our query. */
 		if (!(msg->header.flags & NGF_RESP)) {
 			error = EINVAL;
 			break;
 		}
 		switch (msg->header.cmd) {
 		case NGM_ETHER_GET_IFNAME:
 		    {
 			char *ifname = (char *)msg->data;
 
 			if (msg->header.arglen < 2) {
 				error = EINVAL;
 				break;
 			}
 
 			/*
 			 * Once the interface is known, clear the peer's
 			 * autosrc setting (flag value 0).
 			 */
 			if (ng_source_store_output_ifp(sc, ifname) == 0)
 				ng_source_set_autosrc(sc, 0);
 			break;
 		    }
 		default:
 			error = EINVAL;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 done:
 	/* Take care of synchronous response, if any. */
 	NG_RESPOND_MSG(error, node, item, resp);
 	/* Free the message and return. */
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive data on a hook.
  *
  * Packets arriving on the output hook are discarded.  Packets arriving
  * on the input hook are appended to the send queue; if the queue refuses
  * the packet it is freed and the enqueue error is returned.
  */
 static int
 ng_source_rcvdata(hook_p hook, item_p item)
 {
 	sc_p sc;
 	struct mbuf *pkt;
 	int rc;
 
 	sc = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 
 	/* Detach the mbuf from the item; the item itself is not needed. */
 	NGI_GET_M(item, pkt);
 	NG_FREE_ITEM(item);
 
 	/* Traffic coming back in on the output hook is dropped. */
 	if (hook == sc->output) {
 		NG_FREE_M(pkt);
 		return (0);
 	}
 	KASSERT(hook == sc->input, ("%s: no hook!", __func__));
 
 	/* Queue the packet unless the queue is full. */
 	rc = mbufq_enqueue(&sc->snd_queue, pkt);
 	if (rc != 0) {
 		NG_FREE_M(pkt);
 		return (rc);
 	}
 
 	/* Account for the new packet and remember it as the list tail. */
 	sc->last_packet = pkt;
 	sc->queueOctets += pkt->m_pkthdr.len;
 
 	return (0);
 }
 
 /*
  * Node shutdown.
  *
  * Stops transmission, frees all queued packets, detaches the private
  * data from the node, drops the node reference and releases the softc.
  * Always returns 0.
  */
 static int
 ng_source_rmnode(node_p node)
 {
 	sc_p sc;
 
 	sc = NG_NODE_PRIVATE(node);
 
 	/* Quiesce the sender and throw away whatever is still queued. */
 	ng_source_stop(sc);
 	ng_source_clr_data(sc);
 
 	/* Detach and release the per-node state. */
 	NG_NODE_SET_PRIVATE(node, NULL);
 	NG_NODE_UNREF(node);
 	free(sc, M_NETGRAPH);
 
 	return (0);
 }
 
 /*
  * Hook disconnection.
  *
  * The node removes itself when its output hook goes away or when no
  * hooks remain connected.  Always returns 0.
  */
 static int
 ng_source_disconnect(hook_p hook)
 {
 	node_p node = NG_HOOK_NODE(hook);
 	sc_p sc = NG_NODE_PRIVATE(node);
 
 	KASSERT(sc != NULL, ("%s: null node private", __func__));
 
 	if (hook == sc->output || NG_NODE_NUMHOOKS(node) == 0)
 		ng_rmnode_self(node);
 
 	return (0);
 }
 
 /*
  * Look up the interface named 'ifname' and record it in sc->output_ifp
  * as the interface reached via our output hook.
  *
  * Returns EINVAL (after logging) when no such interface exists,
  * 0 otherwise.
  */
 static int
 ng_source_store_output_ifp(sc_p sc, char *ifname)
 {
 	struct ifnet *ifp = ifunit(ifname);
 
 	if (ifp == NULL) {
 		printf("%s: can't find interface %s\n", __func__, ifname);
 		return (EINVAL);
 	}
 
 	sc->output_ifp = ifp;
 
 #if 1
 	/*
 	 * XXX Enlarging a driver's ifqueue behind its back is ugly, but a
 	 * deep queue is needed to get close to line rate on a gigabit
 	 * interface with small packets.
 	 * XXX The original value should be restored at stop or disconnect.
 	 */
 	if (NG_SOURCE_DRIVER_IFQ_MAXLEN > ifp->if_snd.ifq_maxlen) {
 		printf("ng_source: changing ifq_maxlen from %d to %d\n",
 		    ifp->if_snd.ifq_maxlen, NG_SOURCE_DRIVER_IFQ_MAXLEN);
 		ifp->if_snd.ifq_maxlen = NG_SOURCE_DRIVER_IFQ_MAXLEN;
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Send NGM_ETHER_SET_AUTOSRC with the given flag value out the output
  * hook, i.e. set the attached ethernet node's source address override
  * flag.
  *
  * Returns ENOBUFS if the message cannot be allocated, otherwise the
  * result of sending it.
  */
 static int
 ng_source_set_autosrc(sc_p sc, uint32_t flag)
 {
 	struct ng_mesg *req;
 	int rc = 0;
 
 	NG_MKMESSAGE(req, NGM_ETHER_COOKIE, NGM_ETHER_SET_AUTOSRC,
 	    sizeof(flag), M_NOWAIT);
 	if (req == NULL)
 		return (ENOBUFS);
 
 	/* The payload is the single 32-bit flag value. */
 	*(uint32_t *)req->data = flag;
 	NG_SEND_MSG_HOOK(rc, sc->node, req, sc->output, 0);
 
 	return (rc);
 }
 
 /*
  * Free every packet on the send queue and reset the queue accounting.
  */
 static void
 ng_source_clr_data (sc_p sc)
 {
 	struct mbuf *pkt;
 
 	/* Drain the queue one packet at a time. */
 	while ((pkt = mbufq_dequeue(&sc->snd_queue)) != NULL)
 		NG_FREE_M(pkt);
 
 	sc->last_packet = NULL;
 	sc->queueOctets = 0;
 }
 
 /*
  * Start sending queued data out the output hook.
  *
  * 'packets' is the number of packets to send.  Returns ENXIO when neither
  * an output interface nor a pps limit has been configured, EBUSY when the
  * node is already active, and 0 once the send callout has been scheduled.
  */
 static int
 ng_source_start(sc_p sc, uint64_t packets)
 {
 	/* At least one of interface or rate limit is required. */
 	if (sc->output_ifp == NULL && sc->stats.maxPps == 0) {
 		printf("ng_source: start without iface or pps configured\n");
 		return (ENXIO);
 	}
 
 	/* Refuse to start twice. */
 	if ((sc->node->nd_flags & NG_SOURCE_ACTIVE) != 0)
 		return (EBUSY);
 	sc->node->nd_flags |= NG_SOURCE_ACTIVE;
 
 	sc->packets = packets;
 
 	/* Reset the run timers and stamp the start of this run. */
 	timevalclear(&sc->stats.elapsedTime);
 	timevalclear(&sc->stats.endTime);
 	getmicrotime(&sc->stats.startTime);
 	getmicrotime(&sc->stats.lastTime);
 
 	/* Kick off the first send pass with a zero-tick callout. */
 	ng_callout(&sc->intr_ch, sc->node, NULL, 0,
 	    ng_source_intr, sc, 0);
 
 	return (0);
 }
 
 /*
  * Stop sending queued data out the output hook.
  *
  * Cancels the send callout, clears the active flag and records the end
  * time together with the total elapsed time of the run.
  */
 static void
 ng_source_stop(sc_p sc)
 {
 	ng_uncallout(&sc->intr_ch, sc->node);
 	sc->node->nd_flags &= ~NG_SOURCE_ACTIVE;
 
 	/* elapsedTime = endTime - startTime */
 	getmicrotime(&sc->stats.endTime);
 	sc->stats.elapsedTime = sc->stats.endTime;
 	timevalsub(&sc->stats.elapsedTime, &sc->stats.startTime);
 }
 
 /*
  * Send callout.  While the node is active it reschedules itself every
  * NG_SOURCE_INTR_TICKS ticks.
  *
  * Each pass sends as many packets as the output interface's send queue
  * has room for (or, without a known interface, as many as are queued
  * here), optionally capped by the configured packets-per-second limit.
  */
 static void
 ng_source_intr(node_p node, hook_p hook, void *arg1, int arg2)
 {
 	sc_p sc = (sc_p)arg1;
 	int budget;
 
 	KASSERT(sc != NULL, ("%s: null node private", __func__));
 
 	/* Nothing left to do, no output hook, or no longer active: stop. */
 	if (sc->packets == 0 || sc->output == NULL ||
 	    (sc->node->nd_flags & NG_SOURCE_ACTIVE) == 0) {
 		ng_source_stop(sc);
 		return;
 	}
 
 	/* Work out how many packets this pass may send. */
 	if (sc->output_ifp == NULL) {
 		budget = mbufq_len(&sc->snd_queue);
 	} else {
 		struct ifqueue *ifq;
 
 		ifq = (struct ifqueue *)&sc->output_ifp->if_snd;
 		budget = ifq->ifq_maxlen - ifq->ifq_len;
 	}
 
 	/* Apply the pps limit for the time elapsed since the last pass. */
 	if (sc->stats.maxPps != 0) {
 		struct timeval	now, delta;
 		uint64_t	usec;
 		int		cap;
 
 		getmicrotime(&now);
 		delta = now;
 		timevalsub(&delta, &sc->stats.lastTime);
 		usec = delta.tv_sec * 1000000 + delta.tv_usec;
 		cap = (uint64_t)sc->stats.maxPps * usec / 1000000;
 		sc->stats.lastTime = now;
 		if (cap < budget)
 			budget = cap;
 	}
 
 	ng_source_send(sc, budget, NULL);
 
 	/* Keep going until the requested packet count is exhausted. */
 	if (sc->packets != 0)
 		ng_callout(&sc->intr_ch, node, NULL, NG_SOURCE_INTR_TICKS,
 		    ng_source_intr, sc, 0);
 	else
 		ng_source_stop(sc);
 }
 
 /*
  * Send packets out our output hook.
  *
  * Sends up to 'tosend' packets (clamped to sc->packets), cycling through
  * the send queue: each packet is dequeued, a copy is made by
  * ng_source_dup_mod() and transmitted, and the original is put back at
  * the tail of the queue.
  *
  * sc->packets is decreased by the number of completed iterations, and
  * the same number is stored through 'sent_p' when it is non-NULL.
  * Returns 0 or the first error encountered.
  */
 static int
 ng_source_send(sc_p sc, int tosend, int *sent_p)
 {
 	struct mbuf *m, *m2;
 	int sent;
 	int error = 0;
 
 	KASSERT(tosend >= 0, ("%s: negative tosend param", __func__));
 	KASSERT(sc->node->nd_flags & NG_SOURCE_ACTIVE,
 	    ("%s: inactive node", __func__));
 
 	/* Never send more than the number of packets still requested. */
 	if ((uint64_t)tosend > sc->packets)
 		tosend = sc->packets;
 
 	/* Go through the queue sending packets one by one. */
 	for (sent = 0; error == 0 && sent < tosend; ++sent) {
 		m = mbufq_dequeue(&sc->snd_queue);
 		if (m == NULL)
 			break;
 
 		/* Duplicate and modify the packet. */
 		error = ng_source_dup_mod(sc, m, &m2);
 		if (error) {
 			/*
 			 * Put the original back: at the head on ENOBUFS,
 			 * at the tail for any other error.
 			 */
 			if (error == ENOBUFS)
 				mbufq_prepend(&sc->snd_queue, m);
 			else
 				(void)mbufq_enqueue(&sc->snd_queue, m);
 			break;
 		}
 
 		/*
 		 * Re-enqueue the original packet for us.  The queue
 		 * has a free slot, because we dequeued the packet
 		 * above and this callout function runs under WRITER
 		 * lock.
 		 */
 		error = mbufq_enqueue(&sc->snd_queue, m);
 		KASSERT(error == 0, ("%s: re-enqueue packet failed", __func__));
 
 		sc->stats.outFrames++;
 		sc->stats.outOctets += m2->m_pkthdr.len;
 		NG_SEND_DATA_ONLY(error, sc->output, m2);
 		/*
 		 * Breaking here skips the loop's ++sent, so a packet whose
 		 * transmission failed is not counted in 'sent' even though
 		 * the out* statistics above were already updated.
 		 */
 		if (error)
 			break;
 	}
 
 	sc->packets -= sent;
 	if (sent_p != NULL)
 		*sent_p = sent;
 	return (error);
 }
 
 /*
  * Overwrite 'len' bytes of the packet in 'm', starting at 'offset', with
  * the data at 'cp'.
  *
  * The packet data in 'm' must be in a contiguous buffer in a single mbuf.
  * The request is silently ignored when 'len' is zero or when the region
  * would extend past the end of the mbuf's data.  'sc' and 'flags' are
  * currently unused.
  */
 static void
 ng_source_packet_mod(sc_p sc, struct mbuf *m, int offset, int len, caddr_t cp,
     int flags)
 {
 	/*
 	 * Nothing to write, or the write would run past the end of the
 	 * packet.
 	 * TODO: Pad packet for the latter case.
 	 */
 	if (len == 0 || offset + len > m->m_len)
 		return;
 
 	bcopy(cp, mtod_off(m, offset, caddr_t), len);
 }
 
 /*
  * Embed the current value of counter 'cnt' into packet 'm' and then
  * advance the counter by 'increment' (which may be zero or negative).
  *
  * The value is converted with htonl() and its trailing cnt->width bytes
  * are written at cnt->offset.  After stepping, a value that moved past
  * max_val (positive step) or below min_val (negative step) is wrapped
  * back into the [min_val, max_val] range.
  */
 static void
 ng_source_mod_counter(sc_p sc, struct ng_source_embed_cnt_info *cnt,
     struct mbuf *m, int increment)
 {
 	caddr_t cp;
 	uint32_t val;
 
 	val = htonl(cnt->next_val);
 	/* Point at the last 'width' bytes of the network-order value. */
 	cp = (caddr_t)&val + sizeof(val) - cnt->width;
 	ng_source_packet_mod(sc, m, cnt->offset, cnt->width, cp, cnt->flags);
 
 	if (increment) {
 		cnt->next_val += increment;
 
 		/*
 		 * NOTE(review): the comparisons below depend on the
 		 * signedness of next_val/min_val/max_val, which is not
 		 * visible here - confirm against the struct definition.
 		 */
 		if (increment > 0 && cnt->next_val > cnt->max_val) {
 			/* Overshot the top: carry the excess over from min_val. */
 			cnt->next_val = cnt->min_val - 1 +
 			    (cnt->next_val - cnt->max_val);
 			/* Still out of range after wrapping: clamp to max_val. */
 			if (cnt->next_val > cnt->max_val)
 				cnt->next_val = cnt->max_val;
 		} else if (increment < 0 && cnt->next_val < cnt->min_val) {
 			/* Undershot the bottom: carry the deficit over from max_val. */
 			cnt->next_val = cnt->max_val + 1 +
 			    (cnt->next_val - cnt->min_val);
 			/* Still out of range after wrapping: restart at max_val. */
 			if (cnt->next_val < cnt->min_val)
 				cnt->next_val = cnt->max_val;
 		}
 	}
 }
 
 /*
  * Produce the copy of 'm0' that will actually be transmitted and store
  * it in '*m_ptr'.
  *
  * When a timestamp or any counter is enabled for embedding, the packet
  * is duplicated with m_dup() and the enabled fields are written into the
  * copy; otherwise m_copypacket() is used and nothing is modified.
  *
  * Returns ENOBUFS if the copy cannot be allocated (in which case
  * '*m_ptr' is left untouched), 0 otherwise.
  */
 static int
 ng_source_dup_mod(sc_p sc, struct mbuf *m0, struct mbuf **m_ptr)
 {
 	struct ng_source_embed_cnt_info *cnt;
 	struct ng_source_embed_info *ts;
 	struct mbuf *copy;
 	int idx, step;
 	int modify;
 
 	/* Is any embedding enabled at all? */
 	modify = sc->embed_timestamp.flags & NGM_SOURCE_EMBED_ENABLE;
 	for (idx = 0; !modify && idx < NG_SOURCE_COUNTERS; ++idx)
 		modify = sc->embed_counter[idx].flags & NGM_SOURCE_EMBED_ENABLE;
 
 	/* Duplicate the packet; use m_dup() only when we are going to modify it. */
 	copy = modify ? m_dup(m0, M_NOWAIT) : m_copypacket(m0, M_NOWAIT);
 	if (copy == NULL)
 		return (ENOBUFS);
 	*m_ptr = copy;
 
 	if (!modify)
 		return (0);
 
 	/* Modify the copied packet for sending. */
 	KASSERT(M_WRITABLE(copy), ("%s: packet not writable", __func__));
 
 	for (idx = 0; idx < NG_SOURCE_COUNTERS; ++idx) {
 		cnt = &sc->embed_counter[idx];
 		if ((cnt->flags & NGM_SOURCE_EMBED_ENABLE) == 0)
 			continue;
 
 		/*
 		 * Per-list counters advance only on the last packet of
 		 * the list; all others advance on every packet.
 		 */
 		step = 0;
 		if ((cnt->flags & NGM_SOURCE_INC_CNT_PER_LIST) == 0 ||
 		    sc->last_packet == m0)
 			step = cnt->increment;
 		ng_source_mod_counter(sc, cnt, copy, step);
 	}
 
 	ts = &sc->embed_timestamp;
 	if (ts->flags & NGM_SOURCE_EMBED_ENABLE) {
 		struct timeval now;
 
 		getmicrotime(&now);
 		now.tv_sec = htonl(now.tv_sec);
 		now.tv_usec = htonl(now.tv_usec);
 		ng_source_packet_mod(sc, copy, ts->offset, sizeof (now),
 		    (caddr_t)&now, ts->flags);
 	}
 
 	return (0);
 }
diff --git a/sys/netinet/if_ether.c b/sys/netinet/if_ether.c
index 1c6c35f180aa..711f2ce153a8 100644
--- a/sys/netinet/if_ether.c
+++ b/sys/netinet/if_ether.c
@@ -1,1523 +1,1524 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if_ether.c	8.1 (Berkeley) 6/10/93
  */
 
 /*
  * Ethernet address resolution protocol.
  * TODO:
  *	add "inuse/lock" bit (or ref. count) along with valid bit
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/ethernet.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #ifdef INET
 #include <netinet/ip_carp.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 /* View a generic sockaddr pointer as a const IPv4 sockaddr_in pointer. */
 #define SIN(s) ((const struct sockaddr_in *)(s))
 
 /*
  * Rate-limiting state for ARP_LOG(): passed to ppsratecheck() there.
  * arp_maxpps is also exported below as the max_log_per_second sysctl.
  */
 static struct timeval arp_lastlog;
 static int arp_curpps;
 static int arp_maxpps = 1;
 
 /* Simple ARP state machine */
 enum arp_llinfo_state {
 	ARP_LLINFO_INCOMPLETE = 0, /* No LLE data */
 	ARP_LLINFO_REACHABLE,	/* LLE is valid */
 	ARP_LLINFO_VERIFY,	/* LLE is valid, need refresh */
 	ARP_LLINFO_DELETED,	/* LLE is deleted */
 };
 
 /* sysctl nodes "inet" and "arp" under _net_link_ether. */
 SYSCTL_DECL(_net_link_ether);
 static SYSCTL_NODE(_net_link_ether, PF_INET, inet,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "");
 static SYSCTL_NODE(_net_link_ether, PF_ARP, arp,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "");
 
 /* timer values */
 VNET_DEFINE_STATIC(int, arpt_keep) = (20*60);	/* once resolved, good for 20
 						 * minutes */
 VNET_DEFINE_STATIC(int, arp_maxtries) = 5;
 VNET_DEFINE_STATIC(int, arp_proxyall) = 0;
 VNET_DEFINE_STATIC(int, arpt_down) = 20;	/* keep incomplete entries for
 						 * 20 seconds */
 VNET_DEFINE_STATIC(int, arpt_rexmit) = 1;	/* retransmit arp entries, sec*/
 VNET_PCPUSTAT_DEFINE(struct arpstat, arpstat);  /* ARP statistics, see if_arp.h */
 VNET_PCPUSTAT_SYSINIT(arpstat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(arpstat);
 #endif /* VIMAGE */
 
 /* Exported below as "maxhold": number of packets to hold per ARP entry. */
 VNET_DEFINE_STATIC(int, arp_maxhold) = 16;
 
 /* Accessors for the VNET variables defined above. */
 #define	V_arpt_keep		VNET(arpt_keep)
 #define	V_arpt_down		VNET(arpt_down)
 #define	V_arpt_rexmit		VNET(arpt_rexmit)
 #define	V_arp_maxtries		VNET(arp_maxtries)
 #define	V_arp_proxyall		VNET(arp_proxyall)
 #define	V_arp_maxhold		VNET(arp_maxhold)
 
 /* sysctl knobs exposing the tunables and statistics above. */
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_age, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_keep), 0,
 	"ARP entry lifetime in seconds");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxtries, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxtries), 0,
 	"ARP resolution attempts before returning error");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, proxyall, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_proxyall), 0,
 	"Enable proxy ARP for all suitable requests");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, wait, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arpt_down), 0,
 	"Incomplete ARP entry lifetime in seconds");
 SYSCTL_VNET_PCPUSTAT(_net_link_ether_arp, OID_AUTO, stats, struct arpstat,
     arpstat, "ARP statistics (struct arpstat, net/if_arp.h)");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, maxhold, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_maxhold), 0,
 	"Number of packets to hold per ARP entry");
 /* Not CTLFLAG_VNET: arp_maxpps is a plain static, not a VNET variable. */
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, max_log_per_second,
 	CTLFLAG_RW, &arp_maxpps, 0,
 	"Maximum number of remotely triggered ARP messages that can be "
 	"logged per second");
 
 /*
  * Due to the exponential backoff algorithm used for the interval between GARP
  * retransmissions, the maximum number of retransmissions is limited for
  * sanity. This limit corresponds to a maximum interval between retransmissions
  * of 2^16 seconds ~= 18 hours.
  *
  * Making this limit more dynamic is more complicated than worthwhile,
  * especially since sending out GARPs spaced days apart would be of little
  * use. A maximum dynamic limit would look something like:
  *
  * const int max = fls(INT_MAX / hz) - 1;
  */
 #define MAX_GARP_RETRANSMITS 16
 static int sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS);
 static int garp_rexmit_count = 0; /* GARP retransmission setting. */
 
 /* garp_rexmit_count is exposed through the sysctl_garp_rexmit() handler. */
 SYSCTL_PROC(_net_link_ether_inet, OID_AUTO, garp_rexmit_count,
     CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_MPSAFE,
     &garp_rexmit_count, 0, sysctl_garp_rexmit, "I",
     "Number of times to retransmit GARP packets;"
     " 0 to disable, maximum of 16");
 
 VNET_DEFINE_STATIC(int, arp_log_level) = LOG_INFO;	/* Min. log(9) level. */
 #define	V_arp_log_level		VNET(arp_log_level)
 SYSCTL_INT(_net_link_ether_arp, OID_AUTO, log_level, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(arp_log_level), 0,
 	"Minimum log(9) level for recording rate limited arp log messages. "
 	"The higher will be log more (emerg=0, info=6 (default), debug=7).");
 /*
  * Rate-limited ARP logging.  The message is emitted, prefixed with
  * "arp: ", only when 'pri' is at or below V_arp_log_level and
  * ppsratecheck() on arp_lastlog/arp_curpps/arp_maxpps allows it.
  * The first variadic argument must be a string literal so that it can
  * be concatenated with the prefix.
  */
 #define	ARP_LOG(pri, ...)	do {					\
 	if ((pri) <= V_arp_log_level &&					\
 	    ppsratecheck(&arp_lastlog, &arp_curpps, arp_maxpps))	\
 		log((pri), "arp: " __VA_ARGS__);			\
 } while (0)
 
 /* Forward declarations of file-local handlers. */
 static void	arpintr(struct mbuf *);
 static void	arptimer(void *);
 #ifdef INET
 static void	in_arpinput(struct mbuf *);
 #endif
 
 static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr,
     struct ifnet *ifp, int bridged, struct llentry *la);
 static void arp_mark_lle_reachable(struct llentry *la, struct ifnet *ifp);
 static void arp_iflladdr(void *arg __unused, struct ifnet *ifp);
 
 static eventhandler_tag iflladdr_tag;
 
 /* netisr handler description for NETISR_ARP: packets go to arpintr(). */
 static const struct netisr_handler arp_nh = {
 	.nh_name = "arp",
 	.nh_handler = arpintr,
 	.nh_proto = NETISR_ARP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * Timeout routine.  Age arp_tab entries periodically.
  *
  * 'arg' is the llentry whose timer fired.  Static entries are left
  * alone.  Depending on the entry state the timer is either re-armed
  * (REACHABLE -> VERIFY transition, refresh request, or not yet expired)
  * or the entry is unlinked from its table and freed.
  */
 static void
 arptimer(void *arg)
 {
 	struct llentry *lle = (struct llentry *)arg;
 	struct ifnet *ifp;
 
 	/* Static entries never expire. */
 	if (lle->la_flags & LLE_STATIC) {
 		return;
 	}
 	LLE_WLOCK(lle);
 	if (callout_pending(&lle->lle_timer)) {
 		/*
 		 * We are a bit odd here in the treatment of
 		 * active/pending. If the pending bit is set, it got
 		 * rescheduled before I ran. The active
 		 * bit we ignore, since if it was stopped
 		 * in ll_tablefree() and was currently running
 		 * it would have returned 0 so the code would
 		 * not have deleted it since the callout could
 		 * not be stopped so we want to go through
 		 * with the delete here now. If the callout
 		 * was restarted, the pending bit will be back on and
 		 * we just want to bail since the callout_reset would
 		 * return 1 and our reference would have been removed
 		 * by arpresolve() below.
 		 */
 		LLE_WUNLOCK(lle);
  		return;
  	}
 	ifp = lle->lle_tbl->llt_ifp;
 	CURVNET_SET(ifp->if_vnet);
 
 	switch (lle->ln_state) {
 	case ARP_LLINFO_REACHABLE:
 
 		/*
 		 * Expiration time is approaching.
 		 * Request usage feedback from the datapath.
 		 * Change state and re-schedule ourselves.
 		 */
 		llentry_request_feedback(lle);
 		lle->ln_state = ARP_LLINFO_VERIFY;
 		callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 		LLE_WUNLOCK(lle);
 		CURVNET_RESTORE();
 		return;
 	case ARP_LLINFO_VERIFY:
 		if (llentry_get_hittime(lle) > 0 && lle->la_preempt > 0) {
 			/* Entry was used, issue refresh request */
 			struct epoch_tracker et;
 			struct in_addr dst;
 
 			/* Copy the address so it can be used after unlocking. */
 			dst = lle->r_l3addr.addr4;
 			lle->la_preempt--;
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			NET_EPOCH_ENTER(et);
 			arprequest(ifp, NULL, &dst, NULL);
 			NET_EPOCH_EXIT(et);
 			CURVNET_RESTORE();
 			return;
 		}
 		/* Nothing happened. Reschedule if not too late */
 		if (lle->la_expire > time_uptime) {
 			callout_schedule(&lle->lle_timer, hz * V_arpt_rexmit);
 			LLE_WUNLOCK(lle);
 			CURVNET_RESTORE();
 			return;
 		}
 		break;
 	case ARP_LLINFO_INCOMPLETE:
 	case ARP_LLINFO_DELETED:
 		break;
 	}
 
 	/* From here on the entry is being expired. */
 	if ((lle->la_flags & LLE_DELETED) == 0) {
 		int evt;
 
 		/* Valid entries "expire"; unresolved ones "time out". */
 		if (lle->la_flags & LLE_VALID)
 			evt = LLENTRY_EXPIRED;
 		else
 			evt = LLENTRY_TIMEDOUT;
 		EVENTHANDLER_INVOKE(lle_event, lle, evt);
 	}
 
 	callout_stop(&lle->lle_timer);
 
 	/* XXX: LOR avoidance. We still have ref on lle. */
 	LLE_WUNLOCK(lle);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(lle);
 
 	/* Guard against race with other llentry_free(). */
 	if (lle->la_flags & LLE_LINKED) {
 		LLE_REMREF(lle);
 		lltable_unlink_entry(lle->lle_tbl, lle);
 	}
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* llentry_free() reports how many held packets it dropped. */
 	size_t pkts_dropped = llentry_free(lle);
 
 	ARPSTAT_ADD(dropped, pkts_dropped);
 	ARPSTAT_INC(timeouts);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Build the link-layer header for sending ARP packet @ah on @ifp, in the
  * format suitable for if_output(), into @buf.
  *
  * On entry *@bufsize is the capacity of @buf (which is zeroed first); on
  * success it is updated to the resulting header length.  @bcast requests
  * a broadcast header.
  *
  * Returns 0 on success, otherwise the error from the interface's
  * if_requestencap() method.
  */
 static int
 arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf,
     size_t *bufsize)
 {
 	struct if_encap_req req;
 	int rc;
 
 	bzero(buf, *bufsize);
 
 	/* Describe the wanted encapsulation to the interface. */
 	bzero(&req, sizeof(req));
 	req.buf = buf;
 	req.bufsize = *bufsize;
 	req.rtype = IFENCAP_LL;
 	req.family = AF_ARP;
 	req.lladdr = ar_tha(ah);
 	req.hdata = (u_char *)ah;
 	req.flags = bcast ? IFENCAP_FLAG_BROADCAST : 0;
 
 	rc = ifp->if_requestencap(ifp, &req);
 	if (rc != 0)
 		return (rc);
 
 	*bufsize = req.bufsize;
 	return (0);
 }
 
 /*
  * Broadcast an ARP request. Caller specifies:
  *	- arp header source ip address
  *	- arp header target ip address
  *	- arp header source ethernet address
  *
  * When @sip is NULL a source address is picked from the interface's
  * AF_INET addresses; when @enaddr is NULL the CARP address found during
  * that search, or else the interface link-layer address, is used.
  *
  * Must be called inside the network epoch.  Returns 0 on success,
  * EADDRNOTAVAIL if no source address is available, ENOMEM if no mbuf
  * can be allocated, or the error from header construction or from
  * the interface's if_output().
  */
 static int
 arprequest_internal(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 	struct mbuf *m;
 	struct arphdr *ah;
 	struct sockaddr sa;
 	u_char *carpaddr = NULL;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	struct route ro;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	if (sip == NULL) {
 		/*
 		 * The caller did not supply a source address, try to find
 		 * a compatible one among those assigned to this interface.
 		 */
 		struct ifaddr *ifa;
 
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			if (ifa->ifa_carp) {
 				/* Skip CARP addresses for which we do not match. */
 				if ((*carp_iamatch_p)(ifa, &carpaddr) == 0)
 					continue;
 				sip = &IA_SIN(ifa)->sin_addr;
 			} else {
 				carpaddr = NULL;
 				sip = &IA_SIN(ifa)->sin_addr;
 			}
 
 			/* Stop at the first address on the target's subnet. */
 			if (0 == ((sip->s_addr ^ tip->s_addr) &
 			    IA_MASKSIN(ifa)->sin_addr.s_addr))
 				break;  /* found it. */
 		}
 		/*
 		 * If no subnet match was found, sip still holds the last
 		 * candidate examined; it is NULL only when the interface
 		 * had no usable AF_INET address at all.
 		 */
 		if (sip == NULL) {
 			printf("%s: cannot find matching address\n", __func__);
 			return (EADDRNOTAVAIL);
 		}
 	}
 	if (enaddr == NULL)
 		enaddr = carpaddr ? carpaddr : (u_char *)IF_LLADDR(ifp);
 
 	if ((m = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
 		return (ENOMEM);
 	/* ARP header plus two protocol and two hardware addresses. */
 	m->m_len = sizeof(*ah) + 2 * sizeof(struct in_addr) +
 		2 * ifp->if_addrlen;
 	m->m_pkthdr.len = m->m_len;
 	M_ALIGN(m, m->m_len);
 	ah = mtod(m, struct arphdr *);
 	bzero((caddr_t)ah, m->m_len);
 #ifdef MAC
 	mac_netinet_arp_send(ifp, m);
 #endif
 	ah->ar_pro = htons(ETHERTYPE_IP);
 	ah->ar_hln = ifp->if_addrlen;		/* hardware address length */
 	ah->ar_pln = sizeof(struct in_addr);	/* protocol address length */
 	ah->ar_op = htons(ARPOP_REQUEST);
 	bcopy(enaddr, ar_sha(ah), ah->ar_hln);
 	bcopy(sip, ar_spa(ah), ah->ar_pln);
 	bcopy(tip, ar_tpa(ah), ah->ar_pln);
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 
 	/* Calculate link header for sending frame */
 	bzero(&ro, sizeof(ro));
 	linkhdrsize = sizeof(linkhdr);
 	error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize);
 	/* EAFNOSUPPORT from the encap request is tolerated here. */
 	if (error != 0 && error != EAFNOSUPPORT) {
 		m_freem(m);
 		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
 		    if_name(ifp), error);
 		return (error);
 	}
 
 	/* Hand the precomputed link header to if_output() via the route. */
 	ro.ro_prepend = linkhdr;
 	ro.ro_plen = linkhdrsize;
 	ro.ro_flags = 0;
 
 	m->m_flags |= M_BCAST;
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	error = (*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txrequests);
 	if (error) {
 		ARPSTAT_INC(txerrors);
 		ARP_LOG(LOG_DEBUG, "Failed to send ARP packet on %s: %d\n",
 		    if_name(ifp), error);
 	}
 	return (error);
 }
 
 /*
  * Broadcast an ARP request, discarding the result.  Thin wrapper around
  * arprequest_internal() taking the same arguments.
  */
 void
 arprequest(struct ifnet *ifp, const struct in_addr *sip,
     const struct in_addr *tip, u_char *enaddr)
 {
 
 	(void) arprequest_internal(ifp, sip, tip, enaddr);
 }
 
 /*
  * Resolve an IP address into an ethernet address - heavy version.
  * Used internally by arpresolve().
  * We have already checked that we can't use an existing lle without
  * modification so we have to acquire an LLE_EXCLUSIVE lle lock.
  *
  * If LLE_CREATE is not set in flags, an existing entry is looked up
  * first; a new entry is allocated when none is found and the interface
  * is not marked IFF_NOARP or IFF_STATICARP.
  *
  * On success, desten and pflags are filled in and the function returns 0;
  * if plle is non-NULL it receives a referenced pointer to the entry.
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 static int
 arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
 	struct llentry **plle)
 {
 	struct llentry *la = NULL, *la_tmp;
 	int error, renew;
 	char *lladdr;
 	int ll_len;
 
 	NET_EPOCH_ASSERT();
 
 	if (pflags != NULL)
 		*pflags = 0;
 	if (plle != NULL)
 		*plle = NULL;
 
 	if ((flags & LLE_CREATE) == 0)
 		la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	if (la == NULL && (ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) == 0) {
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 			char addrbuf[INET_ADDRSTRLEN];
 
 			log(LOG_DEBUG,
 			    "arpresolve: can't allocate llinfo for %s on %s\n",
 			    inet_ntoa_r(SIN(dst)->sin_addr, addrbuf),
 			    if_name(ifp));
 			m_freem(m);
 			return (EINVAL);
 		}
 
 		/*
 		 * Re-check under the table lock: another thread may have
 		 * inserted an entry for dst in the meantime.
 		 */
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 		/* Prefer ANY existing lle over newly-created one */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 		if (la_tmp != NULL) {
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 		}
 	}
 	if (la == NULL) {
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	/* Usable entry: valid and either static or not yet expired. */
 	if ((la->la_flags & LLE_VALID) &&
 	    ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) {
 		/* Copy out either the bare address or the full link header. */
 		if (flags & LLE_ADDRONLY) {
 			lladdr = la->ll_addr;
 			ll_len = ifp->if_addrlen;
 		} else {
 			lladdr = la->r_linkdata;
 			ll_len = la->r_hdrlen;
 		}
 		bcopy(lladdr, desten, ll_len);
 
 		/* Notify LLE code that the entry was used by datapath */
 		llentry_provide_feedback(la);
 		if (pflags != NULL)
 			*pflags = la->la_flags & (LLE_VALID|LLE_IFADDR);
 		if (plle) {
 			LLE_ADDREF(la);
 			*plle = la;
 		}
 		LLE_WUNLOCK(la);
 		return (0);
 	}
 
 	/*
 	 * Send a (new) request if none was sent yet, or if the last one
 	 * was not sent during the current second.
 	 */
 	renew = (la->la_asked == 0 || la->la_expire != time_uptime);
 
 	/*
 	 * There is an arptab entry, but no ethernet address
 	 * response yet.  Add the mbuf to the list, dropping
 	 * the oldest packet if we have exceeded the system
 	 * setting.
 	 */
 	if (m != NULL) {
 		size_t dropped = lltable_append_entry_queue(la, m, V_arp_maxhold);
 		ARPSTAT_ADD(dropped, dropped);
 	}
 
 	/*
 	 * Return EWOULDBLOCK if we have tried less than arp_maxtries. It
 	 * will be masked by ether_output(). Return EHOSTDOWN/EHOSTUNREACH
 	 * if we have already sent arp_maxtries ARP requests. Retransmit the
 	 * ARP request, but not faster than one request per second.
 	 */
 	if (la->la_asked < V_arp_maxtries)
 		error = EWOULDBLOCK;	/* First request. */
 	else
 		error = is_gw != 0 ? EHOSTUNREACH : EHOSTDOWN;
 
 	if (renew) {
 		int canceled, e;
 
 		/*
 		 * Take a reference for the timer; drop it again if
 		 * callout_reset() reports that a pending timer was
 		 * replaced.
 		 */
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime;
 		canceled = callout_reset(&la->lle_timer, hz * V_arpt_down,
 		    arptimer, la);
 		if (canceled)
 			LLE_REMREF(la);
 		la->la_asked++;
 		LLE_WUNLOCK(la);
 		e = arprequest_internal(ifp, NULL, &SIN(dst)->sin_addr, NULL);
 		/*
 		 * Only overwrite 'error' in case of error; in case of success
 		 * the proper return value was already set above.
 		 */
 		if (e != 0)
 			return (e);
 		return (error);
 	}
 
 	LLE_WUNLOCK(la);
 	return (error);
 }
 
 /*
  * Looks up the link header based on an IP address.
  * On input:
  *    ifp is the interface we use
  *    is_gw != 0 if @dst represents gateway to some destination
  *    m is the mbuf. May be NULL if we don't have a packet.
  *    dst is the next hop,
  *    desten is the storage to put LL header.
  *    pflags returns subset of lle flags: LLE_VALID | LLE_IFADDR
  *    plle, if non-NULL, receives a referenced pointer to the entry used.
  *
  * Broadcast and multicast packets are answered directly without
  * consulting the table.
  *
  * On success, full/partial link header and flags are filled in and
  * the function returns 0.
  * If the packet must be held pending resolution, we return EWOULDBLOCK
  * On other errors, we return the corresponding error code.
  * Note that m_freem() handles NULL.
  */
 int
 arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m,
 	const struct sockaddr *dst, u_char *desten, uint32_t *pflags,
 	struct llentry **plle)
 {
 	struct llentry *la = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	if (pflags != NULL)
 		*pflags = 0;
 	if (plle != NULL)
 		*plle = NULL;
 
 	if (m != NULL) {
 		if (m->m_flags & M_BCAST) {
 			/* broadcast */
 			(void)memcpy(desten,
 			    ifp->if_broadcastaddr, ifp->if_addrlen);
 			return (0);
 		}
 		if (m->m_flags & M_MCAST) {
 			/* multicast */
 			ETHER_MAP_IP_MULTICAST(&SIN(dst)->sin_addr, desten);
 			return (0);
 		}
 	}
 
 	/*
 	 * Fast path: the entry is only locked when the caller wants it
 	 * returned through plle.
 	 */
 	la = lla_lookup(LLTABLE(ifp), plle ? LLE_EXCLUSIVE : LLE_UNLOCKED, dst);
 	if (la != NULL && (la->r_flags & RLLE_VALID) != 0) {
 		/* Entry found, let's copy lle info */
 		bcopy(la->r_linkdata, desten, la->r_hdrlen);
 		if (pflags != NULL)
 			*pflags = LLE_VALID | (la->r_flags & RLLE_IFADDR);
 		/* Notify the LLE handling code that the entry was used. */
 		llentry_provide_feedback(la);
 		if (plle) {
 			LLE_ADDREF(la);
 			*plle = la;
 			LLE_WUNLOCK(la);
 		}
 		return (0);
 	}
 	/* Drop the lock taken by the exclusive lookup before the slow path. */
 	if (plle && la)
 		LLE_WUNLOCK(la);
 
 	/* Slow path: ask for creation only when the lookup found nothing. */
 	return (arpresolve_full(ifp, is_gw, la == NULL ? LLE_CREATE : 0, m, dst,
 	    desten, pflags, plle));
 }
 
 /*
  * Common length and type checks for a received ARP packet are done
  * here, then the protocol-specific routine is called.
  *
  * The mbuf is not used again after this function returns on any path:
  * it is either passed to in_arpinput(), released with m_freem(), or
  * abandoned after m_pullup() returned NULL.
  */
 static void
 arpintr(struct mbuf *m)
 {
 	struct arphdr *ar;
 	struct ifnet *ifp;
 	char *layer;
 	int hlen;
 
 	/* Remember the interface now; m may be gone after m_pullup(). */
 	ifp = m->m_pkthdr.rcvif;
 
 	/* Make the fixed part of the ARP header contiguous. */
 	if (m->m_len < sizeof(struct arphdr) &&
 	    ((m = m_pullup(m, sizeof(struct arphdr))) == NULL)) {
 		ARP_LOG(LOG_NOTICE, "packet with short header received on %s\n",
 		    if_name(ifp));
 		return;
 	}
 	ar = mtod(m, struct arphdr *);
 
 	/* Check if length is sufficient */
 	if (m->m_len <  arphdr_len(ar)) {
 		m = m_pullup(m, arphdr_len(ar));
 		if (m == NULL) {
 			ARP_LOG(LOG_NOTICE, "short packet received on %s\n",
 			    if_name(ifp));
 			return;
 		}
 		/* The data pointer may have changed; reload the header. */
 		ar = mtod(m, struct arphdr *);
 	}
 
 	/*
 	 * Determine the hardware address length expected for the
 	 * advertised hardware type; hlen == 0 means "do not check".
 	 */
 	hlen = 0;
 	layer = "";
 	switch (ntohs(ar->ar_hrd)) {
 	case ARPHRD_ETHER:
 		hlen = ETHER_ADDR_LEN; /* RFC 826 */
 		layer = "ethernet";
 		break;
 	case ARPHRD_INFINIBAND:
 		hlen = 20;	/* RFC 4391, INFINIBAND_ALEN */
 		layer = "infiniband";
 		break;
 	case ARPHRD_IEEE1394:
 		hlen = 0; /* SHALL be 16 */ /* RFC 2734 */
 		layer = "firewire";
 
 		/*
 		 * Restrict too long hardware addresses.
 		 * Currently we are capable of handling 20-byte
 		 * addresses ( sizeof(lle->ll_addr) )
 		 *
 		 * Setting hlen to 16 here makes the length comparison
 		 * below fail, so such packets are dropped.
 		 */
 		if (ar->ar_hln >= 20)
 			hlen = 16;
 		break;
 	default:
 		/*
 		 * The value is announced with a "0x" prefix, so it has to
 		 * be printed in hexadecimal.
 		 */
 		ARP_LOG(LOG_NOTICE,
 		    "packet with unknown hardware format 0x%02x received on "
 		    "%s\n", ntohs(ar->ar_hrd), if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	if (hlen != 0 && hlen != ar->ar_hln) {
 		ARP_LOG(LOG_NOTICE,
 		    "packet with invalid %s address length %d received on %s\n",
 		    layer, ar->ar_hln, if_name(ifp));
 		m_freem(m);
 		return;
 	}
 
 	ARPSTAT_INC(received);
 	/* Dispatch on protocol type; unhandled types are freed below. */
 	switch (ntohs(ar->ar_pro)) {
 #ifdef INET
 	case ETHERTYPE_IP:
 		in_arpinput(m);
 		return;
 #endif
 	}
 	m_freem(m);
 }
 
 #ifdef INET
 /*
  * ARP for Internet protocols on 10 Mb/s Ethernet.
  * Algorithm is that given in RFC 826.
  * In addition, a sanity check is performed on the sender
  * protocol address, to catch impersonators.
  * We no longer handle negotiations for use of trailer protocol:
  * Formerly, ARP replied for protocol type ETHERTYPE_TRAIL sent
  * along with IP replies if we wanted trailers sent to us,
  * and also sent them in response to IP replies.
  * This allowed either end to announce the desire to receive
  * trailer packets.
  * We no longer reply to requests for ETHERTYPE_TRAIL protocol either,
  * but formerly didn't normally send requests.
  */
 /*
  * Run-time tunables for ARP input handling, exported read-write under
  * the _net_link_ether_inet sysctl node.  The three log_* knobs default
  * to enabled; allow_multicast defaults to disabled.
  */
 static int log_arp_wrong_iface = 1;
 static int log_arp_movements = 1;
 static int log_arp_permanent_modify = 1;
 /* When 0, in_arpinput() drops packets with a multicast sender address. */
 static int allow_multicast = 0;
 
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_wrong_iface, CTLFLAG_RW,
 	&log_arp_wrong_iface, 0,
 	"log arp packets arriving on the wrong interface");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_movements, CTLFLAG_RW,
 	&log_arp_movements, 0,
 	"log arp replies from MACs different than the one in the cache");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, log_arp_permanent_modify, CTLFLAG_RW,
 	&log_arp_permanent_modify, 0,
 	"log arp replies from MACs different than the one in the permanent arp entry");
 SYSCTL_INT(_net_link_ether_inet, OID_AUTO, allow_multicast, CTLFLAG_RW,
 	&allow_multicast, 0, "accept multicast addresses");
 
 /*
  * Process one received ARP packet carrying IPv4 addresses.
  *
  * The function selects the local address the packet relates to,
  * validates the sender hardware address, updates or creates the
  * link-layer entry for the sender and, for requests it should answer,
  * rewrites the packet in place into a reply and passes it to the
  * interface output routine.  On every other path the mbuf is freed at
  * the 'drop' label.
  */
 static void
 in_arpinput(struct mbuf *m)
 {
 	struct arphdr *ah;
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct llentry *la = NULL, *la_tmp;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	struct sockaddr sa;
 	struct in_addr isaddr, itaddr, myaddr;
 	u_int8_t *enaddr = NULL;
 	int op;
 	int bridged = 0, is_bridge = 0;
 	int carped;
 	struct sockaddr_in sin;
 	struct sockaddr *dst;
 	struct nhop_object *nh;
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	struct route ro;
 	size_t linkhdrsize;
 	int lladdr_off;
 	int error;
 	char addrbuf[INET_ADDRSTRLEN];
 
 	NET_EPOCH_ASSERT();
 
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = 0;
 
 	/* bridged: ifp is a bridge member; is_bridge: ifp is the bridge. */
 	if (ifp->if_bridge)
 		bridged = 1;
 	if (ifp->if_type == IFT_BRIDGE)
 		is_bridge = 1;
 
 	/*
 	 * We already have checked that mbuf contains enough contiguous data
 	 * to hold entire arp message according to the arp header.
 	 */
 	ah = mtod(m, struct arphdr *);
 
 	/*
 	 * ARP is only for IPv4 so we can reject packets with
 	 * a protocol length not equal to an IPv4 address.
 	 */
 	if (ah->ar_pln != sizeof(struct in_addr)) {
 		ARP_LOG(LOG_NOTICE, "requested protocol length != %zu\n",
 		    sizeof(struct in_addr));
 		goto drop;
 	}
 
 	if (allow_multicast == 0 && ETHER_IS_MULTICAST(ar_sha(ah))) {
 		ARP_LOG(LOG_NOTICE, "%*D is multicast\n",
 		    ifp->if_addrlen, (u_char *)ar_sha(ah), ":");
 		goto drop;
 	}
 
 	/* isaddr: sender protocol address; itaddr: target protocol address. */
 	op = ntohs(ah->ar_op);
 	(void)memcpy(&isaddr, ar_spa(ah), sizeof (isaddr));
 	(void)memcpy(&itaddr, ar_tpa(ah), sizeof (itaddr));
 
 	if (op == ARPOP_REPLY)
 		ARPSTAT_INC(rxreplies);
 
 	/*
 	 * For a bridge, we want to check the address irrespective
 	 * of the receive interface. (This will change slightly
 	 * when we have clusters of interfaces).
 	 */
 	CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    itaddr.s_addr == ia->ia_addr.sin_addr.s_addr &&
 		    (ia->ia_ifa.ifa_carp == NULL ||
 		    (*carp_iamatch_p)(&ia->ia_ifa, &enaddr))) {
 			ifa_ref(&ia->ia_ifa);
 			goto match;
 		}
 	}
 	/* No local address matches the target; try the sender address. */
 	CK_LIST_FOREACH(ia, INADDR_HASH(isaddr.s_addr), ia_hash)
 		if (((bridged && ia->ia_ifp->if_bridge == ifp->if_bridge) ||
 		    ia->ia_ifp == ifp) &&
 		    isaddr.s_addr == ia->ia_addr.sin_addr.s_addr) {
 			ifa_ref(&ia->ia_ifa);
 			goto match;
 		}
 
 #define BDG_MEMBER_MATCHES_ARP(addr, ifp, ia)				\
   (ia->ia_ifp->if_bridge == ifp->if_softc &&				\
   !bcmp(IF_LLADDR(ia->ia_ifp), IF_LLADDR(ifp), ifp->if_addrlen) &&	\
   addr == ia->ia_addr.sin_addr.s_addr)
 	/*
 	 * Check the case when bridge shares its MAC address with
 	 * some of its children, so packets are claimed by bridge
 	 * itself (bridge_input() does it first), but they are really
 	 * meant to be destined to the bridge member.
 	 */
 	if (is_bridge) {
 		CK_LIST_FOREACH(ia, INADDR_HASH(itaddr.s_addr), ia_hash) {
 			if (BDG_MEMBER_MATCHES_ARP(itaddr.s_addr, ifp, ia)) {
 				ifa_ref(&ia->ia_ifa);
 				/* Continue as if received on the member. */
 				ifp = ia->ia_ifp;
 				goto match;
 			}
 		}
 	}
 #undef BDG_MEMBER_MATCHES_ARP
 
 	/*
 	 * No match, use the first inet address on the receive interface
 	 * as a dummy address for the rest of the function.
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    (ifa->ifa_carp == NULL ||
 		    (*carp_iamatch_p)(ifa, &enaddr))) {
 			ia = ifatoia(ifa);
 			ifa_ref(ifa);
 			goto match;
 		}
 
 	/*
 	 * If bridging, fall back to using any inet address.
 	 */
 	if (!bridged || (ia = CK_STAILQ_FIRST(&V_in_ifaddrhead)) == NULL)
 		goto drop;
 	ifa_ref(&ia->ia_ifa);
 match:
 	/*
 	 * Every jump to 'match' holds a reference on ia; copy out what is
 	 * needed and release it right away.
 	 */
 	if (!enaddr)
 		enaddr = (u_int8_t *)IF_LLADDR(ifp);
 	carped = (ia->ia_ifa.ifa_carp != NULL);
 	myaddr = ia->ia_addr.sin_addr;
 	ifa_free(&ia->ia_ifa);
 	if (!bcmp(ar_sha(ah), enaddr, ifp->if_addrlen))
 		goto drop;	/* it's from me, ignore it. */
 	if (!bcmp(ar_sha(ah), ifp->if_broadcastaddr, ifp->if_addrlen)) {
 		ARP_LOG(LOG_NOTICE, "link address is broadcast for IP address "
 		    "%s!\n", inet_ntoa_r(isaddr, addrbuf));
 		goto drop;
 	}
 
 	if (ifp->if_addrlen != ah->ar_hln) {
 		ARP_LOG(LOG_WARNING, "from %*D: addr len: new %d, "
 		    "i/f %d (ignored)\n", ifp->if_addrlen,
 		    (u_char *) ar_sha(ah), ":", ah->ar_hln,
 		    ifp->if_addrlen);
 		goto drop;
 	}
 
 	/*
 	 * Warn if another host is using the same IP address, but only if the
 	 * IP address isn't 0.0.0.0, which is used for DHCP only, in which
 	 * case we suppress the warning to avoid false positive complaints of
 	 * potential misconfiguration.
 	 */
 	if (!bridged && !carped && isaddr.s_addr == myaddr.s_addr &&
 	    myaddr.s_addr != 0) {
 		ARP_LOG(LOG_ERR, "%*D is using my IP address %s on %s!\n",
 		   ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 		   inet_ntoa_r(isaddr, addrbuf), ifp->if_xname);
 		itaddr = myaddr;
 		ARPSTAT_INC(dupips);
 		goto reply;
 	}
 	/* With IFF_STATICARP set the cache is not touched by input. */
 	if (ifp->if_flags & IFF_STATICARP)
 		goto reply;
 
 	/* Look up the entry for the sender's protocol address. */
 	bzero(&sin, sizeof(sin));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_addr = isaddr;
 	dst = (struct sockaddr *)&sin;
 	la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	if (la != NULL)
 		arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 	else if (itaddr.s_addr == myaddr.s_addr) {
 		/*
 		 * Request/reply to our address, but no lle exists yet.
 		 * Calculate full link prepend to use in lle.
 		 */
 		linkhdrsize = sizeof(linkhdr);
 		if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
 		    &linkhdrsize, &lladdr_off) != 0)
 			goto reply;
 
 		/* Allocate new entry */
 		la = lltable_alloc_entry(LLTABLE(ifp), 0, dst);
 		if (la == NULL) {
 			/*
 			 * lle creation may fail if source address belongs
 			 * to non-directly connected subnet. However, we
 			 * will try to answer the request instead of dropping
 			 * frame.
 			 */
 			goto reply;
 		}
 		lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
 		    lladdr_off);
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(la);
 		la_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 
 		/*
 		 * Check that the lle still does not exist.
 		 * If it does, that means that we either
 		 * 1) have configured it explicitly, via
 		 * 1a) 'arp -s' static entry or
 		 * 1b) interface address static record
 		 * or
 		 * 2) it was the result of sending first packet to-host
 		 * or
 		 * 3) it was another arp reply packet we handled in
 		 * different thread.
 		 *
 		 * In all cases except 3) we definitely need to prefer
 		 * existing lle. For the sake of simplicity, prefer any
 		 * existing lle over the newly created one.
 		 */
 		if (la_tmp == NULL)
 			lltable_link_entry(LLTABLE(ifp), la);
 		IF_AFDATA_WUNLOCK(ifp);
 
 		if (la_tmp == NULL) {
 			arp_mark_lle_reachable(la, ifp);
 			LLE_WUNLOCK(la);
 		} else {
 			/* Free the newly created entry and handle packet */
 			lltable_free_entry(LLTABLE(ifp), la);
 			la = la_tmp;
 			la_tmp = NULL;
 			arp_check_update_lle(ah, isaddr, ifp, bridged, la);
 			/* arp_check_update_lle() returns @la unlocked */
 		}
 		la = NULL;
 	}
 reply:
 	/* Only requests are answered; anything else is done at this point. */
 	if (op != ARPOP_REQUEST)
 		goto drop;
 	ARPSTAT_INC(rxrequests);
 
 	if (itaddr.s_addr == myaddr.s_addr) {
 		/* Shortcut.. the receiving interface is the target. */
 		(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 		(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 	} else {
 		/*
 		 * Destination address is not ours. Check if
 		 * proxyarp entry exists or proxyarp is turned on globally.
 		 */
 		struct llentry *lle;
 
 		sin.sin_addr = itaddr;
 		lle = lla_lookup(LLTABLE(ifp), 0, (struct sockaddr *)&sin);
 
 		if ((lle != NULL) && (lle->la_flags & LLE_PUB)) {
 			/* Published entry: answer with its link address. */
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln);
 			LLE_RUNLOCK(lle);
 		} else {
 			if (lle != NULL)
 				LLE_RUNLOCK(lle);
 
 			if (!V_arp_proxyall)
 				goto drop;
 
 			NET_EPOCH_ASSERT();
 			nh = fib4_lookup(ifp->if_fib, itaddr, 0, 0, 0);
 			if (nh == NULL)
 				goto drop;
 
 			/*
 			 * Don't send proxies for nodes on the same interface
 			 * as this one came out of, or we'll get into a fight
 			 * over who claims what Ether address.
 			 */
 			if (nh->nh_ifp == ifp)
 				goto drop;
 
 			(void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln);
 			(void)memcpy(ar_sha(ah), enaddr, ah->ar_hln);
 
 			/*
 			 * Also check that the node which sent the ARP packet
 			 * is on the interface we expect it to be on. This
 			 * avoids ARP chaos if an interface is connected to the
 			 * wrong network.
 			 */
 
 			nh = fib4_lookup(ifp->if_fib, isaddr, 0, 0, 0);
 			if (nh == NULL)
 				goto drop;
 			if (nh->nh_ifp != ifp) {
 				ARP_LOG(LOG_INFO, "proxy: ignoring request"
 				    " from %s via %s\n",
 				    inet_ntoa_r(isaddr, addrbuf),
 				    ifp->if_xname);
 				goto drop;
 			}
 
 #ifdef DEBUG_PROXY
 			printf("arp: proxying for %s\n",
 			    inet_ntoa_r(itaddr, addrbuf));
 #endif
 		}
 	}
 
 	if (itaddr.s_addr == myaddr.s_addr &&
 	    IN_LINKLOCAL(ntohl(itaddr.s_addr))) {
 		/* RFC 3927 link-local IPv4; always reply by broadcast. */
 #ifdef DEBUG_LINKLOCAL
 		printf("arp: sending reply for link-local addr %s\n",
 		    inet_ntoa_r(itaddr, addrbuf));
 #endif
 		m->m_flags |= M_BCAST;
 		m->m_flags &= ~M_MCAST;
 	} else {
 		/* default behaviour; never reply by broadcast. */
 		m->m_flags &= ~(M_BCAST|M_MCAST);
 	}
 	/* Turn the request into a reply in place and trim the mbuf to it. */
 	(void)memcpy(ar_tpa(ah), ar_spa(ah), ah->ar_pln);
 	(void)memcpy(ar_spa(ah), &itaddr, ah->ar_pln);
 	ah->ar_op = htons(ARPOP_REPLY);
 	ah->ar_pro = htons(ETHERTYPE_IP); /* let's be sure! */
 	m->m_len = sizeof(*ah) + (2 * ah->ar_pln) + (2 * ah->ar_hln);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = NULL;
 	sa.sa_family = AF_ARP;
 	sa.sa_len = 2;
 
 	/* Calculate link header for sending frame */
 	bzero(&ro, sizeof(ro));
 	linkhdrsize = sizeof(linkhdr);
 	error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize);
 
 	/*
 	 * arp_fillheader() may fail due to lack of support inside encap request
 	 * routing. This is not necessarily an error, AF_ARP can/should be
 	 * handled by if_output().
 	 */
 	if (error != 0 && error != EAFNOSUPPORT) {
 		ARP_LOG(LOG_ERR, "Failed to calculate ARP header on %s: %d\n",
 		    if_name(ifp), error);
 		goto drop;
 	}
 
 	ro.ro_prepend = linkhdr;
 	ro.ro_plen = linkhdrsize;
 	ro.ro_flags = 0;
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	(*ifp->if_output)(ifp, m, &sa, &ro);
 	ARPSTAT_INC(txreplies);
 	return;
 
 drop:
 	m_freem(m);
 }
 #endif
 
 /*
  * Detach the list of packets held on @la and return its head, leaving
  * the entry with an empty hold queue.  The entry must be write-locked.
  */
 static struct mbuf *
 arp_grab_holdchain(struct llentry *la)
 {
 	struct mbuf *held;
 
 	LLE_WLOCK_ASSERT(la);
 
 	held = la->la_hold;
 
 	/* Reset the queue bookkeeping now that the caller owns the list. */
 	la->la_numheld = 0;
 	la->la_hold = NULL;
 
 	return (held);
 }
 
 /*
  * Send every packet of @chain (linked through m_nextpkt) out of @ifp,
  * using the link-layer header stored in @la as the route prepend data.
  */
 static void
 arp_flush_holdchain(struct ifnet *ifp, struct llentry *la, struct mbuf *chain)
 {
 	struct sockaddr_in sin;
 	struct mbuf *pkt;
 
 	NET_EPOCH_ASSERT();
 
 	struct route ro = {
 		.ro_plen = la->r_hdrlen,
 		.ro_prepend = la->r_linkdata,
 	};
 
 	/* Destination address handed to the output routine. */
 	lltable_fill_sa_entry(la, (struct sockaddr *)&sin);
 
 	while (chain != NULL) {
 		/* Unlink the head packet before giving it away. */
 		pkt = chain;
 		chain = pkt->m_nextpkt;
 		pkt->m_nextpkt = NULL;
 
 		/* Avoid confusing lower layers. */
 		m_clrprotoflags(pkt);
 		(*ifp->if_output)(ifp, pkt, (struct sockaddr *)&sin, &ro);
 	}
 }
 
 /*
  * Checks received arp data against existing @la.
  * Updates lle state/performs notification if necessary.
  *
  * @la must be write-locked by the caller; it is unlocked on every
  * return path of this function.
  */
 static void
 arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp,
     int bridged, struct llentry *la)
 {
 	uint8_t linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 	char addrbuf[INET_ADDRSTRLEN];
 
 	LLE_WLOCK_ASSERT(la);
 
 	/* the following is not an error when doing bridging */
 	if (!bridged && la->lle_tbl->llt_ifp != ifp) {
 		if (log_arp_wrong_iface)
 			ARP_LOG(LOG_WARNING, "%s is on %s "
 			    "but got reply from %*D on %s\n",
 			    inet_ntoa_r(isaddr, addrbuf),
 			    la->lle_tbl->llt_ifp->if_xname,
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		LLE_WUNLOCK(la);
 		return;
 	}
 	/* A valid entry whose link address differs from the sender's. */
 	if ((la->la_flags & LLE_VALID) &&
 	    bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) {
 		if (la->la_flags & LLE_STATIC) {
 			/* Static entries are never overwritten by input. */
 			LLE_WUNLOCK(la);
 			if (log_arp_permanent_modify)
 				ARP_LOG(LOG_ERR,
 				    "%*D attempts to modify "
 				    "permanent entry for %s on %s\n",
 				    ifp->if_addrlen,
 				    (u_char *)ar_sha(ah), ":",
 				    inet_ntoa_r(isaddr, addrbuf),
 				    ifp->if_xname);
 			return;
 		}
 		if (log_arp_movements) {
 			ARP_LOG(LOG_INFO, "%s moved from %*D "
 			    "to %*D on %s\n",
 			    inet_ntoa_r(isaddr, addrbuf),
 			    ifp->if_addrlen,
 			    (u_char *)la->ll_addr, ":",
 			    ifp->if_addrlen, (u_char *)ar_sha(ah), ":",
 			    ifp->if_xname);
 		}
 	}
 
 	/* Calculate full link prepend to use in lle */
 	linkhdrsize = sizeof(linkhdr);
 	if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr,
 	    &linkhdrsize, &lladdr_off) != 0) {
 		LLE_WUNLOCK(la);
 		return;
 	}
 
 	/* Check if something has changed */
 	if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 ||
 	    (la->la_flags & LLE_VALID) == 0) {
 		/* Try to perform LLE update */
 		if (lltable_try_set_entry_addr(ifp, la, linkhdr, linkhdrsize,
 		    lladdr_off) == 0) {
 			/* The update was not performed; give up on the entry. */
 			LLE_WUNLOCK(la);
 			return;
 		}
 
 		/* Clear fast path feedback request if set */
 		llentry_mark_used(la);
 	}
 
 	arp_mark_lle_reachable(la, ifp);
 
 	/*
 	 * The packets are all freed within the call to the output
 	 * routine.
 	 *
 	 * NB: The lock MUST be released before the call to the
 	 * output routine.
 	 */
 	if (la->la_hold != NULL) {
 		struct mbuf *chain;
 
 		/* Detach the held packets under the lock, send them after. */
 		chain = arp_grab_holdchain(la);
 		LLE_WUNLOCK(la);
 		arp_flush_holdchain(ifp, la, chain);
 	} else
 		LLE_WUNLOCK(la);
 }
 
 /*
  * Put the write-locked entry @la into the reachable state, notify
  * lle_event listeners and, unless the entry is static, (re)arm its
  * timer.  The retry counters of the entry are reset as well.
  */
 static void
 arp_mark_lle_reachable(struct llentry *la, struct ifnet *ifp)
 {
 	int delay, was_pending;
 
 	LLE_WLOCK_ASSERT(la);
 
 	la->ln_state = ARP_LLINFO_REACHABLE;
 	EVENTHANDLER_INVOKE(lle_event, la, LLENTRY_RESOLVED);
 
 	/* On IFF_STICKYARP interfaces learned entries become static. */
 	if (ifp->if_flags & IFF_STICKYARP)
 		la->la_flags |= LLE_STATIC;
 
 	if ((la->la_flags & LLE_STATIC) == 0) {
 		/* Reference taken on behalf of the timer armed below. */
 		LLE_ADDREF(la);
 		la->la_expire = time_uptime + V_arpt_keep;
 
 		delay = V_arpt_keep - V_arp_maxtries * V_arpt_rexmit;
 		if (delay < 0)
 			delay = V_arpt_keep;
 
 		was_pending = callout_reset(&la->lle_timer, hz * delay,
 		    arptimer, la);
 		/* A replaced pending timer already held a reference. */
 		if (was_pending != 0)
 			LLE_REMREF(la);
 	}
 
 	la->la_preempt = V_arp_maxtries;
 	la->la_asked = 0;
 }
 
 /*
  * Add permanent link-layer record for given interface address.
  *
  * Any entry already present for @dst is unlinked from the table and
  * freed; the new entry is created with LLE_IFADDR | LLE_STATIC.
  */
 static __noinline void
 arp_add_ifa_lle(struct ifnet *ifp, const struct sockaddr *dst)
 {
 	struct llentry *lle, *lle_tmp;
 
 	/*
 	 * Interface address LLE record is considered static
 	 * because kernel code relies on LLE_STATIC flag to check
 	 * if these entries can be rewritten by arp updates.
 	 */
 	lle = lltable_alloc_entry(LLTABLE(ifp), LLE_IFADDR | LLE_STATIC, dst);
 	if (lle == NULL) {
 		log(LOG_INFO, "arp_ifinit: cannot create arp "
 		    "entry for interface address\n");
 		return;
 	}
 
 	/* The swap of old and new entry happens under the afdata lock. */
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(lle);
 	/* Unlink any entry if exists */
 	lle_tmp = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst);
 	if (lle_tmp != NULL)
 		lltable_unlink_entry(LLTABLE(ifp), lle_tmp);
 
 	lltable_link_entry(LLTABLE(ifp), lle);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* Notify listeners: the old entry is gone, the new one is resolved. */
 	if (lle_tmp != NULL)
 		EVENTHANDLER_INVOKE(lle_event, lle_tmp, LLENTRY_EXPIRED);
 
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_RESOLVED);
 	LLE_WUNLOCK(lle);
 	if (lle_tmp != NULL)
 		lltable_free_entry(LLTABLE(ifp), lle_tmp);
 }
 
 /*
  * Handle the garp_rexmit_count. Like sysctl_handle_int(), but limits the range
  * of valid values.
  */
 static int
 sysctl_garp_rexmit(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int rexmit_count = *(int *)arg1;
 
 	error = sysctl_handle_int(oidp, &rexmit_count, 0, req);
 
 	/* Enforce limits on any new value that may have been set. */
 	if (!error && req->newptr) {
 		/* A new value was set. */
 		if (rexmit_count < 0) {
 			rexmit_count = 0;
 		} else if (rexmit_count > MAX_GARP_RETRANSMITS) {
 			rexmit_count = MAX_GARP_RETRANSMITS;
 		}
 		*(int *)arg1 = rexmit_count;
 	}
 
 	return (error);
 }
 
 /*
  * Retransmit a Gratuitous ARP (GARP) and, if necessary, schedule a callout to
  * retransmit it again. A pending callout owns a reference to the ifa.
  */
 static void
 garp_rexmit(void *arg)
 {
 	struct in_ifaddr *ia = arg;
 
 	/*
 	 * The timer was rescheduled or stopped after this invocation was
 	 * dispatched: do nothing except release the lock and the ifa
 	 * reference.
 	 *
 	 * NOTE(review): the address lock is released here without being
 	 * acquired in this function; presumably the callout is set up to
 	 * run with that lock held -- confirm where ia_garp_timer is
 	 * initialized.
 	 */
 	if (callout_pending(&ia->ia_garp_timer) ||
 	    !callout_active(&ia->ia_garp_timer)) {
 		IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
 		ifa_free(&ia->ia_ifa);
 		return;
 	}
 
 	CURVNET_SET(ia->ia_ifa.ifa_ifp->if_vnet);
 
 	/*
 	 * Drop lock while the ARP request is generated.
 	 */
 	IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
 
 	/* Sender and target address are the same: a gratuitous ARP. */
 	arprequest(ia->ia_ifa.ifa_ifp, &IA_SIN(ia)->sin_addr,
 	    &IA_SIN(ia)->sin_addr, IF_LLADDR(ia->ia_ifa.ifa_ifp));
 
 	/*
 	 * Increment the count of retransmissions. If the count has reached the
 	 * maximum value, stop sending the GARP packets. Otherwise, schedule
 	 * the callout to retransmit another GARP packet.
 	 */
 	++ia->ia_garp_count;
 	if (ia->ia_garp_count >= garp_rexmit_count) {
 		/* Done: release the reference owned by the callout. */
 		ifa_free(&ia->ia_ifa);
 	} else {
 		int rescheduled;
 		IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
 		/* The delay doubles with every retransmission. */
 		rescheduled = callout_reset(&ia->ia_garp_timer,
 		    (1 << ia->ia_garp_count) * hz,
 		    garp_rexmit, ia);
 		IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
 		/*
 		 * A non-zero return means a pending callout was replaced;
 		 * it already owned a reference, so drop the surplus one.
 		 */
 		if (rescheduled) {
 			ifa_free(&ia->ia_ifa);
 		}
 	}
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Start the GARP retransmit timer.
  *
  * A single GARP is always transmitted when an IPv4 address is added
  * to an interface and that is usually sufficient. However, in some
  * circumstances, such as when a shared address is passed between
  * cluster nodes, this single GARP may occasionally be dropped or
  * lost. This can lead to neighbors on the network link working with a
  * stale ARP cache and sending packets destined for that address to
  * the node that previously owned the address, which may not respond.
  *
  * To avoid this situation, GARP retransmits can be enabled by setting
  * the net.link.ether.inet.garp_rexmit_count sysctl to a value greater
  * than zero. The setting represents the maximum number of
  * retransmissions. The interval between retransmissions is calculated
  * using an exponential backoff algorithm, doubling each time, so the
  * retransmission intervals are: {1, 2, 4, 8, 16, ...} (seconds).
  */
 static void
 garp_timer_start(struct ifaddr *ifa)
 {
 	struct in_ifaddr *ia = (struct in_ifaddr *) ifa;
 	int was_pending;
 
 	IF_ADDR_WLOCK(ia->ia_ifa.ifa_ifp);
 
 	/* Restart the backoff sequence from the first interval. */
 	ia->ia_garp_count = 0;
 	was_pending = callout_reset(&ia->ia_garp_timer,
 	    (1 << ia->ia_garp_count) * hz, garp_rexmit, ia);
 
 	/*
 	 * Take a reference for the callout only when no pending callout
 	 * was replaced by the call above.
 	 */
 	if (was_pending == 0)
 		ifa_ref(ifa);
 
 	IF_ADDR_WUNLOCK(ia->ia_ifa.ifa_ifp);
 }
 
 /*
  * Announce the IPv4 address @ifa on @ifp with a gratuitous ARP, start
  * GARP retransmissions when they are enabled and install the link-layer
  * record for the address.  Nothing is done for CARP-managed addresses
  * or for INADDR_ANY.
  */
 void
 arp_ifinit(struct ifnet *ifp, struct ifaddr *ifa)
 {
 	const struct sockaddr *dst;
 	const struct sockaddr_in *sin;
 	struct epoch_tracker et;
 
 	if (ifa->ifa_carp != NULL)
 		return;
 
 	dst = ifa->ifa_addr;
 	sin = (const struct sockaddr_in *)dst;
 	if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY)
 		return;
 
 	/* The announcement is sent from within a network epoch section. */
 	NET_EPOCH_ENTER(et);
 	arp_announce_ifaddr(ifp, sin->sin_addr, IF_LLADDR(ifp));
 	NET_EPOCH_EXIT(et);
 
 	if (garp_rexmit_count > 0)
 		garp_timer_start(ifa);
 
 	arp_add_ifa_lle(ifp, dst);
 }
 
 /*
  * Send an ARP request for @addr with @addr as the sender address and
  * @enaddr as the sender link address.  INADDR_ANY is never announced.
  */
 void
 arp_announce_ifaddr(struct ifnet *ifp, struct in_addr addr, u_char *enaddr)
 {
 
 	if (ntohl(addr.s_addr) == INADDR_ANY)
 		return;
 
 	arprequest(ifp, &addr, &addr, enaddr);
 }
 
 /*
  * Sends gratuitous ARPs for each ifaddr to notify other
  * nodes about the address change.
  */
 static __noinline void
 arp_handle_ifllchange(struct ifnet *ifp)
 {
 	struct ifaddr *cur;
 
 	/* Re-run address initialization for each AF_INET address. */
 	CK_STAILQ_FOREACH(cur, &ifp->if_addrhead, ifa_link) {
 		if (cur->ifa_addr->sa_family != AF_INET)
 			continue;
 		arp_ifinit(ifp, cur);
 	}
 }
 
 /*
  * A handler for interface link layer address change event.
  */
 static void
 arp_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	/*
 	 * if_bridge can update its lladdr during if_vmove(), after we've
 	 * done if_detach_internal()/dom_ifdetach(); skip interfaces that
 	 * have no AF_INET data attached.
 	 */
 	if (ifp->if_afdata[AF_INET] == NULL)
 		return;
 
 	lltable_update_ifaddr(LLTABLE(ifp));
 
 	/* Addresses are re-announced only while the interface is up. */
 	if ((ifp->if_flags & IFF_UP) == 0)
 		return;
 
 	arp_handle_ifllchange(ifp);
 }
 
 /*
  * Per-vnet ARP initialization.  The default vnet registers the netisr
  * handler and the link-layer address change event handler; with VIMAGE
  * every other vnet registers only its per-vnet netisr handler.
  */
 static void
 vnet_arp_init(void)
 {
 
 	if (!IS_DEFAULT_VNET(curvnet)) {
 #ifdef VIMAGE
 		netisr_register_vnet(&arp_nh);
 #endif
 		return;
 	}
 
 	netisr_register(&arp_nh);
 	iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event,
 	    arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 }
 VNET_SYSINIT(vnet_arp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND,
     vnet_arp_init, 0);
 
 #ifdef VIMAGE
 /*
  * We have to unregister ARP along with IP otherwise we risk doing INADDR_HASH
  * lookups after destroying the hash.  Ideally this would go on SI_ORDER_3.5.
  */
 static void
 vnet_arp_destroy(__unused void *arg)
 {
 
 	/* Undo the per-vnet netisr registration for the ARP handler. */
 	netisr_unregister_vnet(&arp_nh);
 }
 /* Run at vnet teardown, in the SI_SUB_PROTO_DOMAIN / SI_ORDER_THIRD slot. */
 VNET_SYSUNINIT(vnet_arp_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_arp_destroy, NULL);
 #endif
diff --git a/sys/netinet/igmp.c b/sys/netinet/igmp.c
index cebde1798c6d..394ee81d6941 100644
--- a/sys/netinet/igmp.c
+++ b/sys/netinet/igmp.c
@@ -1,3719 +1,3720 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 /*
  * Internet Group Management Protocol (IGMP) routines.
  * [RFC1112, RFC2236, RFC3376]
  *
  * Written by Steve Deering, Stanford, May 1988.
  * Modified by Rosen Sharma, Stanford, Aug 1994.
  * Modified by Bill Fenner, Xerox PARC, Feb 1995.
  * Modified to fully comply with IGMPv2 by Bill Fenner, Oct 1995.
  * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
  *
  * MULTICAST Revision: 3.5.1.4
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/condvar.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/igmp.h>
 #include <netinet/igmp_var.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 /* Timer granularities, expressed as timeouts per second. */
 #define	IGMP_SLOWHZ	2	/* 2 slow timeouts per second */
 #define	IGMP_FASTHZ	5	/* 5 fast timeouts per second */
 /* Integer division: with IGMP_FASTHZ == 5 this evaluates to 2. */
 #define	IGMP_RESPONSE_BURST_INTERVAL	(IGMP_FASTHZ / 2)
 
 /*
  * Forward declarations of the file-local (static) routines, listed
  * roughly alphabetically; the sysctl handlers come last.
  */
 static struct igmp_ifsoftc *
 		igi_alloc_locked(struct ifnet *);
 static void	igi_delete_locked(const struct ifnet *);
 static void	igmp_dispatch_queue(struct mbufq *, int, const int);
 static void	igmp_fasttimo_vnet(void);
 static void	igmp_final_leave(struct in_multi *, struct igmp_ifsoftc *);
 static int	igmp_handle_state_change(struct in_multi *,
 		    struct igmp_ifsoftc *);
 static int	igmp_initial_join(struct in_multi *, struct igmp_ifsoftc *);
 static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
 		    const struct igmp *);
 static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
 		    /*const*/ struct igmpv3 *);
 static int	igmp_input_v3_group_query(struct in_multi *,
 		    struct igmp_ifsoftc *, int, /*const*/ struct igmpv3 *);
 static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
 		    /*const*/ struct igmp *);
 static void	igmp_intr(struct mbuf *);
 static int	igmp_isgroupreported(const struct in_addr);
 static struct mbuf *
 		igmp_ra_alloc(void);
 #ifdef KTR
 static char *	igmp_rec_type_to_str(const int);
 #endif
 static void	igmp_set_version(struct igmp_ifsoftc *, const int);
 static void	igmp_slowtimo_vnet(void);
 static int	igmp_v1v2_queue_report(struct in_multi *, const int);
 static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
 static void	igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *);
 static void	igmp_v2_update_group(struct in_multi *, const int);
 static void	igmp_v3_cancel_link_timers(struct igmp_ifsoftc *);
 static void	igmp_v3_dispatch_general_query(struct igmp_ifsoftc *);
 static struct mbuf *
 		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
 static int	igmp_v3_enqueue_group_record(struct mbufq *,
 		    struct in_multi *, const int, const int, const int);
 static int	igmp_v3_enqueue_filter_change(struct mbufq *,
 		    struct in_multi *);
 static void	igmp_v3_process_group_timers(struct in_multi_head *,
 		    struct mbufq *, struct mbufq *, struct in_multi *,
 		    const int);
 static int	igmp_v3_merge_state_changes(struct in_multi *,
 		    struct mbufq *);
 static void	igmp_v3_suppress_group_record(struct in_multi *);
 static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
 static int	sysctl_igmp_stat(SYSCTL_HANDLER_ARGS);
 
 /*
  * netisr handler description for IGMP: packets dispatched with
  * NETISR_IGMP (see igmp_dispatch_queue()) are delivered to igmp_intr().
  */
 static const struct netisr_handler igmp_nh = {
 	.nh_name = "igmp",
 	.nh_handler = igmp_intr,
 	.nh_proto = NETISR_IGMP,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 /*
  * System-wide globals.
  *
  * Unlocked access to these is OK, except for the global IGMP output
  * queue. The IGMP subsystem lock ends up being system-wide for the moment,
  * because all VIMAGEs have to share a global output queue, as netisrs
  * themselves are not virtualized.
  *
  * Locking:
  *  * The permitted lock order is: IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * All output is delegated to the netisr.
  *    Now that Giant has been eliminated, the netisr may be inlined.
  *  * IN_MULTI_LIST_LOCK covers in_multi.
  *  * IGMP_LOCK covers igmp_ifsoftc and any global variables in this file,
  *    including the output queue.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *  * igmp_ifsoftc is valid as long as PF_INET is attached to the interface,
  *    therefore it is not refcounted.
  *    We allow unlocked reads of igmp_ifsoftc when accessed via in_multi.
  *
  * Reference counting
  *  * IGMP acquires its own reference every time an in_multi is passed to
  *    it and the group is being joined for the first time.
  *  * IGMP releases its reference(s) on in_multi in a deferred way,
  *    because the operations which process the release run as part of
  *    a loop whose control variables are directly affected by the release
  *    (that, and not recursing on the IF_ADDR_LOCK).
  *
  * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
  * to a vnet in ifp->if_vnet.
  *
  * SMPng: XXX We may potentially race operations on ifma_protospec.
  * The problem is that we currently lack a clean way of taking the
  * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
  * as anything which modifies ifma needs to be covered by that lock.
  * So check for ifma_protospec being NULL before proceeding.
  */
 /* NOTE(review): presumably the mutex behind IGMP_LOCK() -- confirm in igmp_var.h. */
 struct mtx		 igmp_mtx;
 
 struct mbuf		*m_raopt;		 /* Router Alert option */
 /* malloc(9) type used for the igmp_ifsoftc allocations in this file. */
 static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
 
 /*
  * VIMAGE-wide globals.
  *
  * The IGMPv3 timers themselves need to run per-image; however, for
  * historical reasons, timers run globally.  This needs to be improved.
  * An ifnet can only be in one vimage at a time, and the loopback
  * ifnet, loif, is itself virtualized.
  * It would otherwise be possible to seriously hose IGMP state,
  * and create inconsistencies in upstream multicast routing, if you have
  * multiple VIMAGEs running on the same link joining different multicast
  * groups, UNLESS the "primary IP address" is different. This is because
  * IGMP for IPv4 does not force link-local addresses to be used for each
  * node, unlike MLD for IPv6.
  * Obviously the IGMPv3 per-interface state has per-vimage granularity
  * also as a result.
  *
  * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
  * policy to control the address used by IGMP on the link.
  */
 /*
  * Per-vnet "some timer of this class is armed" flags; the query input
  * paths below set them to 1 whenever they start a timer.
  */
 VNET_DEFINE_STATIC(int, interface_timers_running);	/* IGMPv3 general
 							 * query response */
 VNET_DEFINE_STATIC(int, state_change_timers_running);	/* IGMPv3 state-change
 							 * retransmit */
 VNET_DEFINE_STATIC(int, current_state_timers_running);	/* IGMPv1/v2 host
 							 * report; IGMPv3 g/sg
 							 * query response */
 
 #define	V_interface_timers_running	VNET(interface_timers_running)
 #define	V_state_change_timers_running	VNET(state_change_timers_running)
 #define	V_current_state_timers_running	VNET(current_state_timers_running)
 
 /* Per-vnet, per-CPU IGMP statistics counters. */
 VNET_PCPUSTAT_DEFINE(struct igmpstat, igmpstat);
 VNET_PCPUSTAT_SYSINIT(igmpstat);
 VNET_PCPUSTAT_SYSUNINIT(igmpstat);
 
 /* List of every igmp_ifsoftc allocated in this vnet. */
 VNET_DEFINE_STATIC(LIST_HEAD(, igmp_ifsoftc), igi_head) =
     LIST_HEAD_INITIALIZER(igi_head);
 /*
  * Minimum interval handed to ratecheck() for group-and-source queries;
  * initialized to 10 seconds.
  */
 VNET_DEFINE_STATIC(struct timeval, igmp_gsrdelay) = {10, 0};
 
 #define	V_igi_head			VNET(igi_head)
 #define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
 
 /* Tunables backing the net.inet.igmp sysctls declared below. */
 VNET_DEFINE_STATIC(int, igmp_recvifkludge) = 1;
 VNET_DEFINE_STATIC(int, igmp_sendra) = 1;
 VNET_DEFINE_STATIC(int, igmp_sendlocal) = 1;
 VNET_DEFINE_STATIC(int, igmp_v1enable) = 1;
 VNET_DEFINE_STATIC(int, igmp_v2enable) = 1;
 VNET_DEFINE_STATIC(int, igmp_legacysupp);
 VNET_DEFINE_STATIC(int, igmp_default_version) = IGMP_VERSION_3;
 
 #define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
 #define	V_igmp_sendra			VNET(igmp_sendra)
 #define	V_igmp_sendlocal		VNET(igmp_sendlocal)
 #define	V_igmp_v1enable			VNET(igmp_v1enable)
 #define	V_igmp_v2enable			VNET(igmp_v2enable)
 #define	V_igmp_legacysupp		VNET(igmp_legacysupp)
 #define	V_igmp_default_version		VNET(igmp_default_version)
 
 /*
  * Virtualized sysctls.
  */
 /* Reads return a snapshot; writing an all-zero struct resets the counters. */
 SYSCTL_PROC(_net_inet_igmp, IGMPCTL_STATS, stats,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmpstat), 0, sysctl_igmp_stat, "S,igmpstat",
     "IGMP statistics (struct igmpstat, netinet/igmp_var.h)");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_recvifkludge), 0,
     "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendra), 0,
     "Send IP Router Alert option in IGMPv2/v3 messages");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_sendlocal), 0,
     "Send IGMP membership reports for 224.0.0.0/24 groups");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v1enable), 0,
     "Enable backwards compatibility with IGMPv1");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_v2enable), 0,
     "Enable backwards compatibility with IGMPv2");
 SYSCTL_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(igmp_legacysupp), 0,
     "Allow v1/v2 reports to suppress v3 group responses");
 /* The handler only accepts IGMP_VERSION_1 through IGMP_VERSION_3. */
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, default_version,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
     "Default version of IGMP to run on each interface");
 /* The handler rejects values below -1 or at/above 60 with EINVAL. */
 SYSCTL_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
     "Rate limit for IGMPv3 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 /* Read-only node; the handler expects exactly one name component (ifindex). */
 static SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_igmp_ifinfo,
     "Per-interface IGMPv3 state");
 
 /*
  * Record the originating interface in the packet header of 'm': the
  * ifnet pointer goes into rcvif and its index into flowid.  On VIMAGE
  * kernels the interface's vnet is stored in PH_loc.ptr as well.
  */
 static __inline void
 igmp_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 	m->m_pkthdr.rcvif = ifp;
 	m->m_pkthdr.flowid = ifp->if_index;
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 }
 
 /*
  * Reset the packet-header fields that igmp_save_context() uses for the
  * interface index (flowid) and the vnet pointer (PH_loc.ptr).
  */
 static __inline void
 igmp_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.flowid = 0;
 	m->m_pkthdr.PH_loc.ptr = NULL;
 }
 
 /*
  * Return the interface index that was stored in the flowid field of a
  * queued IGMP output chain.
  *
  * VIMAGE: the assertion (compiled only when "notyet", VIMAGE and
  * INVARIANTS are all defined) checks that CURVNET_SET() was called with
  * the vnet recorded in the mbuf chain.
  */
 static __inline uint32_t
 igmp_restore_context(struct mbuf *m)
 {
 
 #if defined(notyet) && defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == (m->m_pkthdr.PH_loc.ptr),
 	    ("%s: called when curvnet was not restored", __func__));
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * IGMP statistics.
  */
 static int
 sysctl_igmp_stat(SYSCTL_HANDLER_ARGS)
 {
 	struct igmpstat igps0;
 	int error;
 	char *p;
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct igmpstat));
 	if (error)
 		return (error);
 
 	if (req->oldptr != NULL) {
 		if (req->oldlen < sizeof(struct igmpstat))
 			error = ENOMEM;
 		else {
 			/*
 			 * Copy the counters, and explicitly set the struct's
 			 * version and length fields.
 			 */
 			COUNTER_ARRAY_COPY(VNET(igmpstat), &igps0,
 			    sizeof(struct igmpstat) / sizeof(uint64_t));
 			igps0.igps_version = IGPS_VERSION_3;
 			igps0.igps_len = IGPS_VERSION3_LEN;
 			error = SYSCTL_OUT(req, &igps0,
 			    sizeof(struct igmpstat));
 		}
 	} else
 		req->validlen = sizeof(struct igmpstat);
 	if (error)
 		goto out;
 	if (req->newptr != NULL) {
 		if (req->newlen < sizeof(struct igmpstat))
 			error = ENOMEM;
 		else
 			error = SYSCTL_IN(req, &igps0,
 			    sizeof(igps0));
 		if (error)
 			goto out;
 		/*
 		 * igps0 must be "all zero".
 		 */
 		p = (char *)&igps0;
 		while (p < (char *)&igps0 + sizeof(igps0) && *p == '\0')
 			p++;
 		if (p != (char *)&igps0 + sizeof(igps0)) {
 			error = EINVAL;
 			goto out;
 		}
 		COUNTER_ARRAY_ZERO(VNET(igmpstat),
 		    sizeof(struct igmpstat) / sizeof(uint64_t));
 	}
 out:
 	return (error);
 }
 
 /*
  * Retrieve or set default IGMP version.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
 {
 	int	 error;
 	int	 new;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	new = V_igmp_default_version;
 
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
 	     V_igmp_default_version, new);
 
 	V_igmp_default_version = new;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by IGMP lock.
  */
 static int
 sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	IGMP_LOCK();
 
 	i = V_igmp_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
 	     V_igmp_gsrdelay.tv_sec, i);
 	V_igmp_gsrdelay.tv_sec = i;
 
 out_locked:
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Expose struct igmp_ifsoftc to userland, keyed by ifindex.
  * For use by ifmcstat(8).
  *
  * SMPng: NOTE: Does an unlocked ifindex space read.
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker	 et;
 	int			*name;
 	int			 error;
 	u_int			 namelen;
 	struct ifnet		*ifp;
 	struct igmp_ifsoftc	*igi;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
 	if (error)
 		return (error);
 
 	IN_MULTI_LIST_LOCK();
 	IGMP_LOCK();
 
 	error = ENOENT;
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(name[0]);
 	NET_EPOCH_EXIT(et);
 	if (ifp == NULL)
 		goto out_locked;
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		if (ifp == igi->igi_ifp) {
 			struct igmp_ifinfo info;
 
 			info.igi_version = igi->igi_version;
 			info.igi_v1_timer = igi->igi_v1_timer;
 			info.igi_v2_timer = igi->igi_v2_timer;
 			info.igi_v3_timer = igi->igi_v3_timer;
 			info.igi_flags = igi->igi_flags;
 			info.igi_rv = igi->igi_rv;
 			info.igi_qi = igi->igi_qi;
 			info.igi_qri = igi->igi_qri;
 			info.igi_uri = igi->igi_uri;
 			error = SYSCTL_OUT(req, &info, sizeof(info));
 			break;
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_LIST_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch pending packet chains from 'mq' to the IGMP netisr, one chain
  * at a time, inside a network epoch section.
  *
  * mq:    queue to dequeue from.
  * limit: number of chains after which to stop.  The countdown ends the
  *        loop only when it reaches exactly zero, so a value of zero or
  *        less does not bound the loop.
  * loop:  if non-zero, M_IGMP_LOOP is set on each chain before dispatch.
  *
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 igmp_dispatch_queue(struct mbufq *mq, int limit, const int loop)
 {
 	struct epoch_tracker et;
 	struct mbuf *m;
 
 	NET_EPOCH_ENTER(et);
 	while ((m = mbufq_dequeue(mq)) != NULL) {
 		/* Trace "dispatch <chain> from <queue>", in that order. */
 		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, m, mq);
 		if (loop)
 			m->m_flags |= M_IGMP_LOOP;
 		netisr_dispatch(NETISR_IGMP, m);
 		if (--limit == 0)
 			break;
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Decide whether IGMP reports are issued for the group 'addr'.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
  * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
  * also disabled for all groups in the 224.0.0.0/24 link-local scope.
  * However, this may break certain IGMP snooping switches which rely on
  * the old report behaviour.
  *
  * Returns 0 when reports for the group should be suppressed and 1 when
  * they should be issued.
  */
 static __inline int
 igmp_isgroupreported(const struct in_addr addr)
 {
 
 	if (in_allhosts(addr))
 		return (0);
 	if (!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Build an mbuf holding a struct ipoption that carries the Router Alert
  * option, for use in outgoing packets.  The mbuf is obtained with
  * M_WAITOK.
  */
 static struct mbuf *
 igmp_ra_alloc(void)
 {
 	struct ipoption	*opt;
 	struct mbuf	*ra;
 
 	ra = m_get(M_WAITOK, MT_DATA);
 	opt = mtod(ra, struct ipoption *);
 
 	opt->ipopt_dst.s_addr = INADDR_ANY;
 	opt->ipopt_list[0] = (char)IPOPT_RA;	/* Router Alert Option */
 	opt->ipopt_list[1] = 0x04;		/* 4 bytes long */
 	opt->ipopt_list[2] = IPOPT_EOL;		/* End of IP option list */
 	opt->ipopt_list[3] = 0x00;		/* pad byte */
 
 	/* Data length: the ipopt_dst field plus the option's length byte. */
 	ra->m_len = sizeof(opt->ipopt_dst) + opt->ipopt_list[1];
 
 	return (ra);
 }
 
 /*
  * Attach IGMP when PF_INET is attached to an interface.
  */
 struct igmp_ifsoftc *
 igmp_domifattach(struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi;
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 
 	igi = igi_alloc_locked(ifp);
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		igi->igi_flags |= IGIF_SILENT;
 
 	IGMP_UNLOCK();
 
 	return (igi);
 }
 
 /*
  * Allocate and initialize the igmp_ifsoftc for 'ifp' and link it onto
  * the per-vnet list.  Must be called with the IGMP lock held.
  * Returns NULL if the non-sleeping allocation fails.
  *
  * VIMAGE: assume curvnet set by caller.
  */
 static struct igmp_ifsoftc *
 igi_alloc_locked(/*const*/ struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *softc;
 
 	IGMP_LOCK_ASSERT();
 
 	softc = malloc(sizeof(struct igmp_ifsoftc), M_IGMP, M_NOWAIT|M_ZERO);
 	if (softc == NULL)
 		return (softc);
 
 	/* Start from the current defaults. */
 	softc->igi_ifp = ifp;
 	softc->igi_version = V_igmp_default_version;
 	softc->igi_flags = 0;
 	softc->igi_rv = IGMP_RV_INIT;
 	softc->igi_qi = IGMP_QI_INIT;
 	softc->igi_qri = IGMP_QRI_INIT;
 	softc->igi_uri = IGMP_URI_INIT;
 	mbufq_init(&softc->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
 
 	LIST_INSERT_HEAD(&V_igi_head, softc, igi_link);
 
 	CTR2(KTR_IGMPV3, "allocate igmp_ifsoftc for ifp %p(%s)",
 	     ifp, ifp->if_xname);
 
 	return (softc);
 }
 
 /*
  * Hook for ifdetach.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  *
  * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
  * XXX This is also bitten by unlocked ifma_protospec access.
  */
 void
 igmp_ifdetach(struct ifnet *ifp)
 {
 	struct epoch_tracker	 et;
 	struct igmp_ifsoftc	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm;
 	struct in_multi_head inm_free_tmp;
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
 	    ifp->if_xname);
 
 	SLIST_INIT(&inm_free_tmp);
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	if (igi->igi_version == IGMP_VERSION_3) {
 		IF_ADDR_WLOCK(ifp);
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			inm = inm_ifmultiaddr_get_inm(ifma);
 			if (inm == NULL)
 				continue;
 			if (inm->inm_state == IGMP_LEAVING_MEMBER)
 				inm_rele_locked(&inm_free_tmp, inm);
 			inm_clear_recorded(inm);
 		}
 		NET_EPOCH_EXIT(et);
 		IF_ADDR_WUNLOCK(ifp);
 		inm_release_list_deferred(&inm_free_tmp);
 	}
 	IGMP_UNLOCK();
 
 }
 
 /*
  * Hook for domifdetach.
  *
  * Takes the IGMP lock and deletes the igmp_ifsoftc belonging to 'ifp'
  * via igi_delete_locked().
  */
 void
 igmp_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK();
 	igi_delete_locked(ifp);
 	IGMP_UNLOCK();
 }
 
 static void
 igi_delete_locked(const struct ifnet *ifp)
 {
 	struct igmp_ifsoftc *igi, *tigi;
 
 	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifsoftc for ifp %p(%s)",
 	    __func__, ifp, ifp->if_xname);
 
 	IGMP_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
 		if (igi->igi_ifp == ifp) {
 			/*
 			 * Free deferred General Query responses.
 			 */
 			mbufq_drain(&igi->igi_gq);
 
 			LIST_REMOVE(igi, igi_link);
 			free(igi, M_IGMP);
 			return;
 		}
 	}
 }
 
 /*
  * Process a received IGMPv1 query.
  * Return non-zero if the message should be dropped.
  * NOTE(review): every path visible in this function returns 0.
  *
  * Must be called from within a network epoch section.
  *
  * VIMAGE: The curvnet pointer is derived from the input ifp.
  */
 static int
 igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * IGMPv1 Host Membership Queries SHOULD always be addressed to
 	 * 224.0.0.1. They are always treated as General Queries.
 	 * igmp_group is always ignored. Do not drop it as a userland
 	 * daemon may wish to see it.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
 		IGMPSTAT_INC(igps_rcv_badqueries);
 		return (0);
 	}
 	IGMPSTAT_INC(igps_rcv_gen_queries);
 
 	/* Lock order: IN_MULTI_LIST_LOCK before IGMP_LOCK. */
 	IN_MULTI_LIST_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Switch to IGMPv1 host compatibility mode.
 	 */
 	igmp_set_version(igi, IGMP_VERSION_1);
 
 	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/*
 	 * Start the timers in all of our group records
 	 * for the interface on which the query arrived,
 	 * except those which are already running.
 	 */
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = inm_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		/* A non-zero timer is already running; leave it alone. */
 		if (inm->inm_timer != 0)
 			continue;
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/* Become the reporter with a randomized delay. */
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(
 			    IGMP_V1V2_MAX_RI * IGMP_FASTHZ);
 			V_current_state_timers_running = 1;
 			break;
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv2 general or group-specific query.
  *
  * A query with a null group address is a General Query and is only
  * honoured when sent to the all-hosts group; any other group address
  * makes it a Group-Specific Query.  Every path visible here returns 0.
  *
  * Must be called from within a network epoch section.
  */
 static int
 igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
     const struct igmp *igmp)
 {
 	struct ifmultiaddr	*ifma;
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 
 	NET_EPOCH_ASSERT();
 
 	is_general_query = 0;
 
 	/*
 	 * Validate address fields upfront.
 	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
 	 */
 	if (in_nullhost(igmp->igmp_group)) {
 		/*
 		 * IGMPv2 General Query.
 		 * If this was not sent to the all-hosts group, ignore it.
 		 */
 		if (!in_allhosts(ip->ip_dst))
 			return (0);
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		is_general_query = 1;
 	} else {
 		/* IGMPv2 Group-Specific Query. */
 		IGMPSTAT_INC(igps_rcv_group_queries);
 	}
 
 	/* Lock order: IN_MULTI_LIST_LOCK before IGMP_LOCK. */
 	IN_MULTI_LIST_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Ignore v2 query if in v1 Compatibility Mode.
 	 */
 	if (igi->igi_version == IGMP_VERSION_1)
 		goto out_locked;
 
 	igmp_set_version(igi, IGMP_VERSION_2);
 
 	/*
 	 * Scale the query's igmp_code by IGMP_FASTHZ / IGMP_TIMER_SCALE;
 	 * a result of zero is rounded up to one.
 	 */
 	timer = igmp->igmp_code * IGMP_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			inm = inm_ifmultiaddr_get_inm(ifma);
 			if (inm == NULL)
 				continue;
 			igmp_v2_update_group(inm, timer);
 		}
 	} else {
 		/*
 		 * Group-specific IGMPv2 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = inm_lookup(ifp, igmp->igmp_group);
 		if (inm != NULL) {
 			CTR3(KTR_IGMPV3,
 			    "process v2 query 0x%08x on ifp %p(%s)",
 			    ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname);
 			igmp_v2_update_group(inm, timer);
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an IGMPv2 query.
  *
  * If we are becoming the reporting member for this group, start the timer.
  * If we already are the reporting member for this group, and the running
  * timer does not exceed 'timer', leave it alone; otherwise reset it.
  *
  * We may be updating the group for the first time since we switched
  * to IGMPv3. If we are, then we must clear any recorded source lists,
  * and transition to REPORTING state; the group timer is overloaded
  * for group and group-source query responses.
  *
  * Unlike IGMPv3, the delay per group should be jittered
  * to avoid bursts of IGMPv2 reports.
  *
  * inm:   the group to update; IN_MULTI_LIST_LOCK must be held.
  * timer: upper bound handed to IGMP_RANDOM_DELAY() for the new timer.
  */
 static void
 igmp_v2_update_group(struct in_multi *inm, const int timer)
 {
 
 	/* The format must match the argument order: function, group, ifname, timer. */
 	CTR4(KTR_IGMPV3, "%s: 0x%08x/%s timer=%d", __func__,
 	    ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname, timer);
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 		break;
 	case IGMP_REPORTING_MEMBER:
 		/* A running timer that is no later than the new bound is kept. */
 		if (inm->inm_timer != 0 &&
 		    inm->inm_timer <= timer) {
 			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			break;
 		}
 		/* FALLTHROUGH */
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		break;
 	case IGMP_SLEEPING_MEMBER:
 		/* Only the state changes here; no timer is started. */
 		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
 		inm->inm_state = IGMP_AWAKENING_MEMBER;
 		break;
 	case IGMP_LEAVING_MEMBER:
 		break;
 	}
 }
 
 /*
  * Process a received IGMPv3 general, group-specific or
  * group-and-source-specific query.
  * Assumes m has already been pulled up to the full IGMP message length.
  * Return 0 if successful, otherwise an appropriate error code is returned.
  * NOTE(review): every path visible in this function returns 0.
  */
 static int
 igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
     /*const*/ struct igmpv3 *igmpv3)
 {
 	struct igmp_ifsoftc	*igi;
 	struct in_multi		*inm;
 	int			 is_general_query;
 	uint32_t		 maxresp, nsrc, qqi;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 
 	is_general_query = 0;
 
 	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
 
 	/*
 	 * Codes of 128 and above are expanded as
 	 * IGMP_MANT(code) << (IGMP_EXP(code) + 3).
 	 */
 	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
 	if (maxresp >= 128) {
 		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
 			  (IGMP_EXP(igmpv3->igmp_code) + 3);
 	}
 
 	/*
 	 * Robustness must never be less than 2 for on-wire IGMPv3.
 	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
 	 * an exception for interfaces whose IGMPv3 state changes
 	 * are redirected to loopback (e.g. MANET).
 	 */
 	qrv = IGMP_QRV(igmpv3->igmp_misc);
 	if (qrv < 2) {
 		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
 		    qrv, IGMP_RV_INIT);
 		qrv = IGMP_RV_INIT;
 	}
 
 	/* The query interval code uses the same expansion as igmp_code. */
 	qqi = igmpv3->igmp_qqi;
 	if (qqi >= 128) {
 		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
 		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
 	}
 
 	/*
 	 * Scale the response time by IGMP_FASTHZ / IGMP_TIMER_SCALE;
 	 * a result of zero is rounded up to one.
 	 */
 	timer = maxresp * IGMP_FASTHZ / IGMP_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Validate address fields and versions upfront before
 	 * accepting v3 query.
 	 * XXX SMPng: Unlocked access to igmpstat counters here.
 	 */
 	if (in_nullhost(igmpv3->igmp_group)) {
 		/*
 		 * IGMPv3 General Query.
 		 *
 		 * General Queries SHOULD be directed to 224.0.0.1.
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		IGMPSTAT_INC(igps_rcv_gen_queries);
 		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
 			IGMPSTAT_INC(igps_rcv_badqueries);
 			return (0);
 		}
 		is_general_query = 1;
 	} else {
 		/* Group or group-source specific query. */
 		if (nsrc == 0)
 			IGMPSTAT_INC(igps_rcv_group_queries);
 		else
 			IGMPSTAT_INC(igps_rcv_gsr_queries);
 	}
 
 	/* Lock order: IN_MULTI_LIST_LOCK before IGMP_LOCK. */
 	IN_MULTI_LIST_LOCK();
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (igi->igi_flags & IGIF_LOOPBACK) {
 		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/*
 	 * Discard the v3 query if we're in Compatibility Mode.
 	 * The RFC is not obviously worded that hosts need to stay in
 	 * compatibility mode until the Old Version Querier Present
 	 * timer expires.
 	 */
 	if (igi->igi_version != IGMP_VERSION_3) {
 		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
 		    igi->igi_version, ifp, ifp->if_xname);
 		goto out_locked;
 	}
 
 	/* Adopt the querier's robustness, query interval and response time. */
 	igmp_set_version(igi, IGMP_VERSION_3);
 	igi->igi_rv = qrv;
 	igi->igi_qi = qqi;
 	igi->igi_qri = maxresp;
 
 	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
 	    maxresp);
 
 	if (is_general_query) {
 		/*
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
 		    ifp, ifp->if_xname);
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
 			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
 			V_interface_timers_running = 1;
 		}
 	} else {
 		/*
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		inm = inm_lookup(ifp, igmpv3->igmp_group);
 		if (inm == NULL)
 			goto out_locked;
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->inm_lastgsrtv,
 			    &V_igmp_gsrdelay)) {
 				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
 				    __func__);
 				IGMPSTAT_INC(igps_drop_gsr_queries);
 				goto out_locked;
 			}
 		}
 		CTR3(KTR_IGMPV3, "process v3 0x%08x query on ifp %p(%s)",
 		     ntohl(igmpv3->igmp_group.s_addr), ifp, ifp->if_xname);
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		/* The helper's return value is deliberately not checked. */
 		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
 			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received IGMPv3 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occurred. Currently this is ignored.
  */
 static int
 igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifsoftc *igi,
     int timer, /*const*/ struct igmpv3 *igmpv3)
 {
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	retval = 0;
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_LAZY_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(igmpv3->igmp_numsrc);
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
 			inm_clear_recorded(inm);
 			timer = min(inm->inm_timer, timer);
 		}
 		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->inm_timer, timer);
 		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 		V_current_state_timers_running = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 * FIXME: Handling source lists larger than 1 mbuf requires that
 	 * we pass the mbuf chain pointer down to this function, and use
 	 * m_getptr() to walk the chain.
 	 */
 	if (inm->inm_nsrc > 0) {
 		const struct in_addr	*ap;
 		int			 i, nrecorded;
 
 		ap = (const struct in_addr *)(igmpv3 + 1);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++, ap++) {
 			retval = inm_record_source(inm, ap->s_addr);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_IGMPV3,
 			    "%s: schedule response to SG query", __func__);
 			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
 			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
 			V_current_state_timers_running = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received IGMPv1 host membership report.
  *
  * NOTE: 0.0.0.0 workaround breaks const correctness.
  */
 static int
 igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct in_ifaddr *ia;
 	struct in_multi *inm;
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
 	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
 		IFP_TO_IA(ifp, ia);
 		if (ia != NULL)
 			ip->ip_src.s_addr = htonl(ia->ia_subnet);
 	}
 
 	CTR3(KTR_IGMPV3, "process v1 report 0x%08x on ifp %p(%s)",
 	     ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, stop our group timer and transition to the 'lazy' state.
 	 */
 	IN_MULTI_LIST_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm != NULL) {
 		struct igmp_ifsoftc *igi;
 
 		igi = inm->inm_igi;
 		if (igi == NULL) {
 			KASSERT(igi != NULL,
 			    ("%s: no igi for ifp %p", __func__, ifp));
 			goto out_locked;
 		}
 
 		IGMPSTAT_INC(igps_rcv_ourreports);
 
 		/*
 		 * If we are in IGMPv3 host mode, do not allow the
 		 * other host's IGMPv1 report to suppress our reports
 		 * unless explicitly configured to do so.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			if (V_igmp_legacysupp)
 				igmp_v3_suppress_group_record(inm);
 			goto out_locked;
 		}
 
 		inm->inm_timer = 0;
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			break;
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for 0x%08x on ifp %p(%s)",
 			    ntohl(igmp->igmp_group.s_addr), ifp,
 			    ifp->if_xname);
 		case IGMP_SLEEPING_MEMBER:
 			inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_REPORTING_MEMBER:
 			CTR3(KTR_IGMPV3,
 			    "report suppressed for 0x%08x on ifp %p(%s)",
 			    ntohl(igmp->igmp_group.s_addr), ifp,
 			    ifp->if_xname);
 			if (igi->igi_version == IGMP_VERSION_1)
 				inm->inm_state = IGMP_LAZY_MEMBER;
 			else if (igi->igi_version == IGMP_VERSION_2)
 				inm->inm_state = IGMP_SLEEPING_MEMBER;
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	IN_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Handle a received IGMPv2 host membership report.
  *
  * Returns 0 when the report was accepted or silently ignored (including
  * our own reports and reports seen on loopback), and EINVAL when the
  * reported group is not multicast or differs from the IP destination.
  *
  * NOTE: the 0.0.0.0 source workaround rewrites ip->ip_src, which is why
  * the header pointers cannot be const.
  */
 static int
 igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
     /*const*/ struct igmp *igmp)
 {
 	struct igmp_ifsoftc *igi;
 	struct in_multi *inm;
 	struct in_ifaddr *ia;
 
 	/*
 	 * Ignore reports that carry our own source address: fast leave
 	 * depends on knowing whether we are the only member of a group.
 	 */
 	IFP_TO_IA(ifp, ia);
 	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr))
 		return (0);
 
 	IGMPSTAT_INC(igps_rcv_reports);
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	/* The group must be multicast and must equal the IP destination. */
 	if (!(IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) &&
 	    in_hosteq(igmp->igmp_group, ip->ip_dst))) {
 		IGMPSTAT_INC(igps_rcv_badreports);
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
 	 * Booting clients may use the source address 0.0.0.0. Some
 	 * IGMP daemons may not know how to use IP_RECVIF to determine
 	 * the interface upon which this message was received.
 	 * Replace 0.0.0.0 with the subnet address if told to do so.
 	 */
 	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src) && ia != NULL)
 		ip->ip_src.s_addr = htonl(ia->ia_subnet);
 
 	CTR3(KTR_IGMPV3, "process v2 report 0x%08x on ifp %p(%s)",
 	     ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname);
 
 	/*
 	 * IGMPv2 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	IN_MULTI_LIST_LOCK();
 	inm = inm_lookup(ifp, igmp->igmp_group);
 	if (inm == NULL)
 		goto out_locked;
 
 	igi = inm->inm_igi;
 	KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
 
 	IGMPSTAT_INC(igps_rcv_ourreports);
 
 	/*
 	 * In IGMPv3 host mode another host's legacy report may only
 	 * suppress ours when explicitly configured to do so.
 	 */
 	if (igi->igi_version == IGMP_VERSION_3) {
 		if (V_igmp_legacysupp)
 			igmp_v3_suppress_group_record(inm);
 		goto out_locked;
 	}
 
 	inm->inm_timer = 0;
 
 	switch (inm->inm_state) {
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		CTR3(KTR_IGMPV3,
 		    "report suppressed for 0x%08x on ifp %p(%s)",
 		    ntohl(igmp->igmp_group.s_addr), ifp, ifp->if_xname);
 		/* FALLTHROUGH */
 	case IGMP_LAZY_MEMBER:
 		inm->inm_state = IGMP_LAZY_MEMBER;
 		break;
 	default:
 		/* All remaining states are left untouched. */
 		break;
 	}
 
 out_locked:
 	IN_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 int
 igmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	int iphlen;
 	struct ifnet *ifp;
 	struct igmp *igmp;
 	struct ip *ip;
 	struct mbuf *m;
 	int igmplen;
 	int minlen;
 	int queryver;
 
 	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, *mp, *offp);
 
 	m = *mp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 
 	IGMPSTAT_INC(igps_rcv_total);
 
 	ip = mtod(m, struct ip *);
 	iphlen = *offp;
 	igmplen = ntohs(ip->ip_len) - iphlen;
 
 	/*
 	 * Validate lengths.
 	 */
 	if (igmplen < IGMP_MINLEN) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Always pullup to the minimum size for v1/v2 or v3
 	 * to amortize calls to m_pullup().
 	 */
 	minlen = iphlen;
 	if (igmplen >= IGMP_V3_QUERY_MINLEN)
 		minlen += IGMP_V3_QUERY_MINLEN;
 	else
 		minlen += IGMP_MINLEN;
 	if ((!M_WRITABLE(m) || m->m_len < minlen) &&
 	    (m = m_pullup(m, minlen)) == NULL) {
 		IGMPSTAT_INC(igps_rcv_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 
 	/*
 	 * Validate checksum.
 	 */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	igmp = mtod(m, struct igmp *);
 	if (in_cksum(m, igmplen)) {
 		IGMPSTAT_INC(igps_rcv_badsum);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	/*
 	 * IGMP control traffic is link-scope, and must have a TTL of 1.
 	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
 	 * probe packets may come from beyond the LAN.
 	 */
 	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
 		IGMPSTAT_INC(igps_rcv_badttl);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	switch (igmp->igmp_type) {
 	case IGMP_HOST_MEMBERSHIP_QUERY:
 		if (igmplen == IGMP_MINLEN) {
 			if (igmp->igmp_code == 0)
 				queryver = IGMP_VERSION_1;
 			else
 				queryver = IGMP_VERSION_2;
 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
 			queryver = IGMP_VERSION_3;
 		} else {
 			IGMPSTAT_INC(igps_rcv_tooshort);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		switch (queryver) {
 		case IGMP_VERSION_1:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v1enable)
 				break;
 			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_2:
 			IGMPSTAT_INC(igps_rcv_v1v2_queries);
 			if (!V_igmp_v2enable)
 				break;
 			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
 				m_freem(m);
 				return (IPPROTO_DONE);
 			}
 			break;
 
 		case IGMP_VERSION_3: {
 				struct igmpv3 *igmpv3;
 				uint16_t igmpv3len;
 				uint16_t nsrc;
 
 				IGMPSTAT_INC(igps_rcv_v3_queries);
 				igmpv3 = (struct igmpv3 *)igmp;
 				/*
 				 * Validate length based on source count.
 				 */
 				nsrc = ntohs(igmpv3->igmp_numsrc);
 				if (nsrc * sizeof(in_addr_t) >
 				    UINT16_MAX - iphlen - IGMP_V3_QUERY_MINLEN) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					m_freem(m);
 					return (IPPROTO_DONE);
 				}
 				/*
 				 * m_pullup() may modify m, so pullup in
 				 * this scope.
 				 */
 				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
 				   sizeof(struct in_addr) * nsrc;
 				if ((!M_WRITABLE(m) ||
 				     m->m_len < igmpv3len) &&
 				    (m = m_pullup(m, igmpv3len)) == NULL) {
 					IGMPSTAT_INC(igps_rcv_tooshort);
 					return (IPPROTO_DONE);
 				}
 				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
 				    + iphlen);
 				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
 					m_freem(m);
 					return (IPPROTO_DONE);
 				}
 			}
 			break;
 		}
 		break;
 
 	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v1enable)
 			break;
 		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
 		if (!V_igmp_v2enable)
 			break;
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
 		/*
 		 * Hosts do not need to process IGMPv3 membership reports,
 		 * as report suppression is no longer required.
 		 */
 		if (!ip_checkrouteralert(m))
 			IGMPSTAT_INC(igps_rcv_nora);
 		break;
 
 	default:
 		break;
 	}
 
 	/*
 	 * Pass all valid IGMP packets up to any process(es) listening on a
 	 * raw IGMP socket.
 	 */
 	*mp = m;
 	return (rip_input(mp, offp, proto));
 }
 
 /*
  * Fast timeout handler (global).
  * VIMAGE: Timeout handlers are expected to service all vimages.
  *
  * Callout entry point.  Inside the network epoch and with the vnet list
  * read-locked, calls igmp_fasttimo_vnet() once for every vnet, then
  * re-arms igmpfast_callout with a period of hz / IGMP_FASTHZ so that this
  * function runs again.  The callout argument is unused.
  */
 static struct callout igmpfast_callout;
 static void
 igmp_fasttimo(void *arg __unused)
 {
 	struct epoch_tracker et;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	NET_EPOCH_ENTER(et);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		/* igmp_fasttimo_vnet() expects curvnet to be set up. */
 		CURVNET_SET(vnet_iter);
 		igmp_fasttimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(et);
 
 	/* Self-rescheduling: arm the next invocation. */
 	callout_reset(&igmpfast_callout, hz / IGMP_FASTHZ, igmp_fasttimo, NULL);
 }
 
 /*
  * Fast timeout handler (per-vnet).
  * Sends are shuffled off to a netisr to deal with Giant.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  *
  * Runs in two phases while holding the in_multi list lock and the IGMP
  * lock:
  *  1. Decrement each link's IGMPv3 General Query response timer and
  *     dispatch the response for links whose timer reaches zero.
  *  2. Walk every link's multicast memberships and run the per-group
  *     timer processing that matches the link's IGMP version.
  * The three global "timers running" flags are cleared before the
  * corresponding phase and set again by whichever timers remain pending.
  */
 static void
 igmp_fasttimo_vnet(void)
 {
 	struct mbufq		 scq;	/* State-change packets */
 	struct mbufq		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct igmp_ifsoftc	*igi;
 	struct ifmultiaddr	*ifma;
 	struct in_multi		*inm;
 	struct in_multi_head inm_free_tmp;
 	int			 loop, uri_fasthz;
 
 	loop = 0;
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running &&
 	    !V_interface_timers_running &&
 	    !V_state_change_timers_running)
 		return;
 
 	/* Collects memberships whose release is deferred past the walk. */
 	SLIST_INIT(&inm_free_tmp);
 	IN_MULTI_LIST_LOCK();
 	IGMP_LOCK();
 
 	/*
 	 * IGMPv3 General Query response timer processing.
 	 */
 	if (V_interface_timers_running) {
 		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
 
 		/* Re-set below if any link timer is still counting down. */
 		V_interface_timers_running = 0;
 		LIST_FOREACH(igi, &V_igi_head, igi_link) {
 			if (igi->igi_v3_timer == 0) {
 				/* Do nothing. */
 			} else if (--igi->igi_v3_timer == 0) {
 				igmp_v3_dispatch_general_query(igi);
 			} else {
 				V_interface_timers_running = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running &&
 	    !V_state_change_timers_running)
 		goto out_locked;
 
 	/* The per-group handlers set these again for pending timers. */
 	V_current_state_timers_running = 0;
 	V_state_change_timers_running = 0;
 
 	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
 
 	/*
 	 * IGMPv1/v2/v3 host report and state-change timer processing.
 	 * Note: Processing a v3 group timer may remove a node.
 	 */
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		ifp = igi->igi_ifp;
 
 		/* The queues and delay are only needed (and set) for v3. */
 		if (igi->igi_version == IGMP_VERSION_3) {
 			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
 			    IGMP_FASTHZ);
 			mbufq_init(&qrq, IGMP_MAX_G_GS_PACKETS);
 			mbufq_init(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_WLOCK(ifp);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			inm = inm_ifmultiaddr_get_inm(ifma);
 			if (inm == NULL)
 				continue;
 			switch (igi->igi_version) {
 			case IGMP_VERSION_1:
 			case IGMP_VERSION_2:
 				igmp_v1v2_process_group_timer(inm,
 				    igi->igi_version);
 				break;
 			case IGMP_VERSION_3:
 				igmp_v3_process_group_timers(&inm_free_tmp, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_WUNLOCK(ifp);
 
 		if (igi->igi_version == IGMP_VERSION_3) {
 			/* Dispatch what the group handlers queued for this link. */
 			igmp_dispatch_queue(&qrq, 0, loop);
 			igmp_dispatch_queue(&scq, 0, loop);
 
 			/*
 			 * Free the in_multi reference(s) for this
 			 * IGMP lifecycle.
 			 */
 			inm_release_list_deferred(&inm_free_tmp);
 		}
 	}
 
 out_locked:
 	IGMP_UNLOCK();
 	IN_MULTI_LIST_UNLOCK();
 }
 
 /*
  * Advance the IGMPv1/v2 host report timer of one group by one fast tick.
  *
  * While the timer is still counting down after the decrement, only the
  * global V_current_state_timers_running flag is raised.  When the timer
  * expires on this tick and the group is in REPORTING state, the group
  * becomes IDLE and a v1 or v2 membership report, selected by 'version',
  * is queued.  Every other state is left alone.
  */
 static void
 igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
 {
 	int expired, report_type;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	expired = 0;
 	if (inm->inm_timer != 0) {
 		if (--inm->inm_timer != 0) {
 			/* Still pending: keep the fast timeout interested. */
 			V_current_state_timers_running = 1;
 			return;
 		}
 		expired = 1;
 	}
 
 	/* Only a REPORTING group whose timer just fired has work to do. */
 	if (inm->inm_state != IGMP_REPORTING_MEMBER || !expired)
 		return;
 
 	if (version == IGMP_VERSION_2)
 		report_type = IGMP_v2_HOST_MEMBERSHIP_REPORT;
 	else
 		report_type = IGMP_v1_HOST_MEMBERSHIP_REPORT;
 
 	inm->inm_state = IGMP_IDLE_MEMBER;
 	(void)igmp_v1v2_queue_report(inm, report_type);
 }
 
 /*
  * Advance the IGMPv3 timers of one group by one fast tick.
  *
  * inmh       - list that collects memberships whose final reference is
  *              to be released by the caller after its walk.
  * qrq        - queue receiving query-response records.
  * scq        - queue receiving merged state-change records.
  * inm        - the group being processed.
  * uri_fasthz - value the state-change timer is reloaded with while
  *              retransmissions remain.
  *
  * Sets the global pending-timer flags for timers still counting down.
  * Note: Unlocked read from igi.
  */
 static void
 igmp_v3_process_group_timers(struct in_multi_head *inmh,
     struct mbufq *qrq, struct mbufq *scq,
     struct in_multi *inm, const int uri_fasthz)
 {
 	int qr_expired, sc_expired;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	qr_expired = 0;
 	sc_expired = 0;
 
 	/*
 	 * During a transition from v1/v2 compatibility mode back to v3,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->inm_timer != 0) {
 		if (--inm->inm_timer == 0)
 			qr_expired = 1;
 		else
 			V_current_state_timers_running = 1;
 	}
 
 	if (inm->inm_sctimer != 0) {
 		if (--inm->inm_sctimer == 0)
 			sc_expired = 1;
 		else
 			V_state_change_timers_running = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!(sc_expired || qr_expired))
 		return;
 
 	switch (inm->inm_state) {
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Answer the pending Group-Specific or
 		 * Group-and-Source-Specific query by enqueueing the
 		 * matching Current-State report for immediate
 		 * transmission.
 		 */
 		if (qr_expired) {
 			int retval __unused;
 
 			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			/* XXX Clear recorded sources for next time. */
 			inm_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		if (!sc_expired)
 			break;
 
 		/*
 		 * State-change retransmission timer fired.
 		 * While further retransmissions are pending, reload the
 		 * timer and raise the global state-change flag.
 		 */
 		if (--inm->inm_scrv > 0) {
 			inm->inm_sctimer = uri_fasthz;
 			V_state_change_timers_running = 1;
 		}
 
 		/*
 		 * Retransmit the previously computed state-change
 		 * report. If there are no further pending
 		 * retransmissions, the mbuf queue will be consumed.
 		 * Update T0 state to T1 as we have now sent
 		 * a state-change.
 		 */
 		(void)igmp_v3_merge_state_changes(inm, scq);
 
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__,
 		    ntohl(inm->inm_addr.s_addr),
 		    inm->inm_ifp->if_xname);
 
 		/*
 		 * On a final leave, drop IGMP's reference to the group.
 		 * The release is deferred through the SLIST because the
 		 * caller is still traversing the membership list.
 		 */
 		if (inm->inm_state == IGMP_LEAVING_MEMBER &&
 		    inm->inm_scrv == 0) {
 			inm->inm_state = IGMP_NOT_MEMBER;
 			inm_rele_locked(inmh, inm);
 		}
 		break;
 	default:
 		/* Remaining states have no v3 timer work. */
 		break;
 	}
 }
 
 /*
  * Suppress a group's pending response to a group or source/group query.
  *
  * Only acts when the group is in G_QUERY_PENDING or SG_QUERY_PENDING
  * state: the group timer is stopped, any recorded sources of a pending
  * source/group response are cleared, and the group returns to REPORTING.
  * In every other state the function returns without changing anything.
  *
  * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
  * Do NOT update ST1/ST0 as this operation merely suppresses
  * the currently pending group record.
  * Do NOT suppress the response to a general query. It is possible but
  * it would require adding another state or flag.
  */
 static void
 igmp_v3_suppress_group_record(struct in_multi *inm)
 {
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
 		("%s: not IGMPv3 mode on link", __func__));
 
 	/*
 	 * Nothing to suppress unless a query response is pending.  Both
 	 * inequalities must hold; joining them with || is always true and
 	 * would turn the whole function into a no-op.
 	 */
 	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
 	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
 		return;
 
 	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 		inm_clear_recorded(inm);
 
 	inm->inm_timer = 0;
 	inm->inm_state = IGMP_REPORTING_MEMBER;
 }
 
 /*
  * Switch to a different IGMP version on the given interface,
  * as per Section 7.2.1.
  *
  * For version 1 or 2 the matching "Older Version Querier Present" timer
  * is (re)started and the other one is cleared.  The link's version is
  * then derived from whichever of those timers is running; a change of
  * version cancels the link's pending v3 timers.  A request for any other
  * version leaves both timers as they were.
  */
 static void
 igmp_set_version(struct igmp_ifsoftc *igi, const int version)
 {
 	int ovqp_timer;
 
 	IGMP_LOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	switch (version) {
 	case IGMP_VERSION_1:
 	case IGMP_VERSION_2:
 		/*
 		 * Compute the "Older Version Querier Present" timer as per
 		 * Section 8.12.
 		 */
 		ovqp_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
 		ovqp_timer *= IGMP_SLOWHZ;
 
 		if (version == IGMP_VERSION_1) {
 			igi->igi_v1_timer = ovqp_timer;
 			igi->igi_v2_timer = 0;
 		} else {
 			igi->igi_v1_timer = 0;
 			igi->igi_v2_timer = ovqp_timer;
 		}
 		break;
 	default:
 		break;
 	}
 
 	/* A running v1 timer takes precedence over a running v2 timer. */
 	if (igi->igi_v1_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_1) {
 			igi->igi_version = IGMP_VERSION_1;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		if (igi->igi_version != IGMP_VERSION_2) {
 			igi->igi_version = IGMP_VERSION_2;
 			igmp_v3_cancel_link_timers(igi);
 		}
 	}
 }
 
 /*
  * Cancel pending IGMPv3 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Only ever called on a transition from v3 to Compatibility mode. Kill
  * the timers stone dead (this may be expensive for large N groups), they
  * will be restarted if Compatibility Mode deems that they must be due to
  * query processing.
  *
  * igi - per-link IGMP state whose memberships are to be reset.
  *
  * Asserts that the in_multi list lock and the IGMP lock are held and that
  * the caller is inside the network epoch; takes the interface address
  * write lock for the duration of the membership walk.
  */
 static void
 igmp_v3_cancel_link_timers(struct igmp_ifsoftc *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm;
 	struct in_multi_head inm_free_tmp;	/* deferred releases */
 
 	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
 	    igi->igi_ifp, igi->igi_ifp->if_xname);
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 	NET_EPOCH_ASSERT();
 
 	SLIST_INIT(&inm_free_tmp);
 
 	/*
 	 * Stop the v3 General Query Response on this link stone dead.
 	 * If fasttimo is woken up due to V_interface_timers_running,
 	 * the flag will be cleared if there are no pending link timers.
 	 */
 	igi->igi_v3_timer = 0;
 
 	/*
 	 * Now clear the current-state and state-change report timers
 	 * for all memberships scoped to this link.
 	 */
 	ifp = igi->igi_ifp;
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = inm_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		/*
 		 * The cases below cascade: LEAVING also does the work of
 		 * the query-pending cases, which in turn also do the work
 		 * of REPORTING.
 		 */
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/*
 			 * These states are either not relevant in v3 mode,
 			 * or are unreported. Do nothing.
 			 */
 			break;
 		case IGMP_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching to
 			 * compatibility mode, we need to release the final
 			 * reference held for issuing the INCLUDE {}, and
 			 * transition to REPORTING to ensure the host leave
 			 * message is sent upstream to the old querier --
 			 * transition to NOT would lose the leave and race.
 			 */
 			inm_rele_locked(&inm_free_tmp, inm);
 			/* FALLTHROUGH */
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 			inm_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case IGMP_REPORTING_MEMBER:
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			break;
 		}
 		/*
 		 * Always clear state-change and group report timers.
 		 * Free any pending IGMPv3 state-change records.
 		 */
 		inm->inm_sctimer = 0;
 		inm->inm_timer = 0;
 		mbufq_drain(&inm->inm_scq);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* Release the collected references once the address lock is dropped. */
 	inm_release_list_deferred(&inm_free_tmp);
 }
 
 /*
  * Update the Older Version Querier Present timers for a link.
  * See Section 7.2.1 of RFC 3376.
  *
  * Each call decrements the running compatibility timer, if any, and
  * moves the link between IGMPv1, IGMPv2 and IGMPv3 mode according to
  * which timers are running and whether that version is still enabled.
  */
 static void
 igmp_v1v2_process_querier_timers(struct igmp_ifsoftc *igi)
 {
 
 	IGMP_LOCK_ASSERT();
 
 	if (igi->igi_v1_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer running.
 		 * Stop IGMPv2 timer if running.
 		 *
 		 * If IGMPv1 is still enabled the timer simply counts down;
 		 * if it was disabled since the last timeout, revert to
 		 * IGMPv3.
 		 */
 		if (V_igmp_v1enable) {
 			--igi->igi_v1_timer;
 		} else {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v1_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		}
 		if (igi->igi_v2_timer > 0) {
 			CTR3(KTR_IGMPV3,
 			    "%s: cancel v2 timer on %p(%s)",
 			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
 		/*
 		 * IGMPv1 Querier Present timer expired,
 		 * IGMPv2 Querier Present timer running.
 		 * If IGMPv2 is enabled, count down and make sure the link
 		 * is in IGMPv2 mode; if it was disabled since the last
 		 * timeout, revert to IGMPv3.
 		 */
 		if (V_igmp_v2enable) {
 			--igi->igi_v2_timer;
 			if (igi->igi_version != IGMP_VERSION_2) {
 				CTR5(KTR_IGMPV3,
 				    "%s: transition from v%d -> v%d on %p(%s)",
 				    __func__, igi->igi_version, IGMP_VERSION_2,
 				    igi->igi_ifp, igi->igi_ifp->if_xname);
 				igi->igi_version = IGMP_VERSION_2;
 				igmp_v3_cancel_link_timers(igi);
 			}
 		} else {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_v2_timer = 0;
 			igi->igi_version = IGMP_VERSION_3;
 		}
 	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
 		/*
 		 * IGMPv1 and IGMPv2 Querier Present timers expired.
 		 *
 		 * Revert to IGMPv3.
 		 */
 		if (igi->igi_version != IGMP_VERSION_3) {
 			CTR5(KTR_IGMPV3,
 			    "%s: transition from v%d -> v%d on %p(%s)",
 			    __func__, igi->igi_version, IGMP_VERSION_3,
 			    igi->igi_ifp, igi->igi_ifp->if_xname);
 			igi->igi_version = IGMP_VERSION_3;
 		}
 	}
 }
 
 /*
  * Global slowtimo handler.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  *
  * Callout entry point.  Inside the network epoch and with the vnet list
  * read-locked, calls igmp_slowtimo_vnet() once for every vnet, then
  * re-arms igmpslow_callout with a period of hz / IGMP_SLOWHZ so that this
  * function runs again.  The callout argument is unused.
  */
 static struct callout igmpslow_callout;
 static void
 igmp_slowtimo(void *arg __unused)
 {
 	struct epoch_tracker et;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	NET_EPOCH_ENTER(et);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		/* The per-vnet handler reads the current vnet's state. */
 		CURVNET_SET(vnet_iter);
 		igmp_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(et);
 
 	/* Self-rescheduling: arm the next invocation. */
 	callout_reset(&igmpslow_callout, hz / IGMP_SLOWHZ, igmp_slowtimo, NULL);
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 igmp_slowtimo_vnet(void)
 {
 	struct igmp_ifsoftc *igi;
 
 	IGMP_LOCK();
 
 	LIST_FOREACH(igi, &V_igi_head, igi_link) {
 		igmp_v1v2_process_querier_timers(igi);
 	}
 
 	IGMP_UNLOCK();
 }
 
 /*
  * Build an IGMPv1/v2 host report or leave message for a group and hand
  * it to the IGMP netisr.  The whole packet is built in a single mbuf.
  *
  * inm  - group the message is about.
  * type - IGMP message type to place in the header; a leave message is
  *        addressed to the all-routers group, anything else to the group
  *        itself.
  *
  * Returns 0 once the packet has been dispatched, or ENOMEM when no mbuf
  * could be allocated.
  */
 static int
 igmp_v1v2_queue_report(struct in_multi *inm, const int type)
 {
 	struct epoch_tracker 	et;
 	struct mbuf		*m;
 	struct igmp		*igmp;
 	struct ip		*ip;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOMEM);
 	M_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
 	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
 
 	/*
 	 * Fill in the IGMP message first, with the data pointer moved past
 	 * the space reserved for the IP header so that in_cksum() covers
 	 * only the IGMP message.
 	 */
 	m->m_data += sizeof(struct ip);
 	m->m_len = sizeof(struct igmp);
 
 	igmp = mtod(m, struct igmp *);
 	igmp->igmp_group = inm->inm_addr;
 	igmp->igmp_code = 0;
 	igmp->igmp_type = type;
 	igmp->igmp_cksum = 0;
 	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
 
 	/* Step back to expose the IP header and fill that in. */
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_tos = 0;
 	ip->ip_off = 0;
 	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct igmp));
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	if (type != IGMP_HOST_LEAVE_MESSAGE)
 		ip->ip_dst = inm->inm_addr;
 	else
 		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
 
 	igmp_save_context(m, inm->inm_ifp);
 
 	m->m_flags |= M_IGMPV2;
 	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
 		m->m_flags |= M_IGMP_LOOP;
 
 	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
 	NET_EPOCH_ENTER(et);
 	netisr_dispatch(NETISR_IGMP, m);
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 }
 
 /*
  * Upper-layer entry point: react to a state change on an IPv4 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will already have made the necessary updates to the
  * group state; it is now up to IGMP to issue a state-change report if
  * anything differs between T0 (when the last state change was issued)
  * and T1 (now).
  *
  * The IGMPv3 state machine is used at group level. The IGMP module
  * decides which IGMP protocol version is actually spoken.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
  * save ourselves a bunch of work; any exclusive mode groups need not
  * compute source filter lists.
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  *
  * Returns 0, or the error reported by the join / state-change handler.
  * Returns 0 without taking the IGMP lock if the interface behind the
  * ifma has already gone away.
  */
 int
 igmp_change_state(struct in_multi *inm)
 {
 	struct igmp_ifsoftc *igi;
 	struct ifnet *ifp;
 	int error = 0;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	/*
 	 * The upper layer may have asked for a state change on an
 	 * interface which has since disappeared; detect that here.
 	 */
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	if (ifp == NULL)
 		return (0);
 
 	/* netinet and net must agree on which ifp this group lives on. */
 	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	IGMP_LOCK();
 
 	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
 	KASSERT(igi != NULL, ("%s: no igmp_ifsoftc for ifp %p", __func__, ifp));
 
 	if (inm->inm_st[0].iss_fmode == inm->inm_st[1].iss_fmode) {
 		/* Same filter mode at T0 and T1: only the source set moved. */
 		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
 		error = igmp_handle_state_change(inm, igi);
 	} else {
 		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
 		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
 		/*
 		 * A transition to or from MCAST_UNDEFINED starts or ends
 		 * the IGMP life cycle of this group.
 		 */
 		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
 			error = igmp_initial_join(inm, igi);
 		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
 			igmp_final_leave(inm, igi);
 		} else {
 			error = igmp_handle_state_change(inm, igi);
 		}
 	}
 
 	IGMP_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an IGMP group.
  *
  * When joining a group:
  *  If the group should have its IGMP traffic suppressed, do nothing.
  *  IGMPv1 starts sending IGMPv1 host membership reports.
  *  IGMPv2 starts sending IGMPv2 host membership reports.
  *  IGMPv3 will schedule an IGMPv3 state-change report containing the
  *  initial state of the membership.
  */
 static int
 igmp_initial_join(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	struct ifnet		*ifp;
 	struct mbufq		*mq;
 	int			 error, retval, syncstates;
 
 	CTR4(KTR_IGMPV3, "%s: initial join 0x%08x on ifp %p(%s)", __func__,
 	    ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname);
 
 	error = 0;
 	syncstates = 1;
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
 	 * are never reported in any IGMP protocol exchanges.
 	 * All other groups enter the appropriate IGMP state machine
 	 * for the version in use on this link.
 	 * A link marked as IGIF_SILENT causes IGMP to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (igi->igi_flags & IGIF_SILENT) ||
 	    !igmp_isgroupreported(inm->inm_addr)) {
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		inm->inm_state = IGMP_SILENT_MEMBER;
 		inm->inm_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (igi->igi_version == IGMP_VERSION_3 &&
 		    inm->inm_state == IGMP_LEAVING_MEMBER) {
 			MPASS(inm->inm_refcount > 1);
 			inm_rele_locked(NULL, inm);
 		}
 		inm->inm_state = IGMP_REPORTING_MEMBER;
 
 		switch (igi->igi_version) {
 		case IGMP_VERSION_1:
 		case IGMP_VERSION_2:
 			inm->inm_state = IGMP_IDLE_MEMBER;
 			error = igmp_v1v2_queue_report(inm,
 			    (igi->igi_version == IGMP_VERSION_2) ?
 			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
 			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
 			if (error == 0) {
 				inm->inm_timer = IGMP_RANDOM_DELAY(
 				    IGMP_V1V2_MAX_RI * IGMP_FASTHZ);
 				V_current_state_timers_running = 1;
 			}
 			break;
 
 		case IGMP_VERSION_3:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			mq = &inm->inm_scq;
 			mbufq_drain(mq);
 			retval = igmp_v3_enqueue_group_record(mq, inm, 1,
 			    0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next igmp_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 */
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				KASSERT(igi->igi_rv > 1,
 				   ("%s: invalid robustness %d", __func__,
 				    igi->igi_rv));
 				inm->inm_scrv = igi->igi_rv;
 			}
 			inm->inm_sctimer = 1;
 			V_state_change_timers_running = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__,
 		    ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname);
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the IGMP life-cycle.
  *
  * On an IGMPv3 link that reports this group, the pending state-change
  * queue is rebuilt and the state-change timer started. In every other
  * case T1 is simply committed to T0.
  *
  * Returns 0 on success or when nothing was enqueued, otherwise the
  * (positive) error from enqueueing the group record.
  */
 static int
 igmp_handle_state_change(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	struct ifnet		*ifp;
 	int			 nqueued;
 
 	CTR4(KTR_IGMPV3, "%s: state change for 0x%08x on ifp %p(%s)", __func__,
 	    ntohl(inm->inm_addr.s_addr), inm->inm_ifp, inm->inm_ifp->if_xname);
 
 	ifp = inm->inm_ifp;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	if (!(ifp->if_flags & IFF_LOOPBACK) &&
 	    !(igi->igi_flags & IGIF_SILENT) &&
 	    igmp_isgroupreported(inm->inm_addr) &&
 	    (igi->igi_version == IGMP_VERSION_3)) {
 		/* Discard whatever was pending and start over. */
 		mbufq_drain(&inm->inm_scq);
 
 		nqueued = igmp_v3_enqueue_group_record(&inm->inm_scq, inm,
 		    1, 0, 0);
 		CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, nqueued);
 		if (nqueued <= 0)
 			return (-nqueued);
 
 		/*
 		 * If record(s) were enqueued, start the state-change
 		 * report timer for this group.
 		 */
 		if (igi->igi_flags & IGIF_LOOPBACK)
 			inm->inm_scrv = 1;
 		else
 			inm->inm_scrv = igi->igi_rv;
 		inm->inm_sctimer = 1;
 		V_state_change_timers_running = 1;
 
 		return (0);
 	}
 
 	/* Nothing goes on the wire; the change is committed immediately. */
 	if (!igmp_isgroupreported(inm->inm_addr)) {
 		CTR1(KTR_IGMPV3,
 		    "%s: not kicking state machine for silent group", __func__);
 	}
 	CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
 	inm_commit(inm);
 	CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__,
 	    ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname);
 
 	return (0);
 }
 
 /*
  * Perform the final leave for an IGMP group.
  *
  * When leaving a group:
  *  IGMPv1 does nothing.
  *  IGMPv2 sends a host leave message, if and only if we are the reporter.
  *  IGMPv3 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  */
 static void
 igmp_final_leave(struct in_multi *inm, struct igmp_ifsoftc *igi)
 {
 	int syncstates;
 
 	syncstates = 1;
 
 	CTR4(KTR_IGMPV3, "%s: final leave 0x%08x on ifp %p(%s)",
 	    __func__, ntohl(inm->inm_addr.s_addr), inm->inm_ifp,
 	    inm->inm_ifp->if_xname);
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	switch (inm->inm_state) {
 	case IGMP_NOT_MEMBER:
 	case IGMP_SILENT_MEMBER:
 	case IGMP_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_IGMPV3,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case IGMP_REPORTING_MEMBER:
 	case IGMP_IDLE_MEMBER:
 	case IGMP_G_QUERY_PENDING_MEMBER:
 	case IGMP_SG_QUERY_PENDING_MEMBER:
 		if (igi->igi_version == IGMP_VERSION_2) {
 #ifdef INVARIANTS
 			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
 			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
 			panic("%s: IGMPv3 state reached, not IGMPv3 mode",
 			     __func__);
 #endif
 			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
 			inm->inm_state = IGMP_NOT_MEMBER;
 		} else if (igi->igi_version == IGMP_VERSION_3) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			mbufq_drain(&inm->inm_scq);
 			inm->inm_timer = 0;
 			if (igi->igi_flags & IGIF_LOOPBACK) {
 				inm->inm_scrv = 1;
 			} else {
 				inm->inm_scrv = igi->igi_rv;
 			}
 			CTR4(KTR_IGMPV3, "%s: Leaving 0x%08x/%s with %d "
 			    "pending retransmissions.", __func__,
 			    ntohl(inm->inm_addr.s_addr),
 			    inm->inm_ifp->if_xname, inm->inm_scrv);
 			if (inm->inm_scrv == 0) {
 				inm->inm_state = IGMP_NOT_MEMBER;
 				inm->inm_sctimer = 0;
 			} else {
 				int retval __unused;
 
 				inm_acquire_locked(inm);
 
 				retval = igmp_v3_enqueue_group_record(
 				    &inm->inm_scq, inm, 1, 0, 0);
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->inm_state = IGMP_LEAVING_MEMBER;
 				inm->inm_sctimer = 1;
 				V_state_change_timers_running = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case IGMP_LAZY_MEMBER:
 	case IGMP_SLEEPING_MEMBER:
 	case IGMP_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		inm_commit(inm);
 		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for 0x%08x/%s", __func__,
 		    ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for 0x%08x/%s",
 		    __func__, ntohl(inm->inm_addr.s_addr),
 		    inm->inm_ifp->if_xname);
 	}
 }
 
 /*
  * Enqueue an IGMPv3 group record to the given output queue.
  *
  * XXX This function could do with having the allocation code
  * split out, and the multiple-tree-walks coalesced into a single
  * routine as has been done in igmp_v3_enqueue_filter_change().
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IP/IGMP header to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended. Every group record header and every
  * source address contributes to that size exactly once.
  */
 static int
 igmp_v3_enqueue_group_record(struct mbufq *mq, struct in_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query)
 {
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ifnet		*ifp;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	in_addr_t		 naddr;
 	uint8_t			 mode;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	ifp = inm->inm_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;
 	nims = NULL;
 	record_has_sources = 1;
 	pig = NULL;
 	type = IGMP_DO_NOTHING;
 	mode = inm->inm_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
 	    inm->inm_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 */
 		if (mode != inm->inm_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
 				    __func__);
 				type = IGMP_CHANGE_TO_INCLUDE_MODE;
 				if (mode == MCAST_UNDEFINED)
 					record_has_sources = 0;
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = IGMP_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = IGMP_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = IGMP_MODE_IS_INCLUDE;
 			KASSERT(inm->inm_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->inm_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (igmp_v3_enqueue_filter_change(mq, inm));
 
 	if (type == IGMP_DO_NOTHING) {
 		CTR3(KTR_IGMPV3, "%s: nothing to do for 0x%08x/%s", __func__,
 		    ntohl(inm->inm_addr.s_addr), inm->inm_ifp->if_xname);
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct igmp_grouprec);
 	if (record_has_sources)
 		minrec0len += sizeof(in_addr_t);
 
 	CTR4(KTR_IGMPV3, "%s: queueing %s for 0x%08x/%s", __func__,
 	    igmp_rec_type_to_str(type), ntohl(inm->inm_addr.s_addr),
 	    inm->inm_ifp->if_xname);
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = mbufq_last(mq);
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		m = m0;
 		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
 	} else {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 		/* Try a cluster first for current-state (non-query) records. */
 		if (!is_state_change && !is_group_query) {
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			if (m)
 				m->m_data += IGMP_LEADINGSPACE;
 		}
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 
 		igmp_save_context(m, ifp);
 
 		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	ig.ig_type = type;
 	ig.ig_datalen = 0;
 	ig.ig_numsrc = 0;
 	ig.ig_group = inm->inm_addr;
 	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 		/* Only free packets we allocated; m0 is still on the queue. */
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct igmp_grouprec);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, do not
 	 * include source entries.
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		if (m == m0) {
 			/* nbytes is exactly the group record header here. */
 			md = m_last(m);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__,
 			    ims->ims_haddr);
 			now = ims_get_mode(inm, ims, 1);
 			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			/*
 			 * The bytes for this source are added to nbytes
 			 * once, after the loop, from the final msrcs count.
 			 */
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.vt_nrecs = 1;
 		mbufq_enqueue(mq, m);
 	} else
 		m->m_pkthdr.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m)
 			m->m_data += IGMP_LEADINGSPACE;
 		if (m == NULL) {
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 			if (m)
 				M_ALIGN(m, IGMP_LEADINGSPACE);
 		}
 		if (m == NULL)
 			return (-ENOMEM);
 		igmp_save_context(m, ifp);
 		/* The record header will sit at the start of the new packet. */
 		md = m_getptr(m, 0, &off);
 		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.vt_nrecs = 1;
 		nbytes += sizeof(struct igmp_grouprec);
 
 		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 			CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__,
 			    ims->ims_haddr);
 			now = ims_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode && mode == MCAST_UNDEFINED)) {
 				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->ims_stp == 0) {
 				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_IGMPV3, "%s: append node", __func__);
 			naddr = htonl(ims->ims_haddr);
 			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pig->ig_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(in_addr_t));
 
 		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
 		mbufq_enqueue(mq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Bit mask used to mark which record passes have been completed.
  * Values can be obtained by casting directly from the current filter
  * mode of an ip_msource node.
  */
 typedef enum {
 	REC_NONE = 0,				/* MCAST_UNDEFINED */
 	REC_ALLOW = 1 << 0,			/* MCAST_INCLUDE */
 	REC_BLOCK = 1 << 1,			/* MCAST_EXCLUDE */
 	REC_FULL = (REC_ALLOW | REC_BLOCK)	/* both record types */
 } rectype_t;
 
 /*
  * Enqueue an IGMPv3 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * mq is the queue the records are appended to; inm is the group whose
  * source filter deltas are reported.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 igmp_v3_enqueue_filter_change(struct mbufq *mq, struct in_multi *inm)
 {
 	/* Smallest useful record: a group record header plus one source. */
 	static const int MINRECLEN =
 	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
 	struct ifnet		*ifp;
 	struct igmp_grouprec	 ig;
 	struct igmp_grouprec	*pig;
 	struct ip_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	in_addr_t		 naddr;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 #ifdef KTR
 	int			 nallow, nblock;
 #endif
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Nothing to report if the group has no sources, or if it had ASM
 	 * listeners at both t0 and t1.
 	 */
 	if (inm->inm_nsrc == 0 ||
 	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->inm_ifp;			/* interface */
 	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	npbytes = 0;	/* # of bytes appended this packet */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 #ifdef KTR
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 #endif
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		do {
 			/*
 			 * Reuse the tail packet of the queue if one more
 			 * record of at least MINRECLEN still fits;
 			 * otherwise start a new packet.
 			 */
 			m0 = mbufq_last(mq);
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.vt_nrecs + 1 <=
 			     IGMP_V3_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				CTR1(KTR_IGMPV3,
 				    "%s: use previous packet", __func__);
 			} else {
 				/* Prefer a cluster; fall back to a header mbuf. */
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m)
 					m->m_data += IGMP_LEADINGSPACE;
 				if (m == NULL) {
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 					if (m)
 						M_ALIGN(m, IGMP_LEADINGSPACE);
 				}
 				if (m == NULL) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.vt_nrecs = 0;
 				igmp_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
 				    sizeof(struct igmp_grouprec)) /
 				    sizeof(in_addr_t);
 				npbytes = 0;
 				CTR1(KTR_IGMPV3,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the IGMP group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&ig, 0, sizeof(ig));
 			ig.ig_group = inm->inm_addr;
 			if (!m_append(m, sizeof(ig), (void *)&ig)) {
 				/* m0 is still on the queue; free only our own. */
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_IGMPV3,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct igmp_grouprec);
 			if (m != m0) {
 				/* new packet; offset in chain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct igmp_grouprec), &off);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pig = (struct igmp_grouprec *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct igmp_grouprec));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL)
 				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
 			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
 				CTR2(KTR_IGMPV3, "%s: visit node 0x%08x",
 				    __func__, ims->ims_haddr);
 				now = ims_get_mode(inm, ims, 1);
 				then = ims_get_mode(inm, ims, 0);
 				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_IGMPV3,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				/*
 				 * A node that is now UNDEFINED is reported
 				 * as the inverse of the group's mode.
 				 */
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				/*
 				 * The first changed node fixes the record
 				 * type of this pass; nodes of the other
 				 * type are skipped until the next pass.
 				 */
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				naddr = htonl(ims->ims_haddr);
 				if (!m_append(m, sizeof(in_addr_t),
 				    (void *)&naddr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_IGMPV3,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 #ifdef KTR
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 #endif
 				/* Stop once this packet cannot take more. */
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct igmp_grouprec);
 				if (m != m0) {
 					CTR1(KTR_IGMPV3,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					/* Trim the header just appended to m0. */
 					CTR1(KTR_IGMPV3,
 					    "%s: m_adj(m, -ig)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct igmp_grouprec)));
 				}
 				continue;
 			}
 			/* Finalise the record header now the count is known. */
 			npbytes += (rsrcs * sizeof(in_addr_t));
 			if (crt == REC_ALLOW)
 				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
 			pig->ig_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.vt_nrecs++;
 			if (m != m0)
 				mbufq_enqueue(mq, m);
 			nbytes += npbytes;
 		} while (nims != NULL);
 		/* Mark this record type done and switch to the other one. */
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 static int
 igmp_v3_merge_state_changes(struct in_multi *inm, struct mbufq *scq)
 {
 	struct mbufq	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->inm_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->inm_scq;
 #ifdef KTR
 	if (mbufq_first(gq) == NULL) {
 		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = mbufq_first(gq);
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an IGMPv3 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatentation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = mbufq_last(scq);
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.vt_nrecs +
 			    m->m_pkthdr.vt_nrecs <=
 			    IGMP_V3_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
 				domerge = 1;
 		}
 
 		if (!domerge && mbufq_full(gq)) {
 			CTR2(KTR_IGMPV3,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy)
 				m_freem(m);
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
 			m0 = mbufq_dequeue(gq);
 			m = m0->m_nextpkt;
 		} else {
 			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_IGMPV3, "%s: queueing %p to scq %p)",
 			    __func__, m0, scq);
 			mbufq_enqueue(scq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_IGMPV3, "%s: merging %p with scq tail %p)",
 			    __func__, m0, mt);
 
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.vt_nrecs +=
 			    m0->m_pkthdr.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending IGMPv3 General Query.
  *
  * If igi->igi_gq still holds packets from an earlier response, no new
  * group records are generated and the function only transmits the next
  * burst.  Otherwise one record is enqueued for each IPv4 multicast
  * membership on the interface that is in a reporting-capable state.
  * When packets remain queued after the burst, the interface's v3 timer
  * is re-armed so the rest goes out on a later timer tick.
  *
  * The in_multi list lock, the IGMP lock and the network epoch are all
  * asserted on entry.
  */
 static void
 igmp_v3_dispatch_general_query(struct igmp_ifsoftc *igi)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in_multi		*inm;
 	/* retval is only consumed by the KTR trace below. */
 	int			 retval __unused, loop;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IGMP_LOCK_ASSERT();
 	NET_EPOCH_ASSERT();
 
 	KASSERT(igi->igi_version == IGMP_VERSION_3,
 	    ("%s: called when version %d", __func__, igi->igi_version));
 
 	/*
 	 * Check that there are some packets queued. If so, send them first.
 	 * For large number of groups the reply to general query can take
 	 * many packets, we should finish sending them before starting of
 	 * queuing the new reply.
 	 */
 	if (mbufq_len(&igi->igi_gq) != 0)
 		goto send;
 
 	ifp = igi->igi_ifp;
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/* Entries that do not map to an in_multi are skipped. */
 		inm = inm_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		KASSERT(ifp == inm->inm_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->inm_state) {
 		case IGMP_NOT_MEMBER:
 		case IGMP_SILENT_MEMBER:
 			/* Nothing is reported for these states. */
 			break;
 		case IGMP_REPORTING_MEMBER:
 		case IGMP_IDLE_MEMBER:
 		case IGMP_LAZY_MEMBER:
 		case IGMP_SLEEPING_MEMBER:
 		case IGMP_AWAKENING_MEMBER:
 			/*
 			 * Force the group into the reporting state and
 			 * append its record to the general-query queue.
 			 */
 			inm->inm_state = IGMP_REPORTING_MEMBER;
 			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
 			    inm, 0, 0, 0);
 			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case IGMP_G_QUERY_PENDING_MEMBER:
 		case IGMP_SG_QUERY_PENDING_MEMBER:
 		case IGMP_LEAVING_MEMBER:
 			/* Left untouched by the general-query response. */
 			break;
 		}
 	}
 
 send:
 	/* Transmit at most one burst; honour the per-interface loopback flag. */
 	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
 	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	if (mbufq_first(&igi->igi_gq) != NULL) {
 		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
 		    IGMP_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running = 1;
 	}
 }
 
 /*
  * Transmit the next pending IGMP message in the output queue.
  *
  * We get called from netisr_processqueue(). A mutex private to igmpoq
  * will be acquired and released around this routine.
  *
  * The mbuf chain is always consumed: it is either handed to ip_output()
  * or freed on one of the drop paths.  IGMPv3 reports (chains without
  * M_IGMPV2) are encapsulated with igmp_v3_encap_report() first.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as IGMP traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 igmp_intr(struct mbuf *m)
 {
 	struct ip_moptions	 imo;
 	struct ifnet		*ifp;
 	struct mbuf		*ipopts, *m0;
 	int			 error;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
 
 	/*
 	 * Set VNET image pointer from enqueued mbuf chain
 	 * before doing anything else. Whilst we use interface
 	 * indexes to guard against interface detach, they are
 	 * unique to each VIMAGE and must be retrieved.
 	 */
 	CURVNET_SET((struct vnet *)(m->m_pkthdr.PH_loc.ptr));
 	ifindex = igmp_restore_context(m);
 
 	/*
 	 * Check if the ifnet still exists. This limits the scope of
 	 * any race in the absence of a global ifp lock for low cost
 	 * (an array lookup).
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IPSTAT_INC(ips_noroute);
 		goto out;
 	}
 
 	/* Attach the Router Alert option only when enabled for this vnet. */
 	ipopts = V_igmp_sendra ? m_raopt : NULL;
 
 	imo.imo_multicast_ttl  = 1;
 	imo.imo_multicast_vif  = -1;
 	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
 
 	/*
 	 * If the user requested that IGMP traffic be explicitly
 	 * redirected to the loopback interface (e.g. they are running a
 	 * MANET interface and the routing protocol needs to see the
 	 * updates), handle this now.
 	 */
 	if (m->m_flags & M_IGMP_LOOP)
 		imo.imo_multicast_ifp = V_loif;
 	else
 		imo.imo_multicast_ifp = ifp;
 
 	if (m->m_flags & M_IGMPV2) {
 		m0 = m;
 	} else {
 		m0 = igmp_v3_encap_report(ifp, m);
 		if (m0 == NULL) {
 			/*
 			 * igmp_v3_encap_report() only fails when its
 			 * M_PREPEND() fails, and M_PREPEND() releases the
 			 * chain itself in that case.  'm' is stale here:
 			 * it is logged for tracing but must not be freed
 			 * a second time.
 			 */
 			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
 			IPSTAT_INC(ips_odropped);
 			goto out;
 		}
 	}
 
 	igmp_scrub_context(m0);
 	/*
 	 * Clear the protocol-private flags starting at the head that is
 	 * actually transmitted: after a prepend that allocated a new mbuf,
 	 * 'm' is no longer the first mbuf of the chain.
 	 */
 	m_clrprotoflags(m0);
 	m0->m_pkthdr.rcvif = V_loif;
 #ifdef MAC
 	mac_netinet_igmp_send(ifp, m0);
 #endif
 	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
 	if (error) {
 		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
 		goto out;
 	}
 
 	IGMPSTAT_INC(igps_snd_reports);
 
 out:
 	/*
 	 * We must restore the existing vnet pointer before
 	 * continuing as we are run from netisr context.
 	 */
 	CURVNET_RESTORE();
 }
 
 /*
  * Encapsulate an IGMPv3 report.
  *
  * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
  * chain has already had its IP/IGMPv3 header prepended. In this case
  * the function will not attempt to prepend; the lengths and checksums
  * will however be re-computed.
  *
  * The record count is taken from m->m_pkthdr.vt_nrecs, which is reset
  * to zero once it has been written into the report header.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  * NOTE(review): M_PREPEND() is understood to free the chain when it
  * fails, so a caller presumably must not free 'm' again after a NULL
  * return -- confirm against mbuf(9).
  */
 static struct mbuf *
 igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct igmp_report	*igmp;
 	struct ip		*ip;
 	int			 hdrlen, igmpreclen;
 
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	igmpreclen = m_length(m, NULL);
 	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
 
 	if (m->m_flags & M_IGMPV3_HDR) {
 		/* Headers already present: exclude them from the record length. */
 		igmpreclen -= hdrlen;
 	} else {
 		M_PREPEND(m, hdrlen, M_NOWAIT);
 		if (m == NULL)
 			return (NULL);
 		m->m_flags |= M_IGMPV3_HDR;
 	}
 
 	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
 
 	/*
 	 * Temporarily step over the IP header so that mtod() and
 	 * in_cksum() below operate on the IGMP report only.
 	 */
 	m->m_data += sizeof(struct ip);
 	m->m_len -= sizeof(struct ip);
 
 	igmp = mtod(m, struct igmp_report *);
 	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
 	igmp->ir_rsv1 = 0;
 	igmp->ir_rsv2 = 0;
 	igmp->ir_numgrps = htons(m->m_pkthdr.vt_nrecs);
 	/* The checksum field must be zero while the checksum is computed. */
 	igmp->ir_cksum = 0;
 	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
 	m->m_pkthdr.vt_nrecs = 0;
 
 	/* Undo the adjustment so the chain starts at the IP header again. */
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 
 	ip = mtod(m, struct ip *);
 	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
 	ip->ip_len = htons(hdrlen + igmpreclen);
 	ip->ip_off = htons(IP_DF);
 	ip->ip_p = IPPROTO_IGMP;
 	ip->ip_sum = 0;
 
 	ip->ip_src.s_addr = INADDR_ANY;
 
 	/*
 	 * For looped-back reports the source is filled in here from the
 	 * interface's address, when IFP_TO_IA() finds one.
 	 */
 	if (m->m_flags & M_IGMP_LOOP) {
 		struct in_ifaddr *ia;
 
 		IFP_TO_IA(ifp, ia);
 		if (ia != NULL)
 			ip->ip_src = ia->ia_addr.sin_addr;
 	}
 
 	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
 
 	return (m);
 }
 
 #ifdef KTR
 /*
  * Map an IGMPv3 group record type to a short name for trace output.
  * Unrecognised values yield "unknown".
  */
 static char *
 igmp_rec_type_to_str(const int type)
 {
 	char *name;
 
 	switch (type) {
 	case IGMP_CHANGE_TO_EXCLUDE_MODE:
 		name = "TO_EX";
 		break;
 	case IGMP_CHANGE_TO_INCLUDE_MODE:
 		name = "TO_IN";
 		break;
 	case IGMP_MODE_IS_EXCLUDE:
 		name = "MODE_EX";
 		break;
 	case IGMP_MODE_IS_INCLUDE:
 		name = "MODE_IN";
 		break;
 	case IGMP_ALLOW_NEW_SOURCES:
 		name = "ALLOW_NEW";
 		break;
 	case IGMP_BLOCK_OLD_SOURCES:
 		name = "BLOCK_OLD";
 		break;
 	default:
 		name = "unknown";
 		break;
 	}
 
 	return (name);
 }
 #endif
 
 #ifdef VIMAGE
 /*
  * Per-vnet initialisation hook: registers the IGMP netisr handler
  * (igmp_nh) for the vnet being set up.  Run via VNET_SYSINIT at
  * SI_SUB_PROTO_MC.
  */
 static void
 vnet_igmp_init(const void *unused __unused)
 {
 
 	netisr_register_vnet(&igmp_nh);
 }
 VNET_SYSINIT(vnet_igmp_init, SI_SUB_PROTO_MC, SI_ORDER_ANY,
     vnet_igmp_init, NULL);
 
 /*
  * Per-vnet teardown hook: unregisters the IGMP netisr handler (igmp_nh)
  * for the vnet being destroyed.  Run via VNET_SYSUNINIT at
  * SI_SUB_PROTO_MC.
  */
 static void
 vnet_igmp_uninit(const void *unused __unused)
 {
 
 	/* This can happen when we shutdown the entire network stack. */
 	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 
 	netisr_unregister_vnet(&igmp_nh);
 }
 VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY,
     vnet_igmp_uninit, NULL);
 #endif
 
 #ifdef DDB
 /*
  * DDB command "show igi_list <addr>": treat <addr> as the head of a list
  * of struct igmp_ifsoftc and print the scalar fields of every entry.
  * have_addr and addr are not declared in this body; presumably they are
  * supplied by the DB_SHOW_COMMAND() macro -- confirm against ddb headers.
  */
 DB_SHOW_COMMAND(igi_list, db_show_igi_list)
 {
 	struct igmp_ifsoftc *igi, *tigi;
 	LIST_HEAD(_igi_list, igmp_ifsoftc) *igi_head;
 
 	if (!have_addr) {
 		db_printf("usage: show igi_list <addr>\n");
 		return;
 	}
 	/* The address is trusted as-is; no validation is performed. */
 	igi_head = (struct _igi_list *)addr;
 
 	LIST_FOREACH_SAFE(igi, igi_head, igi_link, tigi) {
 		db_printf("igmp_ifsoftc %p:\n", igi);
 		db_printf("    ifp %p\n", igi->igi_ifp);
 		db_printf("    version %u\n", igi->igi_version);
 		db_printf("    v1_timer %u\n", igi->igi_v1_timer);
 		db_printf("    v2_timer %u\n", igi->igi_v2_timer);
 		db_printf("    v3_timer %u\n", igi->igi_v3_timer);
 		db_printf("    flags %#x\n", igi->igi_flags);
 		db_printf("    rv %u\n", igi->igi_rv);
 		db_printf("    qi %u\n", igi->igi_qi);
 		db_printf("    qri %u\n", igi->igi_qri);
 		db_printf("    uri %u\n", igi->igi_uri);
 		/* struct mbufq    igi_gq; */
 		db_printf("\n");
 	}
 }
 #endif
 
 /*
  * Module event handler for IGMP.
  *
  * MOD_LOAD initialises the IGMP lock, allocates the shared Router Alert
  * option mbuf (m_raopt), registers the netisr handler and starts the
  * slow and fast timer callouts.  MOD_UNLOAD unregisters the netisr
  * handler, frees m_raopt and destroys the lock.  Any other event type
  * returns EOPNOTSUPP; handled events return 0.
  */
 static int
 igmp_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		CTR1(KTR_IGMPV3, "%s: initializing", __func__);
 		IGMP_LOCK_INIT();
 		m_raopt = igmp_ra_alloc();
 		netisr_register(&igmp_nh);
 		/* Arm both periodic timers; each is initialised just before use. */
 		callout_init(&igmpslow_callout, 1);
 		callout_reset(&igmpslow_callout, hz / IGMP_SLOWHZ,
 		    igmp_slowtimo, NULL);
 		callout_init(&igmpfast_callout, 1);
 		callout_reset(&igmpfast_callout, hz / IGMP_FASTHZ,
 		    igmp_fasttimo, NULL);
 		break;
 	case MOD_UNLOAD:
 		/*
 		 * NOTE(review): the two callouts armed at load time are not
 		 * stopped or drained here before the lock is destroyed --
 		 * confirm whether unload is actually reachable.
 		 */
 		CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
 		netisr_unregister(&igmp_nh);
 		m_free(m_raopt);
 		m_raopt = NULL;
 		IGMP_LOCK_DESTROY();
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 	return (0);
 }
 
 /*
  * Module descriptor: name "igmp", events handled by igmp_modevent(),
  * no private argument.  Declared at SI_SUB_PROTO_MC / SI_ORDER_MIDDLE.
  */
 static moduledata_t igmp_mod = {
     "igmp",
     igmp_modevent,
     0
 };
 DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
diff --git a/sys/netinet/in.c b/sys/netinet/in.c
index b6b412042dad..a17e3cac8744 100644
--- a/sys/netinet/in.c
+++ b/sys/netinet/in.c
@@ -1,1851 +1,1852 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (C) 2001 WIDE Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.c	8.4 (Berkeley) 1/9/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #define IN_HISTORICAL_NETS		/* include class masks */
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/systm.h>
 #include <sys/sockio.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/socket.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/sx.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/route/route_ctl.h>
 #include <net/vnet.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_carp.h>
 #include <netinet/igmp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 static int in_aifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct ucred *);
 static int in_difaddr_ioctl(u_long, caddr_t, struct ifnet *, struct ucred *);
 static int in_gifaddr_ioctl(u_long, caddr_t, struct ifnet *, struct ucred *);
 
 static void	in_socktrim(struct sockaddr_in *);
 static void	in_purgemaddrs(struct ifnet *);
 
 static bool	ia_need_loopback_route(const struct in_ifaddr *);
 
 /* net.inet.ip.no_same_prefix: per-vnet, read/write. */
 VNET_DEFINE_STATIC(int, nosameprefix);
 #define	V_nosameprefix			VNET(nosameprefix)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, no_same_prefix, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(nosameprefix), 0,
 	"Refuse to create same prefixes on different interfaces");
 
 /* net.inet.ip.broadcast_lowest: per-vnet, read/write. */
 VNET_DEFINE_STATIC(bool, broadcast_lowest);
 #define	V_broadcast_lowest		VNET(broadcast_lowest)
 SYSCTL_BOOL(_net_inet_ip, OID_AUTO, broadcast_lowest, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(broadcast_lowest), 0,
 	"Treat lowest address on a subnet (host 0) as broadcast");
 
 /* net.inet.ip.allow_net240: per-vnet, read/write, defaults to false. */
 VNET_DEFINE(bool, ip_allow_net240) = false;
 #define	V_ip_allow_net240		VNET(ip_allow_net240)
 SYSCTL_BOOL(_net_inet_ip, OID_AUTO, allow_net240,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_allow_net240), 0,
 	"Allow use of Experimental addresses, aka Class E (240/4)");
 /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-240 */
 
 /*
  * net.inet.ip.allow_net0: per-vnet, read/write, defaults to false.
  * No V_ip_allow_net0 accessor is defined here; presumably it comes from
  * a header -- confirm.
  */
 VNET_DEFINE(bool, ip_allow_net0) = false;
 SYSCTL_BOOL(_net_inet_ip, OID_AUTO, allow_net0,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_allow_net0), 0,
 	"Allow use of addresses in network 0/8");
 /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-0 */
 
 /*
  * The loopback network is stored as a mask; the sysctl exposes it as a
  * prefix length through sysctl_loopback_prefixlen().
  */
 VNET_DEFINE(uint32_t, in_loopback_mask) = IN_LOOPBACK_MASK_DFLT;
 #define	V_in_loopback_mask	VNET(in_loopback_mask)
 static int sysctl_loopback_prefixlen(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, loopback_prefixlen,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
 	NULL, 0, sysctl_loopback_prefixlen, "I",
 	"Prefix length of address space reserved for loopback");
 /* see https://datatracker.ietf.org/doc/draft-schoen-intarea-unicast-127 */
 
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo			VNET(ripcbinfo)
 
 /*
  * Exclusive lock taken by in_control() around the address add, delete
  * and alias-get helpers.
  */
 static struct sx in_control_sx;
 SX_SYSINIT(in_control_sx, &in_control_sx, "in_control");
 
 /*
  * Report whether an internet address belongs to a ``local'' network,
  * i.e. falls inside the subnet of any configured IPv4 interface address.
  * Returns 1 if so and 0 otherwise.  Asserts the network epoch.
  */
 int
 in_localaddr(struct in_addr in)
 {
 	struct in_ifaddr *ia;
 	u_long haddr;
 	int found = 0;
 
 	haddr = ntohl(in.s_addr);
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 		if ((haddr & ia->ia_subnetmask) != ia->ia_subnet)
 			continue;
 		found = 1;
 		break;
 	}
 
 	return (found);
 }
 
 /*
  * Return 1 if an internet address is for the local host and configured
  * on one of its interfaces.
  */
 bool
 in_localip(struct in_addr in)
 {
 	struct in_ifaddr *ia;
 
 	NET_EPOCH_ASSERT();
 
 	CK_LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash)
 		if (IA_SIN(ia)->sin_addr.s_addr == in.s_addr)
 			return (true);
 
 	return (false);
 }
 
 /*
  * FIB-aware variant of in_localip(): the address must additionally be
  * configured on an interface whose if_fib equals 'fib'.
  * Asserts the network epoch.
  */
 bool
 in_localip_fib(struct in_addr in, uint16_t fib)
 {
 	struct in_ifaddr *ia;
 	bool match = false;
 
 	NET_EPOCH_ASSERT();
 
 	CK_LIST_FOREACH(ia, INADDR_HASH(in.s_addr), ia_hash) {
 		if (IA_SIN(ia)->sin_addr.s_addr != in.s_addr)
 			continue;
 		if (ia->ia_ifa.ifa_ifp->if_fib != fib)
 			continue;
 		match = true;
 		break;
 	}
 
 	return (match);
 }
 
 /*
  * Report whether the given internet address is configured on the given
  * interface.  Returns 1 if an AF_INET address on ifp matches, else 0.
  * Asserts the network epoch.
  */
 int
 in_ifhasaddr(struct ifnet *ifp, struct in_addr in)
 {
 	struct ifaddr *ifa;
 	int present = 0;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in_ifaddr *ia;
 
 		/* Only IPv4 entries can be viewed as struct in_ifaddr. */
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 		ia = (struct in_ifaddr *)ifa;
 		if (ia->ia_addr.sin_addr.s_addr == in.s_addr) {
 			present = 1;
 			break;
 		}
 	}
 
 	return (present);
 }
 
 /*
  * Find another interface address that carries the same IP address as
  * 'original_ia' but is a different in_ifaddr.  Unless
  * V_rt_add_addr_allfibs is set, only addresses on interfaces in the same
  * FIB are considered.
  *
  * Returns the first such address with a reference taken via ifa_ref(),
  * or NULL when there is none.
  */
 static struct in_ifaddr *
 in_localip_more(struct in_ifaddr *original_ia)
 {
 	struct epoch_tracker et;
 	in_addr_t original_addr = IA_SIN(original_ia)->sin_addr.s_addr;
 	uint32_t original_fib = original_ia->ia_ifa.ifa_ifp->if_fib;
 	struct in_ifaddr *ia, *twin = NULL;
 
 	NET_EPOCH_ENTER(et);
 	CK_LIST_FOREACH(ia, INADDR_HASH(original_addr), ia_hash) {
 		in_addr_t addr = IA_SIN(ia)->sin_addr.s_addr;
 		uint32_t fib = ia->ia_ifa.ifa_ifp->if_fib;
 
 		if (!V_rt_add_addr_allfibs && (original_fib != fib))
 			continue;
 		if ((original_ia == ia) || (original_addr != addr))
 			continue;
 
 		/* Take the reference while still inside the epoch. */
 		ifa_ref(&ia->ia_ifa);
 		twin = ia;
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (twin);
 }
 
 /*
  * Look for an IPv4 address usable in the given fib.
  *
  * The first non-loopback address found wins.  If there is none and
  * @loopback_ok is set, the last loopback address seen is returned
  * instead.  Unless V_rt_add_addr_allfibs is set, addresses on interfaces
  * in other fibs are ignored.
  *
  * Returns ifa or NULL.  Asserts the network epoch.
  */
 struct in_ifaddr *
 in_findlocal(uint32_t fibnum, bool loopback_ok)
 {
 	struct in_ifaddr *cur, *fallback = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(cur, &V_in_ifaddrhead, ia_link) {
 		uint32_t cur_fib = cur->ia_ifa.ifa_ifp->if_fib;
 
 		if (!V_rt_add_addr_allfibs && (fibnum != cur_fib))
 			continue;
 
 		if (!IN_LOOPBACK(ntohl(IA_SIN(cur)->sin_addr.s_addr)))
 			return (cur);
 		if (loopback_ok)
 			fallback = cur;
 	}
 
 	/* No non-loopback candidate: fall back (possibly to NULL). */
 	return (fallback);
 }
 
 /*
  * Decide whether datagrams to the given destination may be forwarded.
  *
  * Returns 0 for multicast, link-local and loopback destinations, for
  * experimental (240/4) destinations unless V_ip_allow_net240 is set, and
  * for 0/8 destinations unless V_ip_allow_net0 is set.  Returns 1 for
  * everything else.
  */
 int
 in_canforward(struct in_addr in)
 {
 	u_long haddr = ntohl(in.s_addr);
 
 	if (IN_MULTICAST(haddr) || IN_LINKLOCAL(haddr) || IN_LOOPBACK(haddr) ||
 	    (IN_EXPERIMENTAL(haddr) && !V_ip_allow_net240) ||
 	    (IN_ZERONET(haddr) && !V_ip_allow_net0))
 		return (0);
 
 	return (1);
 }
 
 /*
  * Sysctl to manage prefix of reserved loopback network; translate
  * to/from mask.  The mask is always contiguous high-order 1 bits
  * followed by all 0 bits.
  */
 static int
 sysctl_loopback_prefixlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, preflen;
 
 	/* ffs is 1-based; compensate. */
 	preflen = 33 - ffs(V_in_loopback_mask);
 	error = sysctl_handle_int(oidp, &preflen, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (preflen < 8 || preflen > 31)
 		return (EINVAL);
 	V_in_loopback_mask = 0xffffffff << (32 - preflen);
 	return (0);
 }
 
 /*
  * Trim a mask in a sockaddr: set sin_len so that it ends right after the
  * last non-zero byte of sin_addr, or to 0 when sin_addr is all zeroes.
  */
 static void
 in_socktrim(struct sockaddr_in *ap)
 {
 	char *base = (char *)ap;
 	char *first = (char *)&ap->sin_addr;
 	char *p;
 
 	ap->sin_len = 0;
 	/* Walk the address bytes from the last one back to the first. */
 	for (p = (char *)(&ap->sin_addr + 1) - 1; p >= first; p--) {
 		if (*p != 0) {
 			ap->sin_len = p - base + 1;
 			return;
 		}
 	}
 }
 
 /*
  * Generic internet control operations (ioctl's).
  *
  * Address add, delete and alias-get requests are handed to the dedicated
  * in_*ifaddr_ioctl() helpers under in_control_sx.  The four SIOCGIF*ADDR
  * "get" requests are answered here.  The legacy SIOCSIF* "set" requests
  * are rejected with EINVAL.  Everything else is forwarded to the
  * interface's if_ioctl handler (EOPNOTSUPP if it has none).
  *
  * Returns 0 on success or an errno value: EADDRNOTAVAIL when ifp is NULL,
  * when the requested address fails prison_check_ip4(), or when no
  * suitable IPv4 address is found on the interface.
  */
 int
 in_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp,
     struct thread *td)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct sockaddr_in *addr = (struct sockaddr_in *)&ifr->ifr_addr;
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	int error;
 
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
 	/* A NULL thread yields a NULL credential. */
 	struct ucred *cred = (td != NULL) ? td->td_ucred : NULL;
 
 	/*
 	 * Filter out 4 ioctls we implement directly.  Forward the rest
 	 * to specific functions and ifp->if_ioctl().
 	 */
 	switch (cmd) {
 	case SIOCGIFADDR:
 	case SIOCGIFBRDADDR:
 	case SIOCGIFDSTADDR:
 	case SIOCGIFNETMASK:
 		/* Handled below, after the address lookup. */
 		break;
 	case SIOCGIFALIAS:
 		sx_xlock(&in_control_sx);
 		error = in_gifaddr_ioctl(cmd, data, ifp, cred);
 		sx_xunlock(&in_control_sx);
 		return (error);
 	case SIOCDIFADDR:
 		sx_xlock(&in_control_sx);
 		error = in_difaddr_ioctl(cmd, data, ifp, cred);
 		sx_xunlock(&in_control_sx);
 		return (error);
 	case OSIOCAIFADDR:	/* 9.x compat */
 	case SIOCAIFADDR:
 		sx_xlock(&in_control_sx);
 		error = in_aifaddr_ioctl(cmd, data, ifp, cred);
 		sx_xunlock(&in_control_sx);
 		return (error);
 	case SIOCSIFADDR:
 	case SIOCSIFBRDADDR:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFNETMASK:
 		/* These old commands are no longer supported. */
 		return (EINVAL);
 	default:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		return ((*ifp->if_ioctl)(ifp, cmd, data));
 	}
 
 	/* A specific (non-wildcard) address must pass the prison check. */
 	if (addr->sin_addr.s_addr != INADDR_ANY &&
 	    prison_check_ip4(cred, &addr->sin_addr) != 0)
 		return (EADDRNOTAVAIL);
 
 	/*
 	 * Find address for this interface, if it exists.  If an
 	 * address was specified, find that one instead of the
 	 * first one on the interface, if possible.
 	 */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 		ia = (struct in_ifaddr *)ifa;
 		if (ia->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr)
 			break;
 	}
 	/*
 	 * No exact match: fall back to the first IPv4 address on the
 	 * interface that passes the prison check.
 	 */
 	if (ifa == NULL)
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 			if (ifa->ifa_addr->sa_family == AF_INET) {
 				ia = (struct in_ifaddr *)ifa;
 				if (prison_check_ip4(cred,
 				    &ia->ia_addr.sin_addr) == 0)
 					break;
 			}
 
 	if (ifa == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EADDRNOTAVAIL);
 	}
 
 	/* ifa is non-NULL here, so ia was set by whichever loop broke out. */
 	error = 0;
 	switch (cmd) {
 	case SIOCGIFADDR:
 		*addr = ia->ia_addr;
 		break;
 
 	case SIOCGIFBRDADDR:
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EINVAL;
 			break;
 		}
 		*addr = ia->ia_broadaddr;
 		break;
 
 	case SIOCGIFDSTADDR:
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
 			error = EINVAL;
 			break;
 		}
 		*addr = ia->ia_dstaddr;
 		break;
 
 	case SIOCGIFNETMASK:
 		*addr = ia->ia_sockmask;
 		break;
 	}
 
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Add an IPv4 address to an interface (SIOCAIFADDR / OSIOCAIFADDR).
  *
  * 'data' is interpreted as a struct in_aliasreq.  The caller must hold
  * in_control_sx exclusively (asserted below).  If the address already
  * exists on the interface it is deleted first and then re-created from
  * the request.  The new address is linked onto the interface and global
  * lists, offered to the driver via SIOCSIFADDR, and prefix and loopback
  * routes are installed (skipped when a CARP vhid is given).  When this
  * is the first IPv4 address on a multicast-capable interface, the
  * all-hosts group is joined.
  *
  * Returns 0 on success or an errno value.  On the fail1/fail2 paths the
  * address is unlinked again and both list references are dropped.
  */
 static int
 in_aifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred)
 {
 	const struct in_aliasreq *ifra = (struct in_aliasreq *)data;
 	const struct sockaddr_in *addr = &ifra->ifra_addr;
 	const struct sockaddr_in *broadaddr = &ifra->ifra_broadaddr;
 	const struct sockaddr_in *mask = &ifra->ifra_mask;
 	const struct sockaddr_in *dstaddr = &ifra->ifra_dstaddr;
 	/* The vhid field is only read for SIOCAIFADDR, not the compat ioctl. */
 	const int vhid = (cmd == SIOCAIFADDR) ? ifra->ifra_vhid : 0;
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	bool iaIsFirst;
 	int error = 0;
 
 	error = priv_check_cred(cred, PRIV_NET_ADDIFADDR);
 	if (error)
 		return (error);
 
 	/*
 	 * ifra_addr must be present and be of INET family.
 	 * ifra_broadaddr/ifra_dstaddr and ifra_mask are optional.
 	 */
 	if (addr->sin_len != sizeof(struct sockaddr_in) ||
 	    addr->sin_family != AF_INET)
 		return (EINVAL);
 	if (broadaddr->sin_len != 0 &&
 	    (broadaddr->sin_len != sizeof(struct sockaddr_in) ||
 	    broadaddr->sin_family != AF_INET))
 		return (EINVAL);
 	if (mask->sin_len != 0 &&
 	    (mask->sin_len != sizeof(struct sockaddr_in) ||
 	    mask->sin_family != AF_INET))
 		return (EINVAL);
 	if ((ifp->if_flags & IFF_POINTOPOINT) &&
 	    (dstaddr->sin_len != sizeof(struct sockaddr_in) ||
 	     dstaddr->sin_addr.s_addr == INADDR_ANY))
 		return (EDESTADDRREQ);
 	if (vhid != 0 && carp_attach_p == NULL)
 		return (EPROTONOSUPPORT);
 
 	/*
 	 * See whether the address already exists.  iaIsFirst stays true
 	 * only if the interface has no other IPv4 address.
 	 */
 	iaIsFirst = true;
 	ia = NULL;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in_ifaddr *it;
 
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 
 		it = (struct in_ifaddr *)ifa;
 		if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
 		    prison_check_ip4(cred, &addr->sin_addr) == 0)
 			ia = it;
 		else
 			iaIsFirst = false;
 	}
 	NET_EPOCH_EXIT(et);
 
 	/* Replace semantics: drop the existing address; the result is ignored. */
 	if (ia != NULL)
 		(void )in_difaddr_ioctl(cmd, data, ifp, cred);
 
 	ifa = ifa_alloc(sizeof(struct in_ifaddr), M_WAITOK);
 	ia = (struct in_ifaddr *)ifa;
 	ifa->ifa_addr = (struct sockaddr *)&ia->ia_addr;
 	ifa->ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr;
 	ifa->ifa_netmask = (struct sockaddr *)&ia->ia_sockmask;
 	callout_init_rw(&ia->ia_garp_timer, &ifp->if_addr_lock,
 	    CALLOUT_RETURNUNLOCKED);
 
 	ia->ia_ifp = ifp;
 	ia->ia_addr = *addr;
 	if (mask->sin_len != 0) {
 		ia->ia_sockmask = *mask;
 		ia->ia_subnetmask = ntohl(ia->ia_sockmask.sin_addr.s_addr);
 	} else {
 		in_addr_t i = ntohl(addr->sin_addr.s_addr);
 
 		/*
 		 * If netmask isn't supplied, use historical default.
 		 * This is deprecated for interfaces other than loopback
 		 * or point-to-point; warn in other cases.  In the future
 		 * we should return an error rather than warning.
 		 */
 		if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0)
 			printf("%s: set address: WARNING: network mask "
 			     "should be specified; using historical default\n",
 			     ifp->if_xname);
 		if (IN_CLASSA(i))
 			ia->ia_subnetmask = IN_CLASSA_NET;
 		else if (IN_CLASSB(i))
 			ia->ia_subnetmask = IN_CLASSB_NET;
 		else
 			ia->ia_subnetmask = IN_CLASSC_NET;
 		ia->ia_sockmask.sin_addr.s_addr = htonl(ia->ia_subnetmask);
 	}
 	ia->ia_subnet = ntohl(addr->sin_addr.s_addr) & ia->ia_subnetmask;
 	in_socktrim(&ia->ia_sockmask);
 
 	if (ifp->if_flags & IFF_BROADCAST) {
 		if (broadaddr->sin_len != 0) {
 			ia->ia_broadaddr = *broadaddr;
 		} else if (ia->ia_subnetmask == IN_RFC3021_MASK) {
 			ia->ia_broadaddr.sin_addr.s_addr = INADDR_BROADCAST;
 			ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in);
 			ia->ia_broadaddr.sin_family = AF_INET;
 		} else {
 			/* Default broadcast: all host bits of the subnet set. */
 			ia->ia_broadaddr.sin_addr.s_addr =
 			    htonl(ia->ia_subnet | ~ia->ia_subnetmask);
 			ia->ia_broadaddr.sin_len = sizeof(struct sockaddr_in);
 			ia->ia_broadaddr.sin_family = AF_INET;
 		}
 	}
 
 	if (ifp->if_flags & IFF_POINTOPOINT)
 		ia->ia_dstaddr = *dstaddr;
 
 	if (vhid != 0) {
 		error = (*carp_attach_p)(&ia->ia_ifa, vhid);
 		if (error) {
 			/*
 			 * The address has not been linked anywhere yet, so
 			 * the reference handed out by ifa_alloc() is the
 			 * only one; drop it rather than leaking the ifaddr.
 			 */
 			ifa_free(ifa);
 			return (error);
 		}
 	}
 
 	/* if_addrhead is already referenced by ifa_alloc() */
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 
 	ifa_ref(ifa);			/* in_ifaddrhead */
 	sx_assert(&in_control_sx, SA_XLOCKED);
 	CK_STAILQ_INSERT_TAIL(&V_in_ifaddrhead, ia, ia_link);
 	CK_LIST_INSERT_HEAD(INADDR_HASH(ia->ia_addr.sin_addr.s_addr), ia,
 	    ia_hash);
 
 	/*
 	 * Give the interface a chance to initialize
 	 * if this is its first address,
 	 * and to validate the address if necessary.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
 		if (error)
 			goto fail1;
 	}
 
 	/*
 	 * Add route for the network.
 	 */
 	if (vhid == 0) {
 		error = in_addprefix(ia);
 		if (error)
 			goto fail1;
 	}
 
 	/*
 	 * Add a loopback route to self, unless another interface address
 	 * with the same IP already exists.
 	 */
 	if (vhid == 0 && ia_need_loopback_route(ia)) {
 		struct in_ifaddr *eia;
 
 		eia = in_localip_more(ia);
 
 		if (eia == NULL) {
 			error = ifa_add_loopback_route((struct ifaddr *)ia,
 			    (struct sockaddr *)&ia->ia_addr);
 			if (error)
 				goto fail2;
 		} else
 			ifa_free(&eia->ia_ifa);
 	}
 
 	if (iaIsFirst && (ifp->if_flags & IFF_MULTICAST)) {
 		struct in_addr allhosts_addr;
 		struct in_ifinfo *ii;
 
 		ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
 		allhosts_addr.s_addr = htonl(INADDR_ALLHOSTS_GROUP);
 
 		/* A join failure is returned, but the address stays configured. */
 		error = in_joingroup(ifp, &allhosts_addr, NULL,
 			&ii->ii_allhosts);
 	}
 
 	/*
 	 * Note: we don't need extra reference for ifa, since we called
 	 * with sx lock held, and ifaddr can not be deleted in concurrent
 	 * thread.
 	 */
 	EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, ifa, IFADDR_EVENT_ADD);
 
 	return (error);
 
 fail2:
 	if (vhid == 0)
 		(void )in_scrubprefix(ia, LLE_STATIC);
 
 fail1:
 	if (ia->ia_ifa.ifa_carp)
 		(*carp_detach_p)(&ia->ia_ifa, false);
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);		/* if_addrhead */
 
 	sx_assert(&in_control_sx, SA_XLOCKED);
 	CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link);
 	CK_LIST_REMOVE(ia, ia_hash);
 	ifa_free(&ia->ia_ifa);		/* in_ifaddrhead */
 
 	return (error);
 }
 
 /*
  * Delete an IPv4 address from an interface.
  *
  * 'data' is interpreted as a struct ifreq.  When the request carries a
  * well-formed AF_INET sockaddr, the matching address is removed;
  * otherwise the first permitted IPv4 address on the interface becomes
  * the candidate ("delete any").  A NULL 'cred' skips both the privilege
  * check and the prison checks.  The caller must hold in_control_sx
  * exclusively (asserted below).
  *
  * Returns 0 on success, EADDRNOTAVAIL when no address was selected, or
  * the error from priv_check_cred().
  */
 static int
 in_difaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred)
 {
 	const struct ifreq *ifr = (struct ifreq *)data;
 	const struct sockaddr_in *addr = (const struct sockaddr_in *)
 	    &ifr->ifr_addr;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	bool deleteAny, iaIsLast;
 	int error;
 
 	if (cred != NULL) {
 		error = priv_check_cred(cred, PRIV_NET_DELIFADDR);
 		if (error)
 			return (error);
 	}
 
 	if (addr->sin_len != sizeof(struct sockaddr_in) ||
 	    addr->sin_family != AF_INET)
 		deleteAny = true;
 	else
 		deleteAny = false;
 
 	iaIsLast = true;
 	ia = NULL;
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in_ifaddr *it;
 
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 
 		it = (struct in_ifaddr *)ifa;
 		/* "Delete any": take the first permitted address as candidate. */
 		if (deleteAny && ia == NULL && (cred == NULL ||
 		    prison_check_ip4(cred, &it->ia_addr.sin_addr) == 0))
 			ia = it;
 
 		/* An exact address match overrides any earlier candidate. */
 		if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
 		    (cred == NULL || prison_check_ip4(cred,
 		    &addr->sin_addr) == 0))
 			ia = it;
 
 		/* Any IPv4 entry other than the current candidate clears this. */
 		if (it != ia)
 			iaIsLast = false;
 	}
 
 	if (ia == NULL) {
 		IF_ADDR_WUNLOCK(ifp);
 		return (EADDRNOTAVAIL);
 	}
 
 	CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);		/* if_addrhead */
 
 	sx_assert(&in_control_sx, SA_XLOCKED);
 	CK_STAILQ_REMOVE(&V_in_ifaddrhead, ia, in_ifaddr, ia_link);
 	CK_LIST_REMOVE(ia, ia_hash);
 
 	/*
 	 * in_scrubprefix() kills the interface route.
 	 */
 	in_scrubprefix(ia, LLE_STATIC);
 
 	/*
 	 * in_ifadown gets rid of all the rest of
 	 * the routes.  This is not quite the right
 	 * thing to do, but at least if we are running
 	 * a routing process they will come back.
 	 */
 	in_ifadown(&ia->ia_ifa, 1);
 
 	/*
 	 * The second argument is true only when this deletion is part of
 	 * a SIOCAIFADDR request; its meaning is defined by the CARP module.
 	 */
 	if (ia->ia_ifa.ifa_carp)
 		(*carp_detach_p)(&ia->ia_ifa, cmd == SIOCAIFADDR);
 
 	/*
 	 * If this is the last IPv4 address configured on this
 	 * interface, leave the all-hosts group.
 	 * No state-change report need be transmitted.
 	 */
 	if (iaIsLast && (ifp->if_flags & IFF_MULTICAST)) {
 		struct in_ifinfo *ii;
 
 		ii = ((struct in_ifinfo *)ifp->if_afdata[AF_INET]);
 		if (ii->ii_allhosts) {
 			(void)in_leavegroup(ii->ii_allhosts, NULL);
 			ii->ii_allhosts = NULL;
 		}
 	}
 
 	/*
 	 * Cancel the gratuitous-ARP timer.  A return of 1 from
 	 * callout_stop() drops one reference here -- presumably the one
 	 * held on behalf of the pending callout; confirm against the code
 	 * that arms ia_garp_timer.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	if (callout_stop(&ia->ia_garp_timer) == 1) {
 		ifa_free(&ia->ia_ifa);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* Notify listeners before the final list reference is released. */
 	EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa,
 	    IFADDR_EVENT_DEL);
 	ifa_free(&ia->ia_ifa);		/* in_ifaddrhead */
 
 	return (0);
 }
 
 /*
  * Look up an IPv4 alias on an interface (SIOCGIFALIAS).
  *
  * 'data' is interpreted as a struct in_aliasreq whose ifra_addr names
  * the address to find.  On success ifra_mask is filled in, together with
  * either ifra_dstaddr (point-to-point interface) or ifra_broadaddr
  * (broadcast interface); if neither applies, ifra_broadaddr is zeroed.
  *
  * Returns 0 on success, EINVAL for a malformed ifra_addr, or
  * EADDRNOTAVAIL when no permitted matching address exists.
  */
 static int
 in_gifaddr_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp, struct ucred *cred)
 {
 	struct in_aliasreq *ifra = (struct in_aliasreq *)data;
 	const struct sockaddr_in *addr = &ifra->ifra_addr;
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia = NULL;
 	int error = EADDRNOTAVAIL;
 
 	/*
 	 * ifra_addr must be present and be of INET family.
 	 */
 	if (addr->sin_len != sizeof(struct sockaddr_in) ||
 	    addr->sin_family != AF_INET)
 		return (EINVAL);
 
 	NET_EPOCH_ENTER(et);
 
 	/*
 	 * Search the interface for the requested address.
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in_ifaddr *it;
 
 		if (ifa->ifa_addr->sa_family != AF_INET)
 			continue;
 
 		it = (struct in_ifaddr *)ifa;
 		if (it->ia_addr.sin_addr.s_addr == addr->sin_addr.s_addr &&
 		    prison_check_ip4(cred, &addr->sin_addr) == 0) {
 			ia = it;
 			break;
 		}
 	}
 
 	if (ia != NULL) {
 		ifra->ifra_mask = ia->ia_sockmask;
 		if ((ifp->if_flags & IFF_POINTOPOINT) &&
 		    ia->ia_dstaddr.sin_family == AF_INET) {
 			ifra->ifra_dstaddr = ia->ia_dstaddr;
 		} else if ((ifp->if_flags & IFF_BROADCAST) &&
 		    ia->ia_broadaddr.sin_family == AF_INET) {
 			ifra->ifra_broadaddr = ia->ia_broadaddr;
 		} else {
 			memset(&ifra->ifra_broadaddr, 0,
 			    sizeof(ifra->ifra_broadaddr));
 		}
 		error = 0;
 	}
 
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * Route filter callback: returns 1 when the nexthop's interface address
  * is the one passed in 'arg', 0 otherwise.  'rt' is not examined.
  */
 static int
 in_match_ifaddr(const struct rtentry *rt, const struct nhop_object *nh, void *arg)
 {
 	const struct ifaddr *wanted = (struct ifaddr *)arg;
 
 	return ((nh->nh_ifa == wanted) ? 1 : 0);
 }
 
 /*
  * Apply routing command 'cmd' for the prefix dst/netmask in fib 'fibnum'
  * on behalf of interface address 'ifa'.
  *
  * The gateway is a link-level sockaddr describing ifa's interface.  A
  * NULL netmask makes the route a host route.  The request is pinned and
  * filtered so that only a route whose nexthop uses 'ifa' is matched.
  * Returns the result of rib_handle_ifaddr_info().  Asserts the network
  * epoch.
  */
 static int
 in_handle_prefix_route(uint32_t fibnum, int cmd,
     struct sockaddr_in *dst, struct sockaddr_in *netmask, struct ifaddr *ifa,
     struct ifnet *ifp)
 {
 
 	NET_EPOCH_ASSERT();
 
 	/* Link-level gateway identifying the interface that owns ifa. */
 	struct sockaddr_dl_short gw = {
 		.sdl_family = AF_LINK,
 		.sdl_len = sizeof(struct sockaddr_dl_short),
 		.sdl_type = ifa->ifa_ifp->if_type,
 		.sdl_index = ifa->ifa_ifp->if_index,
 	};
 
 	struct rt_addrinfo req = {
 		.rti_ifa = ifa,
 		.rti_ifp = ifp,
 		.rti_flags = RTF_PINNED | ((netmask == NULL) ? RTF_HOST : 0),
 		.rti_info = {
 			[RTAX_DST] = (struct sockaddr *)dst,
 			[RTAX_NETMASK] = (struct sockaddr *)netmask,
 			[RTAX_GATEWAY] = (struct sockaddr *)&gw,
 		},
 		/* Only touch the prefix if it belongs to this ifa. */
 		.rti_filter = in_match_ifaddr,
 		.rti_filterdata = ifa,
 	};
 
 	return (rib_handle_ifaddr_info(fibnum, cmd, &req));
 }
 
 /*
  * Routing table interaction with interface addresses.
  *
  * In general, two types of routes need to be installed:
  * a) "interface" or "prefix" route, telling user that the addresses
  *   behind the ifa prefix are reached directly.
  * b) "loopback" route installed for the ifa address, telling user that
  *   the address belongs to local system.
  *
  * Handling for (a) and (b) differs in multi-fib aspects, hence they
  *  are implemented in different functions below.
  *
  * The cases above may intersect - /32 interface aliases results in
  *  the same prefix produced by (a) and (b). This blurs the definition
  *  of the "loopback" route and complicate interactions. The interaction
  *  table is defined below. The case numbers are used in the multiple
  *  functions below to refer to the particular test case.
  *
  * There can be multiple options:
  * 1) Adding address with prefix on non-p2p/non-loopback interface.
  *  Example: 192.0.2.1/24. Action:
  *  * add "prefix" route towards 192.0.2.0/24 via @ia interface,
  *    using @ia as an address source.
  *  * add "loopback" route towards 192.0.2.1 via V_loif, saving
  *   @ia ifp in the gateway and using @ia as an address source.
  *
  * 2) Adding address with /32 mask to non-p2p/non-loopback interface.
  *  Example: 192.0.2.2/32. Action:
  *  * add "prefix" host route via V_loif, using @ia as an address source.
  *
  * 3) Adding address with or without prefix to p2p interface.
  *  Example: 10.0.0.1/24->10.0.0.2. Action:
  *  * add "prefix" host route towards 10.0.0.2 via this interface, using @ia
  *    as an address source. Note: no sense in installing full /24 as the interface
  *    is point-to-point.
  *  * add "loopback" route towards 10.0.0.1 via V_loif, saving
  *   @ia ifp in the gateway and using @ia as an address source.
  *
  * 4) Adding address with or without prefix to loopback interface.
  *  Example: 192.0.2.1/24. Action:
  *  * add "prefix" host route via @ia interface, using @ia as an address source.
  *    Note: Skip installing /24 prefix as it would introduce TTL loop
  *    for the traffic destined to these addresses.
  */
 
 /*
  * Checks if @ia needs to install loopback route to @ia address via
  *  ifa_maintain_loopback_route().
  *
  * Return true on success.
  */
 static bool
 ia_need_loopback_route(const struct in_ifaddr *ia)
 {
 	struct ifnet *ifp = ia->ia_ifp;
 
 	/* Case 4: Skip loopback interfaces */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (ia->ia_addr.sin_addr.s_addr == INADDR_ANY))
 		return (false);
 
 	/* Clash avoidance: Skip p2p interfaces with both addresses are equal */
 	if ((ifp->if_flags & IFF_POINTOPOINT) &&
 	    ia->ia_dstaddr.sin_addr.s_addr == ia->ia_addr.sin_addr.s_addr)
 		return (false);
 
 	/* Case 2: skip /32 prefixes */
 	if (!(ifp->if_flags & IFF_POINTOPOINT) &&
 	    (ia->ia_sockmask.sin_addr.s_addr == INADDR_BROADCAST))
 		return (false);
 
 	return (true);
 }
 
 /*
  * Compute the "prefix" route for @ia, storing the destination in @prefix
  * and the netmask in @mask.  A host route is reported as an all-ones mask.
  */
 static void
 ia_getrtprefix(const struct in_ifaddr *ia, struct in_addr *prefix, struct in_addr *mask)
 {
 	const int ifflags = ia->ia_ifp->if_flags;
 
 	if ((ifflags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) {
 		/* Cases 1,2: return actual ia prefix */
 		mask->s_addr = ia->ia_sockmask.sin_addr.s_addr;
 		prefix->s_addr = ia->ia_addr.sin_addr.s_addr & mask->s_addr;
 		return;
 	}
 
 	if ((ifflags & IFF_POINTOPOINT) != 0) {
 		/* Case 3: return host route for dstaddr */
 		*prefix = ia->ia_dstaddr.sin_addr;
 	} else {
 		/* Case 4: return host route for ifaddr */
 		*prefix = ia->ia_addr.sin_addr;
 	}
 	mask->s_addr = INADDR_BROADCAST;
 }
 
 /*
  * Adds or delete interface "prefix" route corresponding to @ifa.
  *  Returns 0 on success or errno.
  */
 static int
 in_handle_ifaddr_route(int cmd, struct in_ifaddr *ia)
 {
 	struct ifaddr *ifa = &ia->ia_ifa;
 	struct in_addr daddr, maddr;
 	struct sockaddr_in *pmask;
 	struct epoch_tracker et;
 	int error;
 
 	ia_getrtprefix(ia, &daddr, &maddr);
 
 	struct sockaddr_in mask = {
 		.sin_family = AF_INET,
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_addr = maddr,
 	};
 
 	pmask = (maddr.s_addr != INADDR_BROADCAST) ? &mask : NULL;
 
 	struct sockaddr_in dst = {
 		.sin_family = AF_INET,
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_addr.s_addr = daddr.s_addr & maddr.s_addr,
 	};
 
 	struct ifnet *ifp = ia->ia_ifp;
 
 	if ((maddr.s_addr == INADDR_BROADCAST) &&
 	    (!(ia->ia_ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)))) {
 		/* Case 2: host route on broadcast interface */
 		ifp = V_loif;
 	}
 
 	uint32_t fibnum = ifa->ifa_ifp->if_fib;
 	NET_EPOCH_ENTER(et);
 	error = in_handle_prefix_route(fibnum, cmd, &dst, pmask, ifa, ifp);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Check if we have a route for the given prefix already.
  */
 static bool
 in_hasrtprefix(struct in_ifaddr *target)
 {
 	struct epoch_tracker et;
 	struct in_ifaddr *ia;
 	struct in_addr prefix, mask, p, m;
 	bool result = false;
 
 	ia_getrtprefix(target, &prefix, &mask);
 
 	/* Look for an existing address with the same prefix, mask, and fib */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 		ia_getrtprefix(ia, &p, &m);
 
 		if (prefix.s_addr != p.s_addr ||
 		    mask.s_addr != m.s_addr)
 			continue;
 
 		if (target->ia_ifp->if_fib != ia->ia_ifp->if_fib)
 			continue;
 
 		/*
 		 * If we got a matching prefix route inserted by other
 		 * interface address, we are done here.
 		 */
 		if (ia->ia_flags & IFA_ROUTE) {
 			result = true;
 			break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (result);
 }
 
 int
 in_addprefix(struct in_ifaddr *target)
 {
 	int error;
 
 	if (in_hasrtprefix(target)) {
 		if (V_nosameprefix)
 			return (EEXIST);
 		else {
 			rt_addrmsg(RTM_ADD, &target->ia_ifa,
 			    target->ia_ifp->if_fib);
 			return (0);
 		}
 	}
 
 	/*
 	 * No-one seem to have this prefix route, so we try to insert it.
 	 */
 	rt_addrmsg(RTM_ADD, &target->ia_ifa, target->ia_ifp->if_fib);
 	error = in_handle_ifaddr_route(RTM_ADD, target);
 	if (!error)
 		target->ia_flags |= IFA_ROUTE;
 	return (error);
 }
 
 /*
  * Removes link-layer entries for @ia: every entry within its prefix when
  * @all is non-zero, otherwise only the entry for the @ia address itself.
  * @flags is forwarded to lltable_prefix_free() in the former case.
  */
 static void
 in_scrubprefixlle(struct in_ifaddr *ia, int all, u_int flags)
 {
 	struct sockaddr_in addr, mask;
 	struct ifnet *ifp = ia->ia_ifp;
 
 	bzero(&addr, sizeof(addr));
 	addr.sin_len = sizeof(addr);
 	addr.sin_family = AF_INET;
 
 	if (!all) {
 		/* Remove interface address only */
 		addr.sin_addr.s_addr = ia->ia_addr.sin_addr.s_addr;
 		lltable_delete_addr(LLTABLE(ifp), LLE_IFADDR,
 		    (struct sockaddr *)&addr);
 		return;
 	}
 
 	bzero(&mask, sizeof(mask));
 	mask.sin_len = sizeof(mask);
 	mask.sin_family = AF_INET;
 	mask.sin_addr.s_addr = ia->ia_subnetmask;
 
 	/*
 	 * Remove all L2 entries matching given prefix.
 	 * Convert address to host representation to avoid
 	 * doing this on every callback. ia_subnetmask is already
 	 * stored in host representation.
 	 */
 	addr.sin_addr.s_addr = ntohl(ia->ia_addr.sin_addr.s_addr);
 	lltable_prefix_free(AF_INET, (struct sockaddr *)&addr,
 	    (struct sockaddr *)&mask, flags);
 }
 
 /*
  * If there is no other address in the system that can serve a route to the
  * same prefix, remove the route.  Hand over the route to the new address
  * otherwise.
  *
  * @target is the address being removed.  @flags is tested for LLE_STATIC
  * here and is otherwise passed through to in_scrubprefixlle().
  *
  * Returns 0 when @target does not own the prefix route (IFA_ROUTE clear),
  * otherwise the result of the last in_handle_ifaddr_route() call made.
  * The status of the loopback route removal is never returned: every
  * return path below either returns 0 or overwrites error first.
  */
 int
 in_scrubprefix(struct in_ifaddr *target, u_int flags)
 {
 	struct epoch_tracker et;
 	struct in_ifaddr *ia;
 	struct in_addr prefix, mask, p, m;
 	int error = 0;
 
 	/*
 	 * Remove the loopback route to the interface address.
 	 */
 	if (ia_need_loopback_route(target) && (flags & LLE_STATIC)) {
 		struct in_ifaddr *eia;
 
 		eia = in_localip_more(target);
 
 		if (eia != NULL) {
 			/* Switch the loopback route over to eia, then drop it. */
 			error = ifa_switch_loopback_route((struct ifaddr *)eia,
 			    (struct sockaddr *)&target->ia_addr);
 			ifa_free(&eia->ia_ifa);
 		} else {
 			error = ifa_del_loopback_route((struct ifaddr *)target,
 			    (struct sockaddr *)&target->ia_addr);
 		}
 	}
 
 	ia_getrtprefix(target, &prefix, &mask);
 
 	if ((target->ia_flags & IFA_ROUTE) == 0) {
 		rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib);
 
 		/*
 		 * Removing address from !IFF_UP interface or
 		 * prefix which exists on other interface (along with route).
 		 * No entries should exist here except target addr.
 		 * Given that, delete this entry only.
 		 */
 		in_scrubprefixlle(target, 0, flags);
 		return (0);
 	}
 
 	/* Search for another address that can take over the prefix route. */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 		ia_getrtprefix(ia, &p, &m);
 
 		if (prefix.s_addr != p.s_addr ||
 		    mask.s_addr != m.s_addr)
 			continue;
 
 		if ((ia->ia_ifp->if_flags & IFF_UP) == 0)
 			continue;
 
 		/*
 		 * If we got a matching prefix address, move IFA_ROUTE and
 		 * the route itself to it.  Make sure that routing daemons
 		 * get a heads-up.
 		 */
 		if ((ia->ia_flags & IFA_ROUTE) == 0) {
 			/* Hold ia across the epoch exit; released below. */
 			ifa_ref(&ia->ia_ifa);
 			NET_EPOCH_EXIT(et);
 			error = in_handle_ifaddr_route(RTM_DELETE, target);
 			if (error == 0)
 				target->ia_flags &= ~IFA_ROUTE;
 			else
 				log(LOG_INFO, "in_scrubprefix: err=%d, old prefix delete failed\n",
 					error);
 			/* Scrub all entries IFF interface is different */
 			in_scrubprefixlle(target, target->ia_ifp != ia->ia_ifp,
 			    flags);
 			/* The add is attempted even if the delete above failed. */
 			error = in_handle_ifaddr_route(RTM_ADD, ia);
 			if (error == 0)
 				ia->ia_flags |= IFA_ROUTE;
 			else
 				log(LOG_INFO, "in_scrubprefix: err=%d, new prefix add failed\n",
 					error);
 			ifa_free(&ia->ia_ifa);
 			return (error);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * remove all L2 entries on the given prefix
 	 */
 	in_scrubprefixlle(target, 1, flags);
 
 	/*
 	 * As no-one seem to have this prefix, we can remove the route.
 	 */
 	rt_addrmsg(RTM_DELETE, &target->ia_ifa, target->ia_ifp->if_fib);
 	error = in_handle_ifaddr_route(RTM_DELETE, target);
 	if (error == 0)
 		target->ia_flags &= ~IFA_ROUTE;
 	else
 		log(LOG_INFO, "in_scrubprefix: err=%d, prefix delete failed\n", error);
 	return (error);
 }
 
 void
 in_ifscrub_all(void)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa, *nifa;
 	struct ifaliasreq ifr;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* Cannot lock here - lock recursion. */
 		/* NET_EPOCH_ENTER(et); */
 		CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 
 			/*
 			 * This is ugly but the only way for legacy IP to
 			 * cleanly remove addresses and everything attached.
 			 */
 			bzero(&ifr, sizeof(ifr));
 			ifr.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr)
 			ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
 			(void)in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr,
 			    ifp, NULL);
 		}
 		/* NET_EPOCH_EXIT(et); */
 		in_purgemaddrs(ifp);
 		igmp_domifdetach(ifp);
 	}
 	IFNET_RUNLOCK();
 }
 
 /*
  * Return 1 if @in is a broadcast address for the interface address @ia,
  * 0 otherwise.
  */
 int
 in_ifaddr_broadcast(struct in_addr in, struct in_ifaddr *ia)
 {
 
 	/*
 	 * Check for an all one subnetmask. These
 	 * only exist when an interface gets a secondary
 	 * address.
 	 */
 	if (ia->ia_subnetmask == (u_long)0xffffffff)
 		return (0);
 
 	if (in.s_addr == ia->ia_broadaddr.sin_addr.s_addr)
 		return (1);
 
 	/*
 	 * Optionally check for old-style (host 0) broadcast, but
 	 * taking into account that RFC 3021 obsoletes it.
 	 */
 	if (V_broadcast_lowest && ia->ia_subnetmask != IN_RFC3021_MASK &&
 	    ntohl(in.s_addr) == ia->ia_subnet)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Return 1 if the address might be a local broadcast address.
  */
 int
 in_broadcast(struct in_addr in, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	int found;
 
 	NET_EPOCH_ASSERT();
 
 	if (in.s_addr == INADDR_BROADCAST ||
 	    in.s_addr == INADDR_ANY)
 		return (1);
 	if ((ifp->if_flags & IFF_BROADCAST) == 0)
 		return (0);
 	found = 0;
 	/*
 	 * Look through the list of addresses for a match
 	 * with a broadcast address.
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_INET &&
 		    in_ifaddr_broadcast(in, (struct in_ifaddr *)ifa)) {
 			found = 1;
 			break;
 		}
 	return (found);
 }
 
 /*
  * On interface removal, clean up IPv4 data structures hung off of the ifnet.
  *
  * Under IN_MULTI_LOCK, purges references to @ifp from the raw, UDP and
  * UDP-Lite pcb infos and drops the interface's multicast addresses, then
  * waits for pending multicast releases to finish.
  */
 void
 in_ifdetach(struct ifnet *ifp)
 {
 	IN_MULTI_LOCK();
 	in_pcbpurgeif0(&V_ripcbinfo, ifp);
 	in_pcbpurgeif0(&V_udbinfo, ifp);
 	in_pcbpurgeif0(&V_ulitecbinfo, ifp);
 	in_purgemaddrs(ifp);
 	IN_MULTI_UNLOCK();
 
 	/*
 	 * Make sure all multicast deletions invoking if_ioctl() are
 	 * completed before returning. Else we risk accessing a freed
 	 * ifnet structure pointer.
 	 */
 	inm_release_wait(NULL);
 }
 
 /*
  * ifnet_event handler for IPv4.
  *
  * IFNET_EVENT_DOWN: for every AF_INET address on @ifp that owns a prefix
  * route (IFA_ROUTE), scrub the prefix route and call in_ifadown().
  * IFNET_EVENT_UP: for every AF_INET address on @ifp without IFA_ROUTE,
  * re-create the loopback route and install the prefix route.
  * Other events are ignored.  The whole handler runs inside the net epoch.
  */
 static void
 in_ifnet_event(void *arg __unused, struct ifnet *ifp, int event)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in_ifaddr *ia;
 	int error;
 
 	NET_EPOCH_ENTER(et);
 	switch (event) {
 	case IFNET_EVENT_DOWN:
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = (struct in_ifaddr *)ifa;
 			if ((ia->ia_flags & IFA_ROUTE) == 0)
 				continue;
 			/* Hold the address while its routes are torn down. */
 			ifa_ref(ifa);
 			/*
 			 * in_scrubprefix() kills the interface route.
 			 */
 			in_scrubprefix(ia, 0);
 			/*
 			 * in_ifadown gets rid of all the rest of the
 			 * routes.  This is not quite the right thing
 			 * to do, but at least if we are running a
 			 * routing process they will come back.
 			 */
 			in_ifadown(ifa, 0);
 			ifa_free(ifa);
 		}
 		break;
 
 	case IFNET_EVENT_UP:
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = (struct in_ifaddr *)ifa;
 			if (ia->ia_flags & IFA_ROUTE)
 				continue;
 			ifa_ref(ifa);
 			/*
 			 * Only the in_handle_ifaddr_route() result is acted
 			 * upon; the loopback route results are stored in
 			 * error but never examined.
 			 */
 			error = ifa_del_loopback_route(ifa, ifa->ifa_addr);
 			rt_addrmsg(RTM_ADD, ifa, ifa->ifa_ifp->if_fib);
 			error = in_handle_ifaddr_route(RTM_ADD, ia);
 			if (error == 0)
 				ia->ia_flags |= IFA_ROUTE;
 			error = ifa_add_loopback_route(ifa, ifa->ifa_addr);
 			ifa_free(ifa);
 		}
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 }
 /* Register in_ifnet_event() for ifnet_event notifications. */
 EVENTHANDLER_DEFINE(ifnet_event, in_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
 
 /*
  * Delete all IPv4 multicast address records, and associated link-layer
  * multicast address records, associated with ifp.
  * XXX It looks like domifdetach runs AFTER the link layer cleanup.
  * XXX This should not race with ifma_protospec being set during
  * a new allocation, if it does, we have bigger problems.
  *
  * The in_multi records are first collected on a local list while the
  * interface address lock is held, and released only after it is dropped.
  * IN_MULTI_LIST_LOCK is held across the whole operation.
  */
 static void
 in_purgemaddrs(struct ifnet *ifp)
 {
 	struct epoch_tracker	 et;
 	struct in_multi_head purgeinms;
 	struct in_multi		*inm;
 	struct ifmultiaddr	*ifma;
 
 	SLIST_INIT(&purgeinms);
 	IN_MULTI_LIST_LOCK();
 
 	/*
 	 * Extract list of in_multi associated with the detaching ifp
 	 * which the PF_INET layer is about to release.
 	 * We need to do this as IF_ADDR_LOCK() may be re-acquired
 	 * by code further down.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = inm_ifmultiaddr_get_inm(ifma);
 		/* Skip link-layer memberships with no in_multi attached. */
 		if (inm == NULL)
 			continue;
 		inm_rele_locked(&purgeinms, inm);
 	}
 	NET_EPOCH_EXIT(et);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* Release the collected records and tear down IGMP state for ifp. */
 	inm_release_list_deferred(&purgeinms);
 	igmp_ifdetach(ifp);
 	IN_MULTI_LIST_UNLOCK();
 }
 
 /*
  * IPv4 link-layer table entry.  It currently only wraps the generic
  * struct llentry, which is what the lltable callbacks below operate on.
  */
 struct in_llentry {
 	struct llentry		base;	/* generic entry; first and only member */
 };
 
 /* Number of hash buckets requested for a new IPv4 lltable. */
 #define	IN_LLTBL_DEFAULT_HSIZE	32
 /*
  * Fold the bytes of key k together and reduce the result to a bucket
  * index by masking with (h) - 1; the mask only covers every bucket when
  * h is a power of two.  Both arguments are parenthesized so that an
  * expression such as "a ^ b" can be passed safely; k is evaluated more
  * than once and must not have side effects.
  */
 #define	IN_LLTBL_HASH(k, h) \
 	((((((((k) >> 8) ^ (k)) >> 8) ^ (k)) >> 8) ^ (k)) & ((h) - 1))
 
 /*
  * Epoch callback that performs the actual deallocation of the llentry
  * embedding @ctx: destroys its lock and request state, then frees it.
  */
 static void
 in_lltable_destroy_lle_unlocked(epoch_context_t ctx)
 {
 	struct llentry *entry = __containerof(ctx, struct llentry, lle_epoch_ctx);
 
 	LLE_LOCK_DESTROY(entry);
 	LLE_REQ_DESTROY(entry);
 	free(entry, M_LLTABLE);
 }
 
 /*
  * Called by LLE_FREE_LOCKED when number of references
  * drops to zero.
  *
  * Drops the write lock on @lle and schedules the real teardown through
  * NET_EPOCH_CALL rather than freeing the entry here.
  */
 static void
 in_lltable_destroy_lle(struct llentry *lle)
 {
 
 	LLE_WUNLOCK(lle);
 	NET_EPOCH_CALL(in_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx);
 }
 
 /*
  * Allocate and initialize a link-layer entry for @addr4 holding a single
  * reference.  @flags is not used here.
  *
  * Returns the embedded generic entry, or NULL if the non-sleeping
  * allocation fails (the caller reports the failure).
  */
 static struct llentry *
 in_lltable_new(struct in_addr addr4, u_int flags)
 {
 	struct in_llentry *ile;
 	struct llentry *lle;
 
 	ile = malloc(sizeof(*ile), M_LLTABLE, M_NOWAIT | M_ZERO);
 	if (ile == NULL)		/* NB: caller generates msg */
 		return (NULL);
 	lle = &ile->base;
 
 	/*
 	 * For IPv4 this will trigger "arpresolve" to generate
 	 * an ARP request.
 	 */
 	lle->la_expire = time_uptime; /* mark expired */
 	lle->r_l3addr.addr4 = addr4;
 	lle->lle_refcnt = 1;
 	lle->lle_free = in_lltable_destroy_lle;
 	LLE_LOCK_INIT(lle);
 	LLE_REQ_INIT(lle);
 	callout_init(&lle->lle_timer, 1);
 
 	return (lle);
 }
 
 /*
  * True when addresses d and a agree on every bit selected by mask m.
  * All three arguments are objects with an s_addr member.
  */
 #define IN_ARE_MASKED_ADDR_EQUAL(d, a, m)				\
 	((((d).s_addr ^ (a).s_addr) & (m).s_addr) == 0)
 
 /*
  * Prefix-match callback: returns 1 if @lle falls within @saddr/@smask and
  * should be removed given @flags, 0 otherwise.  The address in @saddr and
  * the mask in @smask are compared against the ntohl()-converted entry
  * address.
  */
 static int
 in_lltable_match_prefix(const struct sockaddr *saddr,
     const struct sockaddr *smask, u_int flags, struct llentry *lle)
 {
 	const struct in_addr addr = ((const struct sockaddr_in *)saddr)->sin_addr;
 	const struct in_addr mask = ((const struct sockaddr_in *)smask)->sin_addr;
 	const bool purge_static = (flags & LLE_STATIC) != 0;
 	struct in_addr lle_addr;
 
 	lle_addr.s_addr = ntohl(lle->r_l3addr.addr4.s_addr);
 
 	if (!IN_ARE_MASKED_ADDR_EQUAL(lle_addr, addr, mask))
 		return (0);
 
 	if ((lle->la_flags & LLE_IFADDR) != 0) {
 		/*
 		 * Delete LLE_IFADDR records IFF address & flag matches.
 		 * Note that addr is the interface address within prefix
 		 * being matched.
 		 * Note also we should handle 'ifdown' cases without removing
 		 * ifaddr macs.
 		 */
 		return ((addr.s_addr == lle_addr.s_addr && purge_static) ?
 		    1 : 0);
 	}
 
 	/* flags & LLE_STATIC means deleting both dynamic and static entries */
 	if (purge_static)
 		return (1);
 
 	return ((lle->la_flags & LLE_STATIC) == 0 ? 1 : 0);
 }
 
 /*
  * Unlink @lle from @llt if it is still linked, then free it and add the
  * number of packets reported by llentry_free() to the ARP "dropped"
  * statistic.  The entry must be write-locked by the caller.
  */
 static void
 in_lltable_free_entry(struct lltable *llt, struct llentry *lle)
 {
 	size_t pkts_dropped;
 
 	LLE_WLOCK_ASSERT(lle);
 	KASSERT(llt != NULL, ("lltable is NULL"));
 
 	/* Unlink entry from table if not already */
 	if ((lle->la_flags & LLE_LINKED) != 0) {
 		/* Unlinking requires the afdata write lock. */
 		IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
 		lltable_unlink_entry(llt, lle);
 	}
 
 	/* Drop hold queue */
 	pkts_dropped = llentry_free(lle);
 	ARPSTAT_ADD(dropped, pkts_dropped);
 }
 
 /*
  * Verify that a route in the fib of @ifp makes @l3addr eligible for a
  * link-layer entry on @ifp.  @flags is not examined.
  *
  * Returns 0 if the address is acceptable and EINVAL otherwise.
  */
 static int
 in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
 {
 	struct nhop_object *nh;
 	struct in_addr addr;
 
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 
 	addr = ((const struct sockaddr_in *)l3addr)->sin_addr;
 
 	nh = fib4_lookup(ifp->if_fib, addr, 0, NHR_NONE, 0);
 	if (nh == NULL)
 		return (EINVAL);
 
 	/*
 	 * If the gateway for an existing host route matches the target L3
 	 * address, which is a special route inserted by some implementation
 	 * such as MANET, and the interface is of the correct type, then
 	 * allow for ARP to proceed.
 	 *
 	 * NOTE(review): the memcmp() below compares the first
 	 * sizeof(in_addr_t) bytes of sa_data; for a sockaddr_in that region
 	 * presumably starts with sin_port rather than sin_addr - confirm
 	 * that this is the intended comparison.
 	 */
 	if (nh->nh_flags & NHF_GATEWAY) {
 		if (!(nh->nh_flags & NHF_HOST) || nh->nh_ifp->if_type != IFT_ETHER ||
 		    (nh->nh_ifp->if_flags & (IFF_NOARP | IFF_STATICARP)) != 0 ||
 		    memcmp(nh->gw_sa.sa_data, l3addr->sa_data,
 		    sizeof(in_addr_t)) != 0) {
 			return (EINVAL);
 		}
 	}
 
 	/*
 	 * Make sure that at least the destination address is covered
 	 * by the route. This is for handling the case where 2 or more
 	 * interfaces have the same prefix. An incoming packet arrives
 	 * on one interface and the corresponding outgoing packet leaves
 	 * another interface.
 	 */
 	if ((nh->nh_ifp != ifp) && (nh->nh_flags & NHF_HOST) == 0) {
 		struct in_ifaddr *ia = (struct in_ifaddr *)ifaof_ifpforaddr(l3addr, ifp);
 		struct in_addr dst_addr, mask_addr;
 
 		if (ia == NULL)
 			return (EINVAL);
 
 		/*
 		 * ifaof_ifpforaddr() returns _best matching_ IFA.
 		 * It is possible that ifa prefix does not cover our address.
 		 * Explicitly verify and fail if that's the case.
 		 */
 		dst_addr = IA_SIN(ia)->sin_addr;
 		/* ia_subnetmask is converted with htonl() before the compare. */
 		mask_addr.s_addr = htonl(ia->ia_subnetmask);
 
 		if (!IN_ARE_MASKED_ADDR_EQUAL(dst_addr, addr, mask_addr))
 			return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Hash the IPv4 address @dst into a bucket index for a table of @hsize
  * buckets.
  */
 static inline uint32_t
 in_lltable_hash_dst(const struct in_addr dst, uint32_t hsize)
 {
 	const in_addr_t key = dst.s_addr;
 
 	return (IN_LLTBL_HASH(key, hsize));
 }
 
 /*
  * lltable hash callback: bucket index of @lle, derived from its IPv4
  * address, for a table of @hsize buckets.
  */
 static uint32_t
 in_lltable_hash(const struct llentry *lle, uint32_t hsize)
 {
 	const struct in_addr key = lle->r_l3addr.addr4;
 
 	return (in_lltable_hash_dst(key, hsize));
 }
 
 /*
  * Write the IPv4 address of @lle into @sa as a zeroed, fully initialized
  * sockaddr_in.  @sa must be large enough to hold a struct sockaddr_in.
  */
 static void
 in_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
 {
 	struct sockaddr_in *out = (struct sockaddr_in *)sa;
 
 	bzero(out, sizeof(*out));
 	out->sin_len = sizeof(*out);
 	out->sin_family = AF_INET;
 	out->sin_addr = lle->r_l3addr.addr4;
 }
 
 /*
  * Search the hash bucket of @dst in @llt for an entry with that address
  * that is not marked LLE_DELETED.  Returns the entry, or NULL if the
  * bucket holds no such entry.  No entry lock is taken here.
  */
 static inline struct llentry *
 in_lltable_find_dst(struct lltable *llt, struct in_addr dst)
 {
 	struct llentries *bucket;
 	struct llentry *cur;
 
 	bucket = &llt->lle_head[in_lltable_hash_dst(dst, llt->llt_hsize)];
 	CK_LIST_FOREACH(cur, bucket, lle_next) {
 		if ((cur->la_flags & LLE_DELETED) == 0 &&
 		    cur->r_l3addr.addr4.s_addr == dst.s_addr)
 			return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * lltable delete callback: mark @lle as deleted, notify lle_event
  * handlers with LLENTRY_DELETED and release the entry via llentry_free().
  * @llt is not used.
  */
 static void
 in_lltable_delete_entry(struct lltable *llt, struct llentry *lle)
 {
 
 	/* Flag first so in_lltable_find_dst() skips the entry from now on. */
 	lle->la_flags |= LLE_DELETED;
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 #ifdef DIAGNOSTIC
 	log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
 #endif
 	llentry_free(lle);
 }
 
 /*
  * lltable allocation callback: create a new, not yet linked entry for
  * @l3addr on the interface of @llt.
  *
  * Unless LLE_IFADDR is requested, the address must first pass
  * in_lltable_rtcheck().  For LLE_IFADDR entries the link-layer header of
  * the interface's own address is precomputed and the entry is marked
  * static and valid.
  *
  * Returns the new entry, or NULL if the route check, the allocation or
  * the link-layer header calculation fails.
  */
 static struct llentry *
 in_lltable_alloc(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr;
 	struct ifnet *ifp = llt->llt_ifp;
 	struct llentry *lle;
 	char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 
 	/*
 	 * A route that covers the given address must have
 	 * been installed 1st because we are doing a resolution,
 	 * verify this.
 	 */
 	if (!(flags & LLE_IFADDR) &&
 	    in_lltable_rtcheck(ifp, flags, l3addr) != 0)
 		return (NULL);
 
 	lle = in_lltable_new(sin->sin_addr, flags);
 	if (lle == NULL) {
 		log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
 		return (NULL);
 	}
 	lle->la_flags = flags;
 	if (flags & LLE_STATIC)
 		lle->r_flags |= RLLE_VALID;
 	if ((flags & LLE_IFADDR) == LLE_IFADDR) {
 		/* In/out: buffer capacity in, computed header length out. */
 		linkhdrsize = LLE_MAX_LINKHDR;
 		if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp),
 		    linkhdr, &linkhdrsize, &lladdr_off) != 0) {
 			in_lltable_free_entry(llt, lle);
 			return (NULL);
 		}
 		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
 		    lladdr_off);
 		lle->la_flags |= LLE_STATIC;
 		lle->r_flags |= (RLLE_VALID | RLLE_IFADDR);
 	}
 
 	return (lle);
 }
 
 /*
  * Find the entry for @l3addr in @llt.
  *
  * Returns NULL if no entry is found, if it is marked for deletion, or if
  * it was unlinked before its lock could be taken.  With LLE_UNLOCKED the
  * entry is returned without locking; otherwise it is returned write-locked
  * when LLE_EXCLUSIVE is set and read-locked when it is not.
  */
 static struct llentry *
 in_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in *sin;
 	struct llentry *lle;
 	bool exclusive;
 
 	IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 	KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) !=
 	    (LLE_UNLOCKED | LLE_EXCLUSIVE),
 	    ("wrong lle request flags: %#x", flags));
 
 	sin = (const struct sockaddr_in *)l3addr;
 	lle = in_lltable_find_dst(llt, sin->sin_addr);
 	if (lle == NULL || (flags & LLE_UNLOCKED) != 0)
 		return (lle);
 
 	exclusive = (flags & LLE_EXCLUSIVE) != 0;
 	if (exclusive)
 		LLE_WLOCK(lle);
 	else
 		LLE_RLOCK(lle);
 
 	/*
 	 * If the afdata lock is not held, the LLE may have been unlinked while
 	 * we were blocked on the LLE lock.  Check for this case.
 	 */
 	if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) {
 		if (exclusive)
 			LLE_WUNLOCK(lle);
 		else
 			LLE_RUNLOCK(lle);
 		return (NULL);
 	}
 
 	return (lle);
 }
 
 /*
  * lltable dump callback: emit @lle to the sysctl request @wr as an
  * RTM_GET message made of a route header, the IPv4 address and a
  * link-level address.
  *
  * Deleted entries, and entries rejected by prison_if() for the requesting
  * credential, are skipped and reported as success.  Returns 0 or the
  * error from SYSCTL_OUT().
  */
 static int
 in_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
     struct sysctl_req *wr)
 {
 	struct ifnet *ifp = llt->llt_ifp;
 	/* XXX stack use */
 	struct {
 		struct rt_msghdr	rtm;
 		struct sockaddr_in	sin;
 		struct sockaddr_dl	sdl;
 	} arpc;
 	struct sockaddr_dl *sdl;
 	int error;
 
 	bzero(&arpc, sizeof(arpc));
 	/* skip deleted entries */
 	if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
 		return (0);
 	/* Skip if jailed and not a valid IP of the prison. */
 	lltable_fill_sa_entry(lle,(struct sockaddr *)&arpc.sin);
 	if (prison_if(wr->td->td_ucred, (struct sockaddr *)&arpc.sin) != 0)
 		return (0);
 	/*
 	 * produce a msg made of:
 	 *  struct rt_msghdr;
 	 *  struct sockaddr_in; (IPv4)
 	 *  struct sockaddr_dl;
 	 */
 	arpc.rtm.rtm_msglen = sizeof(arpc);
 	arpc.rtm.rtm_version = RTM_VERSION;
 	arpc.rtm.rtm_type = RTM_GET;
 	arpc.rtm.rtm_flags = RTF_UP;
 	arpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
 
 	/* publish */
 	if (lle->la_flags & LLE_PUB)
 		arpc.rtm.rtm_flags |= RTF_ANNOUNCE;
 
 	sdl = &arpc.sdl;
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_len = sizeof(*sdl);
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = ifp->if_type;
 	/* Only valid entries carry a link-layer address in the reply. */
 	if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
 		sdl->sdl_alen = ifp->if_addrlen;
 		bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
 	} else {
 		sdl->sdl_alen = 0;
 		bzero(LLADDR(sdl), ifp->if_addrlen);
 	}
 
 	/* Static entries are reported with an expiry of 0. */
 	arpc.rtm.rtm_rmx.rmx_expire =
 	    lle->la_flags & LLE_STATIC ? 0 : lle->la_expire;
 	arpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
 	if (lle->la_flags & LLE_STATIC)
 		arpc.rtm.rtm_flags |= RTF_STATIC;
 	if (lle->la_flags & LLE_IFADDR)
 		arpc.rtm.rtm_flags |= RTF_PINNED;
 	arpc.rtm.rtm_index = ifp->if_index;
 	error = SYSCTL_OUT(wr, &arpc, sizeof(arpc));
 
 	return (error);
 }
 
 /*
  * lltable post-resolution callback: for published (LLE_PUB) entries,
  * send an ARP request whose source and target are both the entry address,
  * using the entry's link-layer address.
  */
 static void
 in_lltable_post_resolved(struct lltable *llt, struct llentry *lle)
 {
 	struct ifnet *ifp = llt->llt_ifp;
 	struct in_addr *self = &lle->r_l3addr.addr4;
 
 	if ((lle->la_flags & LLE_PUB) == 0)
 		return;
 
 	/* gratuitous ARP */
 	arprequest(ifp, self, self, lle->ll_addr);
 }
 
 /*
  * Allocate the IPv4 link-layer table for @ifp, install the IPv4 callbacks
  * and link the table.  Returns the new table.
  */
 static struct lltable *
 in_lltattach(struct ifnet *ifp)
 {
 	struct lltable *tbl;
 
 	tbl = lltable_allocate_htbl(IN_LLTBL_DEFAULT_HSIZE);
 	tbl->llt_af = AF_INET;
 	tbl->llt_ifp = ifp;
 
 	/* Entry lifecycle callbacks. */
 	tbl->llt_lookup = in_lltable_lookup;
 	tbl->llt_alloc_entry = in_lltable_alloc;
 	tbl->llt_delete_entry = in_lltable_delete_entry;
 	tbl->llt_dump_entry = in_lltable_dump_entry;
 	tbl->llt_hash = in_lltable_hash;
 	tbl->llt_fill_sa_entry = in_lltable_fill_sa_entry;
 	tbl->llt_free_entry = in_lltable_free_entry;
 	tbl->llt_match_prefix = in_lltable_match_prefix;
 	tbl->llt_mark_used = llentry_mark_used;
 	tbl->llt_post_resolved = in_lltable_post_resolved;
 
 	/* Link the table only once it is fully set up. */
 	lltable_link(tbl);
 
 	return (tbl);
 }
 
 /*
  * Return the IPv4 link-layer table of @ifp, or NULL if the interface has
  * no AF_INET per-interface data attached.
  */
 struct lltable *
 in_lltable_get(struct ifnet *ifp)
 {
 	struct in_ifinfo *ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
 
 	return (ii != NULL ? ii->ii_llt : NULL);
 }
 
 /*
  * Allocate the IPv4 per-interface data for @ifp and attach its link-layer
  * table and IGMP state.  Returns the new in_ifinfo.
  */
 void *
 in_domifattach(struct ifnet *ifp)
 {
 	struct in_ifinfo *info;
 
 	info = malloc(sizeof(*info), M_IFADDR, M_WAITOK | M_ZERO);
 	info->ii_llt = in_lltattach(ifp);
 	info->ii_igmp = igmp_domifattach(ifp);
 
 	return (info);
 }
 
 /*
  * Tear down the IPv4 per-interface data @aux (a struct in_ifinfo) of
  * @ifp: detach IGMP, free the link-layer table and release the structure.
  */
 void
 in_domifdetach(struct ifnet *ifp, void *aux)
 {
 	struct in_ifinfo *ii = (struct in_ifinfo *)aux;
 
 	igmp_domifdetach(ifp);
 	lltable_free(ii->ii_llt);
 	free(ii, M_IFADDR);
 }
diff --git a/sys/netinet/in_fib.c b/sys/netinet/in_fib.c
index 6f0e95bcf117..d84997deb29b 100644
--- a/sys/netinet/in_fib.c
+++ b/sys/netinet/in_fib.c
@@ -1,334 +1,335 @@
 /*-
  * Copyright (c) 2015
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/fib_algo.h>
 #include <net/route/nhop.h>
 #include <net/toeplitz.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 
 #ifdef INET
 
 /* Verify struct route compatibility */
 /* Assert 'struct route_in' is compatible with 'struct route' */
 CHK_STRUCT_ROUTE_COMPAT(struct route_in, ro_dst4);
 
 #ifdef FIB_ALGO
 VNET_DEFINE(struct fib_dp *, inet_dp);
 #endif
 
 #ifdef ROUTE_MPATH
 /*
  * Input block fed to the Toeplitz hash by fib4_calc_software_hash().
  * The spare bytes bring the structure to 16 bytes (checked by the static
  * assertion below); fib4_calc_software_hash() zeroes them before hashing.
  */
 struct _hash_5tuple_ipv4 {
 	struct in_addr src;
 	struct in_addr dst;
 	unsigned short src_port;
 	unsigned short dst_port;
 	char proto;
 	char spare[3];
 };
 _Static_assert(sizeof(struct _hash_5tuple_ipv4) == 16,
     "_hash_5tuple_ipv4 size is wrong");
 
 /*
  * Compute a software flow hash over the IPv4 5-tuple.
  * The tuple is packed into a _hash_5tuple_ipv4 (spare bytes zeroed) and
  * hashed with toeplitz_hash() keyed by mpath_entropy_key.
  * *@phashtype is set to M_HASHTYPE_OPAQUE; the hash value is returned.
  */
 uint32_t
 fib4_calc_software_hash(struct in_addr src, struct in_addr dst,
     unsigned short src_port, unsigned short dst_port, char proto,
     uint32_t *phashtype)
 {
 	struct _hash_5tuple_ipv4 tuple = {
 		.src = src,
 		.dst = dst,
 		.src_port = src_port,
 		.dst_port = dst_port,
 		.proto = proto,
 		.spare = { 0, 0, 0 },
 	};
 
 	*phashtype = M_HASHTYPE_OPAQUE;
 
 	return (toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key,
 	    sizeof(tuple), (uint8_t *)&tuple));
 }
 #endif
 
 /*
  * Looks up path in fib @fibnum specified by @dst.
  * Returns path nexthop on success. Nexthop is safe to use
  *  within the current network epoch. If longer lifetime is required,
  *  one needs to pass NHR_REF as a flag. This will return referenced
  *  nexthop.
  */
 #ifdef FIB_ALGO
 /*
  * FIB_ALGO variant: resolve @dst through the per-fib datapath lookup
  * function, pick one nexthop with nhop_select() and return it if
  * RT_LINK_IS_UP() holds for its interface.  With NHR_REF set in @flags
  * the returned nexthop is referenced.  Every failure bumps rts_unreach
  * and returns NULL.
  */
 struct nhop_object *
 fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
     uint32_t flags, uint32_t flowid)
 {
 	struct fib_dp *dp = &V_inet_dp[fibnum];
 	struct flm_lookup_key key = {.addr4 = dst };
 	struct nhop_object *nh;
 
 	nh = dp->f(dp->arg, key, scopeid);
 	if (nh == NULL)
 		goto unreachable;
 
 	nh = nhop_select(nh, flowid);
 	/* Ensure route & ifp is UP */
 	if (RT_LINK_IS_UP(nh->nh_ifp)) {
 		if ((flags & NHR_REF) != 0)
 			nhop_ref_object(nh);
 		return (nh);
 	}
 
 unreachable:
 	RTSTAT_INC(rts_unreach);
 	return (NULL);
 }
 #else
 /*
  * Radix variant: look @dst up in the AF_INET table of fib @fibnum under
  * the rib read lock, pick one nexthop with nhop_select() and return it if
  * RT_LINK_IS_UP() holds for its interface.  With NHR_REF set in @flags
  * the nexthop is referenced before the lock is dropped.
  * Returns NULL without touching statistics when the fib has no AF_INET
  * table; any other failure bumps rts_unreach and returns NULL.
  */
 struct nhop_object *
 fib4_lookup(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
     uint32_t flags, uint32_t flowid)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct nhop_object *found;
 
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (NULL);
 
 	/* Prepare lookup key */
 	struct sockaddr_in sin4 = {
 		.sin_family = AF_INET,
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_addr = dst,
 	};
 
 	found = NULL;
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		struct nhop_object *nh;
 
 		nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(nh->nh_ifp)) {
 			if ((flags & NHR_REF) != 0)
 				nhop_ref_object(nh);
 			found = nh;
 		}
 	}
 	RIB_RUNLOCK(rh);
 
 	if (found == NULL)
 		RTSTAT_INC(rts_unreach);
 	return (found);
 }
 #endif
 
 /*
  * uRPF test for a single nexthop.
  * With @src_if set, succeed only when the nexthop's nh_aifp is that
  * interface.  With @src_if NULL, succeed unless NHR_NODEFAULT is requested
  * in @flags and the nexthop is flagged NHF_DEFAULT.
  * Returns 1 on success, 0 otherwise.
  */
 inline static int
 check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
     const struct ifnet *src_if)
 {
 
 	if (src_if != NULL)
 		return ((nh->nh_aifp == src_if) ? 1 : 0);
 
 	/* No interface constraint: only the default-route filter applies. */
 	if ((flags & NHR_NODEFAULT) == 0 ||
 	    (nh->nh_flags & NHF_DEFAULT) == 0)
 		return (1);
 
 	return (0);
 }
 
 /*
  * uRPF test for a lookup result that may be a single nexthop or, with
  * ROUTE_MPATH, a nexthop group.
  * For a group, succeeds as soon as any member passes check_urpf_nhop();
  * otherwise the single nexthop is tested directly.
  * Returns 1 on success, 0 otherwise.
  */
 static int
 check_urpf(struct nhop_object *nh, uint32_t flags,
     const struct ifnet *src_if)
 {
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		const struct weightened_nhop *wn;
 		uint32_t num_nhops;
 
 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 		/* Index type matches num_nhops to avoid a signed/unsigned compare. */
 		for (uint32_t i = 0; i < num_nhops; i++) {
 			if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
 				return (1);
 		}
 		return (0);
 	} else
 #endif
 		return (check_urpf_nhop(nh, flags, src_if));
 }
 
 #ifndef FIB_ALGO
 /*
  * Radix lookup helper used by fib4_check_urpf() in non-FIB_ALGO builds.
  * Looks @dst up in the AF_INET table of fib @fibnum under the rib read
  * lock and returns the route's raw rt_nhop (check_urpf() handles the
  * nexthop-group case), or NULL when the fib has no AF_INET table or only
  * a radix root node matches.  @scopeid is not used here.
  */
 static struct nhop_object *
 lookup_nhop(uint32_t fibnum, struct in_addr dst, uint32_t scopeid)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct nhop_object *nh;
 
 	KASSERT((fibnum < rt_numfibs), ("fib4_check_urpf: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (NULL);
 
 	/*
 	 * Prepare lookup key.  Built the same way as in fib4_lookup() and
 	 * fib4_lookup_rt(), including sin_family, so all three lookups hand
 	 * an identically formed sockaddr to rnh_matchaddr.
 	 */
 	struct sockaddr_in sin4 = {
 		.sin_family = AF_INET,
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_addr = dst,
 	};
 
 	nh = NULL;
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
 		nh = RNTORT(rn)->rt_nhop;
 	RIB_RUNLOCK(rh);
 
 	return (nh);
 }
 #endif
 
 /*
  * Performs reverse path forwarding lookup.
  * If @src_if is non-NULL, verifies that at least 1 path goes via
  *   this interface.
  * If @src_if is NULL, verifies that a route exists.
  * If @flags contains NHR_NODEFAULT, the default route is not considered
  *   (this filter is applied only when @src_if is NULL).
  *
  * Returns 1 if a route matching the conditions is found, 0 otherwise.
  */
 int
 fib4_check_urpf(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
   uint32_t flags, const struct ifnet *src_if)
 {
 	struct nhop_object *nh;
 #ifdef FIB_ALGO
 	struct fib_dp *dp = &V_inet_dp[fibnum];
 	struct flm_lookup_key key = {.addr4 = dst };
 
 	nh = dp->f(dp->arg, key, scopeid);
 #else
 	nh = lookup_nhop(fibnum, dst, scopeid);
 #endif
 	/* No route at all: the check fails. */
 	if (nh == NULL)
 		return (0);
 
 	return (check_urpf(nh, flags, src_if));
 }
 
 /*
  * Function returning prefix match data along with the nexthop data.
  * Intended to be used by the control plane code.
  * Supported flags:
  *  NHR_UNLOCKED: do not lock radix during lookup.
  * Returns pointer to rtentry and stores the raw nexthop and the route
  *  weight in @rnd.  Returns NULL, leaving @rnd untouched, when the fib has
  *  no AF_INET table or only a radix root node matches.
  * Both rtentry and nexthop are safe to use within current epoch.
  * Note: rnd_nhop can actually be the nexthop group.
  */
 struct rtentry *
 fib4_lookup_rt(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
     uint32_t flags, struct route_nhop_data *rnd)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct rtentry *rt = NULL;
 	const int need_lock = ((flags & NHR_UNLOCKED) == 0);
 
 	KASSERT((fibnum < rt_numfibs), ("fib4_lookup_rt: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET);
 	if (rh == NULL)
 		return (NULL);
 
 	/* Prepare lookup key */
 	struct sockaddr_in sin4 = {
 		.sin_family = AF_INET,
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_addr = dst,
 	};
 
 	if (need_lock)
 		RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin4, &rh->head);
 	if (rn != NULL && (rn->rn_flags & RNF_ROOT) == 0) {
 		rt = (struct rtentry *)rn;
 		rnd->rnd_weight = rt->rt_weight;
 		rnd->rnd_nhop = rt->rt_nhop;
 	}
 	if (need_lock)
 		RIB_RUNLOCK(rh);
 
 	return (rt);
 }
 
 /*
  * Lookup variant that always calls fib4_lookup_rt() with NHR_UNLOCKED
  * (the caller's @flags are not consulted), selects a nexthop with flow id
  * 0 and returns it if RT_LINK_IS_UP() holds for its interface.
  * Returns NULL otherwise.
  */
 struct nhop_object *
 fib4_lookup_debugnet(uint32_t fibnum, struct in_addr dst, uint32_t scopeid,
     uint32_t flags)
 {
 	struct route_nhop_data rnd;
 	struct nhop_object *nh;
 
 	if (fib4_lookup_rt(fibnum, dst, scopeid, NHR_UNLOCKED, &rnd) == NULL)
 		return (NULL);
 
 	nh = nhop_select(rnd.rnd_nhop, 0);
 	/* Ensure route & ifp is UP */
 	if (RT_LINK_IS_UP(nh->nh_ifp))
 		return (nh);
 
 	return (NULL);
 }
 
 #endif
diff --git a/sys/netinet/in_gif.c b/sys/netinet/in_gif.c
index 6290de6cb31e..e43b4c645372 100644
--- a/sys/netinet/in_gif.c
+++ b/sys/netinet/in_gif.c
@@ -1,463 +1,464 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in_gif.c,v 1.54 2001/05/14 14:02:16 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_ecn.h>
 #include <netinet/in_fib.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 
 #include <net/if_gif.h>
 
 #define GIF_TTL		30
 VNET_DEFINE_STATIC(int, ip_gif_ttl) = GIF_TTL;
 #define	V_ip_gif_ttl		VNET(ip_gif_ttl)
 SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_gif_ttl), 0, "Default TTL value for encapsulated packets");
 
 /*
  * We keep interfaces in a hash table using src+dst as key.
  * Interfaces with GIF_IGNORE_SOURCE flag are linked into plain list.
  */
 VNET_DEFINE_STATIC(struct gif_list *, ipv4_hashtbl) = NULL;
 VNET_DEFINE_STATIC(struct gif_list *, ipv4_srchashtbl) = NULL;
 VNET_DEFINE_STATIC(struct gif_list, ipv4_list) = CK_LIST_HEAD_INITIALIZER();
 #define	V_ipv4_hashtbl		VNET(ipv4_hashtbl)
 #define	V_ipv4_srchashtbl	VNET(ipv4_srchashtbl)
 #define	V_ipv4_list		VNET(ipv4_list)
 
 #define	GIF_HASH(src, dst)	(V_ipv4_hashtbl[\
     in_gif_hashval((src), (dst)) & (GIF_HASH_SIZE - 1)])
 #define	GIF_SRCHASH(src)	(V_ipv4_srchashtbl[\
     fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GIF_HASH_SIZE - 1)])
 #define	GIF_HASH_SC(sc)		GIF_HASH((sc)->gif_iphdr->ip_src.s_addr,\
     (sc)->gif_iphdr->ip_dst.s_addr)
 /*
  * Hash a (source, destination) address pair: FNV over @src seeded with
  * FNV1_32_INIT, then FNV over @dst seeded with that intermediate value.
  */
 static uint32_t
 in_gif_hashval(in_addr_t src, in_addr_t dst)
 {
 	uint32_t seed = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT);
 
 	return (fnv_32_buf(&dst, sizeof(dst), seed));
 }
 
 /*
  * Check whether the address pair (@src, @dst) is already in use.
  * Returns EEXIST when @sc itself is an AF_INET tunnel with exactly these
  * addresses, EADDRNOTAVAIL when another softc in the same hash bucket
  * uses them, and 0 otherwise.
  */
 static int
 in_gif_checkdup(const struct gif_softc *sc, in_addr_t src, in_addr_t dst)
 {
 	struct gif_softc *other;
 
 	if (sc->gif_family == AF_INET &&
 	    sc->gif_iphdr->ip_src.s_addr == src &&
 	    sc->gif_iphdr->ip_dst.s_addr == dst)
 		return (EEXIST);
 
 	CK_LIST_FOREACH(other, &GIF_HASH(src, dst), chain) {
 		if (other != sc &&
 		    other->gif_iphdr->ip_src.s_addr == src &&
 		    other->gif_iphdr->ip_dst.s_addr == dst)
 			return (EADDRNOTAVAIL);
 	}
 
 	return (0);
 }
 
 /*
  * Check that ingress address belongs to local host.
  */
 static void
 in_gif_set_running(struct gif_softc *sc)
 {
 
 	if (in_localip(sc->gif_iphdr->ip_src))
 		GIF2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
 	else
 		GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 }
 
 /*
  * ifaddr_event handler.
  * Re-evaluates IFF_DRV_RUNNING for every tunnel whose ingress address
  * equals the address in @sa, so that the flag is cleared when the ingress
  * address disappears (prevents source address spoofing).
  */
 static void
 in_gif_srcaddr(void *arg __unused, const struct sockaddr *sa,
     int event __unused)
 {
 	const struct sockaddr_in *sin;
 	struct gif_softc *sc;
 	in_addr_t addr;
 
 	/* Check that VNET is ready */
 	if (V_ipv4_hashtbl == NULL)
 		return;
 
 	NET_EPOCH_ASSERT();
 	sin = (const struct sockaddr_in *)sa;
 	addr = sin->sin_addr.s_addr;
 	CK_LIST_FOREACH(sc, &GIF_SRCHASH(addr), srchash) {
 		/* The bucket may hold other sources; act on exact matches. */
 		if (sc->gif_iphdr->ip_src.s_addr == addr)
 			in_gif_set_running(sc);
 	}
 }
 
 /*
  * Link @sc into the lookup structures: onto the plain V_ipv4_list when
  * GIF_IGNORE_SOURCE is set, otherwise into the src+dst hash bucket; and
  * in both cases into the source-address hash.
  */
 static void
 in_gif_attach(struct gif_softc *sc)
 {
 	struct gif_list *head;
 
 	if ((sc->gif_options & GIF_IGNORE_SOURCE) != 0)
 		head = &V_ipv4_list;
 	else
 		head = &GIF_HASH_SC(sc);
 	CK_LIST_INSERT_HEAD(head, sc, chain);
 
 	CK_LIST_INSERT_HEAD(&GIF_SRCHASH(sc->gif_iphdr->ip_src.s_addr),
 	    sc, srchash);
 }
 
 /*
  * Apply new tunnel options to an AF_INET gif.
  * When the GIF_IGNORE_SOURCE bit differs between @options and the current
  * options, the softc is unlinked, the new options are stored and the
  * softc is re-attached so it lands on the right list.
  * NOTE(review): sc->gif_options is only written inside that branch, so a
  * change limited to other bits is not stored here -- confirm the caller
  * handles that.
  * Always returns 0.
  */
 int
 in_gif_setopts(struct gif_softc *sc, u_int options)
 {
 
 	/* NOTE: we are protected with gif_ioctl_sx lock */
 	MPASS(sc->gif_family == AF_INET);
 	MPASS(sc->gif_options != options);
 
 	if (((options ^ sc->gif_options) & GIF_IGNORE_SOURCE) != 0) {
 		CK_LIST_REMOVE(sc, srchash);
 		CK_LIST_REMOVE(sc, chain);
 		sc->gif_options = options;
 		in_gif_attach(sc);
 	}
 
 	return (0);
 }
 
 /*
  * IPv4 part of the gif ioctl handling.
  *
  * SIOCSIFPHYADDR: validate the requested outer source/destination pair,
  *   create the per-VNET hash tables on first use, reject duplicates,
  *   replace any existing tunnel header with a newly allocated one, attach
  *   the softc to the lookup lists and refresh IFF_DRV_RUNNING.
  * SIOCGIFPSRCADDR / SIOCGIFPDSTADDR: copy the outer source or destination
  *   address into ifr_addr, subject to the prison_if() check.
  *
  * Any other @cmd, or a failed sanity check, returns EINVAL.
  */
 int
 in_gif_ioctl(struct gif_softc *sc, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct epoch_tracker et;
 	struct sockaddr_in *dst, *src;
 	struct ip *ip;
 	int error;
 
 	/* NOTE: we are protected with gif_ioctl_sx lock */
 	error = EINVAL;
 	switch (cmd) {
 	case SIOCSIFPHYADDR:
 		src = &((struct in_aliasreq *)data)->ifra_addr;
 		dst = &((struct in_aliasreq *)data)->ifra_dstaddr;
 
 		/* sanity checks */
 		if (src->sin_family != dst->sin_family ||
 		    src->sin_family != AF_INET ||
 		    src->sin_len != dst->sin_len ||
 		    src->sin_len != sizeof(*src))
 			break;
 		if (src->sin_addr.s_addr == INADDR_ANY ||
 		    dst->sin_addr.s_addr == INADDR_ANY) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		/* Hash tables are created lazily, on the first tunnel setup. */
 		if (V_ipv4_hashtbl == NULL) {
 			V_ipv4_hashtbl = gif_hashinit();
 			V_ipv4_srchashtbl = gif_hashinit();
 		}
 		error = in_gif_checkdup(sc, src->sin_addr.s_addr,
 		    dst->sin_addr.s_addr);
 		if (error == EADDRNOTAVAIL)
 			break;
 		if (error == EEXIST) {
 			/* Addresses are the same. Just return. */
 			error = 0;
 			break;
 		}
 		/* Build the new header before touching the existing tunnel. */
 		ip = malloc(sizeof(*ip), M_GIF, M_WAITOK | M_ZERO);
 		ip->ip_src.s_addr = src->sin_addr.s_addr;
 		ip->ip_dst.s_addr = dst->sin_addr.s_addr;
 		if (sc->gif_family != 0) {
 			/* Detach existing tunnel first */
 			CK_LIST_REMOVE(sc, srchash);
 			CK_LIST_REMOVE(sc, chain);
 			/* Wait before freeing the header that was just unlinked. */
 			GIF_WAIT();
 			/*
 			 * NOTE(review): frees sc->gif_hdr while the new header
 			 * is stored through sc->gif_iphdr below -- presumably
 			 * two views of the same pointer; confirm in if_gif.h.
 			 */
 			free(sc->gif_hdr, M_GIF);
 			/* XXX: should we notify about link state change? */
 		}
 		sc->gif_family = AF_INET;
 		sc->gif_iphdr = ip;
 		in_gif_attach(sc);
 		/* in_gif_set_running() is called inside a net epoch section. */
 		NET_EPOCH_ENTER(et);
 		in_gif_set_running(sc);
 		NET_EPOCH_EXIT(et);
 		break;
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 		if (sc->gif_family != AF_INET) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		src = (struct sockaddr_in *)&ifr->ifr_addr;
 		memset(src, 0, sizeof(*src));
 		src->sin_family = AF_INET;
 		src->sin_len = sizeof(*src);
 		src->sin_addr = (cmd == SIOCGIFPSRCADDR) ?
 		    sc->gif_iphdr->ip_src: sc->gif_iphdr->ip_dst;
 		error = prison_if(curthread->td_ucred, (struct sockaddr *)src);
 		/* Do not hand the address back when prison_if() fails. */
 		if (error != 0)
 			memset(src, 0, sizeof(*src));
 		break;
 	}
 	return (error);
 }
 
 /*
  * Prepend the tunnel's outer IPv4 header to @m and pass the packet to
  * ip_output().
  * The header is copied from the softc template (sc->gif_iphdr); ip_p,
  * ip_ttl, ip_len and ip_tos are then set from @proto, V_ip_gif_ttl, the
  * packet length and @ecn respectively.
  * Returns ENOBUFS when M_PREPEND() leaves @m NULL, otherwise the result
  * of ip_output().
  */
 int
 in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
 {
 	struct gif_softc *sc = ifp->if_softc;
 	struct ip *ip;
 	int len;
 
 	/* prepend new IP header */
 	NET_EPOCH_ASSERT();
 	len = sizeof(struct ip);
 #ifndef __NO_STRICT_ALIGNMENT
 	/* Reserve ETHERIP_ALIGN extra bytes for EtherIP payloads. */
 	if (proto == IPPROTO_ETHERIP)
 		len += ETHERIP_ALIGN;
 #endif
 	M_PREPEND(m, len, M_NOWAIT);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifndef __NO_STRICT_ALIGNMENT
 	if (proto == IPPROTO_ETHERIP) {
 		/*
 		 * len is reused here for the low two bits of the data
 		 * address; the data pointer is advanced by that amount and
 		 * m_len is reduced by ETHERIP_ALIGN.
 		 */
 		len = mtod(m, vm_offset_t) & 3;
 		KASSERT(len == 0 || len == ETHERIP_ALIGN,
 		    ("in_gif_output: unexpected misalignment"));
 		m->m_data += len;
 		m->m_len -= ETHERIP_ALIGN;
 	}
 #endif
 	ip = mtod(m, struct ip *);
 
 	MPASS(sc->gif_family == AF_INET);
 	bcopy(sc->gif_iphdr, ip, sizeof(struct ip));
 	ip->ip_p = proto;
 	/* version will be set in ip_output() */
 	ip->ip_ttl = V_ip_gif_ttl;
 	ip->ip_len = htons(m->m_pkthdr.len);
 	ip->ip_tos = ecn;
 
 	return (ip_output(m, NULL, NULL, 0, NULL, NULL));
 }
 
 /*
  * Encap input callback.  @arg is the gif softc.
  * When a softc is supplied and its interface is IFF_UP, the outer ip_tos
  * is saved, @off bytes are trimmed from the front of @m and the packet is
  * passed to gif_input().  Otherwise the mbuf is freed and ips_nogif is
  * incremented.  Always returns IPPROTO_DONE.
  */
 static int
 in_gif_input(struct mbuf *m, int off, int proto, void *arg)
 {
 	struct gif_softc *sc = arg;
 	struct ifnet *gifp;
 	struct ip *ip;
 	uint8_t ecn;
 
 	NET_EPOCH_ASSERT();
 	if (sc == NULL)
 		goto drop;
 
 	gifp = GIF2IFP(sc);
 	if ((gifp->if_flags & IFF_UP) == 0)
 		goto drop;
 
 	/* Read the outer TOS before the outer header is trimmed. */
 	ip = mtod(m, struct ip *);
 	ecn = ip->ip_tos;
 	m_adj(m, off);
 	gif_input(m, gifp, proto, ecn);
 	return (IPPROTO_DONE);
 
 drop:
 	m_freem(m);
 	KMOD_IPSTAT_INC(ips_nogif);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Encap lookup callback: find the gif softc an inbound packet @m belongs
  * to.
  * Returns 0 when the hash table does not exist yet, nothing matches, the
  * matching interface is not IFF_UP, or the ingress filter rejects the
  * outer source.  Returns ENCAP_DRV_LOOKUP for an exact source+destination
  * match and 32 + 8 for a match on the GIF_IGNORE_SOURCE list.
  * On a non-zero return the matching softc is stored in *@arg.
  */
 static int
 in_gif_lookup(const struct mbuf *m, int off, int proto, void **arg)
 {
 	const struct ip *ip;
 	struct gif_softc *sc;
 	int ret;
 
 	if (V_ipv4_hashtbl == NULL)
 		return (0);
 
 	NET_EPOCH_ASSERT();
 	ip = mtod(m, const struct ip *);
 	/*
 	 * NOTE: it is safe to iterate without any locking here, because softc
 	 * can be reclaimed only when we are not within net_epoch_preempt
 	 * section, but ip_encap lookup+input are executed in epoch section.
 	 */
 	ret = 0;
 	/* Bucket is keyed (src, dst) from the softc's view, hence the swap. */
 	CK_LIST_FOREACH(sc, &GIF_HASH(ip->ip_dst.s_addr,
 	    ip->ip_src.s_addr), chain) {
 		/*
 		 * This is an inbound packet, its ip_dst is source address
 		 * in softc.
 		 */
 		if (sc->gif_iphdr->ip_src.s_addr == ip->ip_dst.s_addr &&
 		    sc->gif_iphdr->ip_dst.s_addr == ip->ip_src.s_addr) {
 			ret = ENCAP_DRV_LOOKUP;
 			goto done;
 		}
 	}
 	/*
 	 * No exact match.
 	 * Check the list of interfaces with GIF_IGNORE_SOURCE flag.
 	 */
 	CK_LIST_FOREACH(sc, &V_ipv4_list, chain) {
 		if (sc->gif_iphdr->ip_src.s_addr == ip->ip_dst.s_addr) {
 			ret = 32 + 8; /* src + proto */
 			goto done;
 		}
 	}
 	return (0);
 done:
 	/* Reached only via goto, so sc points at the matching softc. */
 	if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0)
 		return (0);
 	/* ingress filters on outer source */
 	if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) {
 		if (fib4_check_urpf(sc->gif_fibnum, ip->ip_src, 0, NHR_NONE,
 					m->m_pkthdr.rcvif) == 0)
 			return (0);
 	}
 	*arg = sc;
 	return (ret);
 }
 
 /* Handle returned by ip_encap_register_srcaddr() in in_gif_init(). */
 static const struct srcaddrtab *ipv4_srcaddrtab;
 /*
  * One encap configuration per inner protocol carried over IPv4
  * (IPv4, IPv6 when INET6 is defined, and EtherIP).  All entries share
  * in_gif_lookup()/in_gif_input(); they differ in protocol number and
  * minimum packet length.  The cookie of each entry is filled in by
  * in_gif_init() and consumed by in_gif_uninit().
  */
 static struct {
 	const struct encap_config encap;
 	const struct encaptab *cookie;
 } ipv4_encap_cfg[] = {
 	{
 		.encap = {
 			.proto = IPPROTO_IPV4,
 			/* outer + inner IPv4 header */
 			.min_length = 2 * sizeof(struct ip),
 			.exact_match = ENCAP_DRV_LOOKUP,
 			.lookup = in_gif_lookup,
 			.input = in_gif_input
 		},
 	},
 #ifdef INET6
 	{
 		.encap = {
 			.proto = IPPROTO_IPV6,
 			/* outer IPv4 header + inner IPv6 header */
 			.min_length = sizeof(struct ip) +
 			    sizeof(struct ip6_hdr),
 			.exact_match = ENCAP_DRV_LOOKUP,
 			.lookup = in_gif_lookup,
 			.input = in_gif_input
 		},
 	},
 #endif
 	{
 		.encap = {
 			.proto = IPPROTO_ETHERIP,
 			/* outer IPv4 header + EtherIP header + Ethernet header */
 			.min_length = sizeof(struct ip) +
 			    sizeof(struct etherip_header) +
 			    sizeof(struct ether_header),
 			.exact_match = ENCAP_DRV_LOOKUP,
 			.lookup = in_gif_lookup,
 			.input = in_gif_input
 		},
 	}
 };
 
 /*
  * Register the source-address event hook and attach every entry of
  * ipv4_encap_cfg.  This is done only when running in the default VNET;
  * for any other VNET the function does nothing.
  */
 void
 in_gif_init(void)
 {
 	int idx;
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ipv4_srcaddrtab = ip_encap_register_srcaddr(in_gif_srcaddr,
 		    NULL, M_WAITOK);
 		for (idx = 0; idx < nitems(ipv4_encap_cfg); idx++) {
 			ipv4_encap_cfg[idx].cookie = ip_encap_attach(
 			    &ipv4_encap_cfg[idx].encap, NULL, M_WAITOK);
 		}
 	}
 }
 
 /*
  * Undo in_gif_init() and release the per-VNET hash tables.
  * The encap handlers and the source-address hook are detached only in the
  * default VNET, mirroring in_gif_init().  The hash tables exist only if
  * in_gif_ioctl() created them, so they are destroyed conditionally.
  */
 void
 in_gif_uninit(void)
 {
 	int idx;
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		for (idx = 0; idx < nitems(ipv4_encap_cfg); idx++)
 			ip_encap_detach(ipv4_encap_cfg[idx].cookie);
 		ip_encap_unregister_srcaddr(ipv4_srcaddrtab);
 	}
 
 	/* Tables were never created for this VNET: nothing left to do. */
 	if (V_ipv4_hashtbl == NULL)
 		return;
 
 	gif_hashdestroy(V_ipv4_hashtbl);
 	/* in_gif_srcaddr() and in_gif_lookup() test this pointer for NULL. */
 	V_ipv4_hashtbl = NULL;
 	GIF_WAIT();
 	gif_hashdestroy(V_ipv4_srchashtbl);
 }
diff --git a/sys/netinet/in_mcast.c b/sys/netinet/in_mcast.c
index b2bfce038088..bbedbf8ceaa3 100644
--- a/sys/netinet/in_mcast.c
+++ b/sys/netinet/in_mcast.c
@@ -1,3025 +1,3026 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Bruce Simpson.
  * Copyright (c) 2005 Robert N. M. Watson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv4 multicast socket, group, and socket option processing module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/sysctl.h>
 #include <sys/ktr.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <net/ethernet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
+#include <net/if_private.h>
 #include <netinet/ip_var.h>
 #include <netinet/igmp_var.h>
 
 #ifndef KTR_IGMPV3
 #define KTR_IGMPV3 KTR_INET
 #endif
 
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in	sin;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
     "IPv4 multicast PCB-layer source filter");
 static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
 static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
     "IPv4 multicast IGMP-layer source filter");
 
 /*
  * Locking:
  *
  * - Lock order is: Giant, IN_MULTI_LOCK, INP_WLOCK,
  *   IN_MULTI_LIST_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
  *
  * struct in_multi is covered by IN_MULTI_LIST_LOCK. There isn't strictly
  * any need for in_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in_multi_list_mtx;
 MTX_SYSINIT(in_multi_mtx, &in_multi_list_mtx, "in_multi_list_mtx", MTX_DEF);
 
 struct mtx in_multi_free_mtx;
 MTX_SYSINIT(in_multi_free_mtx, &in_multi_free_mtx, "in_multi_free_mtx", MTX_DEF);
 
 struct sx in_multi_sx;
 SX_SYSINIT(in_multi_sx, &in_multi_sx, "in_multi_sx");
 
 /*
  * Functions with non-static linkage defined in this file should be
  * declared in in_var.h:
  *  imo_multi_filter()
  *  in_joingroup()
  *  in_joingroup_locked()
  *  in_leavegroup()
  *  in_leavegroup_locked()
  * and ip_var.h:
  *  inp_freemoptions()
  *  inp_getmoptions()
  *  inp_setmoptions()
  */
 static void	imf_commit(struct in_mfilter *);
 static int	imf_get_source(struct in_mfilter *imf,
 		    const struct sockaddr_in *psin,
 		    struct in_msource **);
 static struct in_msource *
 		imf_graft(struct in_mfilter *, const uint8_t,
 		    const struct sockaddr_in *);
 static void	imf_leave(struct in_mfilter *);
 static int	imf_prune(struct in_mfilter *, const struct sockaddr_in *);
 static void	imf_purge(struct in_mfilter *);
 static void	imf_rollback(struct in_mfilter *);
 static void	imf_reap(struct in_mfilter *);
 static struct in_mfilter *
 		imo_match_group(const struct ip_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in_msource *
 		imo_match_source(struct in_mfilter *, const struct sockaddr *);
 static void	ims_merge(struct ip_msource *ims,
 		    const struct in_msource *lims, const int rollback);
 static int	in_getmulti(struct ifnet *, const struct in_addr *,
 		    struct in_multi **);
 static int	inm_get_source(struct in_multi *inm, const in_addr_t haddr,
 		    const int noalloc, struct ip_msource **pims);
 #ifdef KTR
 static int	inm_is_ifp_detached(const struct in_multi *);
 #endif
 static int	inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
 static void	inm_purge(struct in_multi *);
 static void	inm_reap(struct in_multi *);
 static void inm_release(struct in_multi *);
 static struct ip_moptions *
 		inp_findmoptions(struct inpcb *);
 static int	inp_get_source_filters(struct inpcb *, struct sockopt *);
 static int	inp_join_group(struct inpcb *, struct sockopt *);
 static int	inp_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		inp_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in *, const struct in_addr);
 static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	inp_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv4 multicast");
 
 static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Report whether the interface behind @inm has been detached.
  * The ifnet layer sets the ifma's ifp pointer to NULL on detach, so a
  * NULL ifma_ifp means "detached" (returns 1).  Otherwise returns 0 after
  * asserting that netinet's and net's notion of the ifp agree.
  */
 static int __inline
 inm_is_ifp_detached(const struct in_multi *inm)
 {
 	struct ifnet *ifp;
 
 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->inm_ifma->ifma_ifp;
 	if (ifp == NULL)
 		return (1);
 
 	/*
 	 * Sanity check that netinet's notion of ifp is the
 	 * same as net's.
 	 */
 	KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
 
 	return (0);
 }
 #endif
 
 /*
  * Interface detach can happen in a taskqueue thread context, so we must use a
  * dedicated thread to avoid deadlocks when draining inm_release tasks.
  */
 TASKQUEUE_DEFINE_THREAD(inm_free);
 static struct in_multi_head inm_free_list = SLIST_HEAD_INITIALIZER();
 static void inm_release_task(void *arg __unused, int pending __unused);
 static struct task inm_free_task = TASK_INITIALIZER(0, inm_release_task, NULL);
 
 /*
  * Drain inm_free_task on the inm_free taskqueue so that all pending
  * multicast addresses are freed before the VNET or network device is
  * destroyed.  @arg is unused.
  */
 void
 inm_release_wait(void *arg __unused)
 {
 
 	taskqueue_drain(taskqueue_inm_free, &inm_free_task);
 }
 #ifdef VIMAGE
 /* XXX-BZ FIXME, see D24914. */
 VNET_SYSUNINIT(inm_release_wait, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, inm_release_wait, NULL);
 #endif
 
 /*
  * Hand a list of in_multi records over for deferred release.
  * A non-empty @inmh is appended to inm_free_list under in_multi_free_mtx
  * and inm_free_task is enqueued; an empty list is a no-op.
  */
 void
 inm_release_list_deferred(struct in_multi_head *inmh)
 {
 
 	if (!SLIST_EMPTY(inmh)) {
 		mtx_lock(&in_multi_free_mtx);
 		SLIST_CONCAT(&inm_free_list, inmh, in_multi, inm_nrele);
 		mtx_unlock(&in_multi_free_mtx);
 		taskqueue_enqueue(taskqueue_inm_free, &inm_free_task);
 	}
 }
 
 /*
  * Unlink @inm's ifmultiaddr from its interface's if_multiaddrs list and
  * drop the reference it holds on the associated link-layer ifmultiaddr
  * (ifma_llifma), if any.  The link-layer entry is itself unlinked and
  * freed with if_freemulti() when its refcount reaches zero.
  * Asserts that the interface address write lock is held.
  */
 void
 inm_disconnect(struct in_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	ifp = inm->inm_ifp;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->inm_ifma;
 
 	/*
 	 * NOTE(review): no matching release of this reference is visible in
 	 * this function -- confirm where it is dropped.
 	 */
 	if_ref(ifp);
 	/* Only unlink entries that are still on the interface list. */
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		/* Break the link before dropping the link-layer reference. */
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 		}
 	}
 }
 
 /*
  * Drop one reference on @inm.  When the count reaches zero the record is
  * disconnected from its interface, the ifma's protospec pointer is
  * cleared, and the record is queued for deferred release via a
  * one-element list.  Asserts the IN_MULTI list lock.
  */
 void
 inm_release_deferred(struct in_multi *inm)
 {
 	struct in_multi_head tmp;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	MPASS(inm->inm_refcount > 0);
 
 	/* Other references remain: nothing more to do. */
 	if (--inm->inm_refcount != 0)
 		return;
 
 	SLIST_INIT(&tmp);
 	inm_disconnect(inm);
 	inm->inm_ifma->ifma_protospec = NULL;
 	SLIST_INSERT_HEAD(&tmp, inm, inm_nrele);
 	inm_release_list_deferred(&tmp);
 }
 
 /*
  * Task handler: release every in_multi queued on inm_free_list.
  *
  * The global list is moved to a private list under in_multi_free_mtx;
  * the private list is then emptied with the IN_MULTI lock held, calling
  * inm_release() on each record.  Both arguments are unused.
  */
 static void
 inm_release_task(void *arg __unused, int pending __unused)
 {
 	struct in_multi_head batch;
 	struct in_multi *cur, *next;
 
 	SLIST_INIT(&batch);
 
 	/* Steal the pending entries so the mutex is held only briefly. */
 	mtx_lock(&in_multi_free_mtx);
 	SLIST_CONCAT(&batch, &inm_free_list, in_multi, inm_nrele);
 	mtx_unlock(&in_multi_free_mtx);
 
 	IN_MULTI_LOCK();
 	SLIST_FOREACH_SAFE(cur, &batch, inm_nrele, next) {
 		/* cur is always the current head of the private list. */
 		SLIST_REMOVE_HEAD(&batch, inm_nrele);
 		MPASS(cur);
 		inm_release(cur);
 	}
 	IN_MULTI_UNLOCK();
 }
 
 /*
  * Reset an in_mfilter to a known state: every field zeroed, an empty
  * source tree, and the filter modes at t0 and t1 set to st0 and st1.
  */
 static __inline void
 imf_init(struct in_mfilter *imf, const int st0, const int st1)
 {
 
 	memset(imf, 0, sizeof(*imf));
 	RB_INIT(&imf->imf_sources);
 	imf->imf_st[1] = st1;
 	imf->imf_st[0] = st0;
 }
 
 /*
  * Allocate an in_mfilter from M_INMFILTER with the caller's malloc flags
  * and initialize it with filter modes st0 (t0) and st1 (t1).
  * Returns NULL when the allocation fails.
  */
 struct in_mfilter *
 ip_mfilter_alloc(const int mflags, const int st0, const int st1)
 {
 	struct in_mfilter *filter;
 
 	filter = malloc(sizeof(struct in_mfilter), M_INMFILTER, mflags);
 	if (filter == NULL)
 		return (NULL);
 
 	imf_init(filter, st0, st1);
 	return (filter);
 }
 
 /*
  * Release an in_mfilter: purge its source tree with imf_purge(), then
  * return the structure itself to M_INMFILTER.
  */
 void
 ip_mfilter_free(struct in_mfilter *imf)
 {
 
 	imf_purge(imf);
 	free(imf, M_INMFILTER);
 }
 
 /*
  * Look up the in_multi record for an IPv4 multicast address on a given
  * interface by scanning the interface's multicast address list.
  * ifp must be valid.  Entries for which inm_ifmultiaddr_get_inm()
  * yields no in_multi are skipped.  Returns NULL if nothing matches.
  * The IN_MULTI_LIST_LOCK and IF_ADDR_LOCK on ifp must be held.
  */
 struct in_multi *
 inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct ifmultiaddr *entry;
 	struct in_multi *candidate;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	CK_STAILQ_FOREACH(entry, &ifp->if_multiaddrs, ifma_link) {
 		candidate = inm_ifmultiaddr_get_inm(entry);
 		if (candidate != NULL &&
 		    candidate->inm_addr.s_addr == ina.s_addr)
 			return (candidate);
 	}
 	return (NULL);
 }
 
 /*
  * Wrapper for inm_lookup_locked().
  * The lookup runs inside a network epoch section that is entered here
  * and exited before returning.  The IN_MULTI_LIST_LOCK must be held by
  * the caller (asserted).
  */
 struct in_multi *
 inm_lookup(struct ifnet *ifp, const struct in_addr ina)
 {
 	struct epoch_tracker et;
 	struct in_multi *found;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	NET_EPOCH_ENTER(et);
 	found = inm_lookup_locked(ifp, ina);
 	NET_EPOCH_EXIT(et);
 
 	return (found);
 }
 
 /*
  * Find the in_mfilter of this ip_moptions instance whose membership
  * matches the specified group and, when ifp is not NULL, that interface.
  * Filters without an attached in_multi are skipped.
  *
  * Returns the iterator as left by the loop: the matching filter, or the
  * value IP_MFILTER_FOREACH ends with once the list is exhausted (callers
  * in this file test the result against NULL).
  */
 static struct in_mfilter *
 imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in *gsin = (const struct sockaddr_in *)group;
 	struct in_mfilter *imf;
 	struct in_multi	*inm;
 
 	IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 		inm = imf->imf_inm;
 		if (inm == NULL)
 			continue;
 		/* An explicit interface must match the membership's. */
 		if (ifp != NULL && inm->inm_ifp != ifp)
 			continue;
 		if (in_hosteq(inm->inm_addr, gsin->sin_addr))
 			break;
 	}
 	return (imf);
 }
 
 /*
  * Find an IPv4 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in_msource *
 imo_match_source(struct in_mfilter *imf, const struct sockaddr *src)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
 
 	/* Source trees are keyed in host byte order. */
 	psa = (const sockunion_t *)src;
 	find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 
 	return ((struct in_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns MCAST_PASS if a datagram should be allowed through,
  * MCAST_NOTGMEMBER if the socket has no membership matching (ifp, group),
  * or MCAST_NOTSMEMBER if the source is rejected by the filter state.
  * ifp must not be NULL (asserted).
  */
 int
 imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	struct in_mfilter *imf;
 	struct in_msource *ims;
 	int mode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	imf = imo_match_group(imo, ifp, group);
 	if (imf == NULL)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * NOTE: We are comparing group state here at IGMP t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	mode = imf->imf_st[1];
 	ims = imo_match_source(imf, src);
 
 	if (ims == NULL) {
 		/* Unlisted source: only an inclusive filter rejects it. */
 		if (mode == MCAST_INCLUDE)
 			return (MCAST_NOTSMEMBER);
 	} else if (ims->imsl_st[0] != mode) {
 		/* Listed source whose t0 state is not the filter mode. */
 		return (MCAST_NOTSMEMBER);
 	}
 
 	return (MCAST_PASS);
 }
 
 /*
  * Find and return a reference to an in_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  *
  * *pinm is written only on success.  The error is either whatever
  * if_addmulti() returned, or ENOMEM if the new record cannot be allocated
  * (in which case the ifmultiaddr obtained from if_addmulti() is given
  * back with if_delmulti_ifma()).
  */
 static int
 in_getmulti(struct ifnet *ifp, const struct in_addr *group,
     struct in_multi **pinm)
 {
 	struct sockaddr_in	 gsin;
 	struct ifmultiaddr	*ifma;
 	struct in_ifinfo	*ii;
 	struct in_multi		*inm;
 	int error;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
 	/* Fast path: the group is already known on this interface. */
 	IN_MULTI_LIST_LOCK();
 	inm = inm_lookup(ifp, *group);
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->inm_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->inm_refcount));
 		inm_acquire_locked(inm);
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 	if (inm != NULL)
 		return (0);
 
 	/* Build the sockaddr form of the group for if_addmulti(). */
 	memset(&gsin, 0, sizeof(gsin));
 	gsin.sin_family = AF_INET;
 	gsin.sin_len = sizeof(struct sockaddr_in);
 	gsin.sin_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
 	if (error != 0)
 		return (error);
 
 	/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
 	IN_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 *
 	 * NOTE(review): as written, the inconsistent case panics, and
 	 * only under INVARIANTS; there is no back-out path here.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET,
 		    ("%s: ifma not AF_INET", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
 		    !in_hosteq(inm->inm_addr, *group)) {
 			char addrbuf[INET_ADDRSTRLEN];
 
 			panic("%s: ifma %p is inconsistent with %p (%s)",
 			    __func__, ifma, inm, inet_ntoa_r(*group, addrbuf));
 		}
 #endif
 		inm_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an IGMP join as the in_ layer may need to
 	 * push an initial source list down to IGMP to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 */
 	inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		/* Drop both locks before undoing the if_addmulti(). */
 		IF_ADDR_WUNLOCK(ifp);
 		IN_MULTI_LIST_UNLOCK();
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->inm_addr = *group;
 	inm->inm_ifp = ifp;
 	inm->inm_igi = ii->ii_igmp;
 	inm->inm_ifma = ifma;
 	inm->inm_refcount = 1;
 	inm->inm_state = IGMP_NOT_MEMBER;
 	mbufq_init(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
 	inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->inm_srcs);
 
 	/* Publish the record through the ifmultiaddr. */
 	ifma->ifma_protospec = inm;
 
 	*pinm = inm;
  out_locked:
 	IF_ADDR_WUNLOCK(ifp);
 	IN_MULTI_LIST_UNLOCK();
 	return (0);
 }
 
 /*
  * Free an in_multi record whose reference count has already reached
  * zero (asserted with MPASS), and delete the underlying link-layer
  * membership.
  *
  * The record's source tree is purged, the record is freed, and its
  * ifmultiaddr is passed to if_delmulti_ifma_flags().  When the record
  * still has an ifnet, the work runs with that ifnet's vnet set as
  * current and the ifnet reference is dropped afterwards with if_rele().
  */
 static void
 inm_release(struct in_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
 	MPASS(inm->inm_refcount == 0);
 	CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
 
 	/* Cache both pointers now; inm itself is freed below. */
 	ifma = inm->inm_ifma;
 	ifp = inm->inm_ifp;
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
 	if (ifp != NULL) {
 		CURVNET_SET(ifp->if_vnet);
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		if_rele(ifp);
 	} else {
 		/* No ifnet: same teardown without a vnet context. */
 		inm_purge(inm);
 		free(inm, M_IPMADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 /*
  * Clear the recorded mark on every source entry of a group, decrementing
  * the group's t1 recorded-source count once per cleared entry.
  * Used by the IGMP code. Caller must hold the IN_MULTI list lock
  * (asserted).
  * FIXME: Should reap.
  */
 void
 inm_clear_recorded(struct in_multi *inm)
 {
 	struct ip_msource	*src;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	RB_FOREACH(src, ip_msource_tree, &inm->inm_srcs) {
 		if (src->ims_stp == 0)
 			continue;
 		src->ims_stp = 0;
 		--inm->inm_st[1].iss_rec;
 	}
 
 	/* Every recorded source must have been accounted for. */
 	KASSERT(inm->inm_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group IGMPv3 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * naddr is the address of the source to record in network-byte order.
  *
  * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 inm_record_source(struct in_multi *inm, const in_addr_t naddr)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	find.ims_haddr = ntohl(naddr);
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims && ims->ims_stp)
 		return (0);
 	if (ims == NULL) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->ims_haddr = find.ims_haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->ims_stp;
 	++inm->inm_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Return a pointer to an in_msource owned by an in_mfilter,
  * given its source address.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * haddr is the source address in *host* byte-order.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
     struct in_msource **plims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 error;
 
 	error = 0;
 	ims = NULL;
 	lims = NULL;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	lims = (struct in_msource *)ims;
 	if (lims == NULL) {
 		if (imf->imf_nsrc == in_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in_msource), M_INMFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in_msource *)nims;
 		lims->ims_haddr = find.ims_haddr;
 		lims->imsl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
 		++imf->imf_nsrc;
 	}
 
 	*plims = lims;
 
 	return (error);
 }
 
 /*
  * Graft a new source entry into an existing socket-layer filter set.
  *
  * The entry is allocated with M_NOWAIT, keyed by the host-byte-order
  * form of psin's address, left undefined at t0 and set to st1 at t1.
  *
  * Return the pointer to the new node, or NULL if the allocation fails.
  */
 static struct in_msource *
 imf_graft(struct in_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in *psin)
 {
 	struct in_msource	*lims;
 	struct ip_msource	*node;
 
 	node = malloc(sizeof(struct in_msource), M_INMFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (node == NULL)
 		return (NULL);
 
 	/* Fill in the key and state before linking into the tree. */
 	lims = (struct in_msource *)node;
 	lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
 	lims->imsl_st[0] = MCAST_UNDEFINED;
 	lims->imsl_st[1] = st1;
 
 	RB_INSERT(ip_msource_tree, &imf->imf_sources, node);
 	++imf->imf_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	/* key is host byte order */
 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in_msource *)ims;
 	lims->imsl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Revert socket-layer filter set deltas at t1 to t0 state.
  *
  * Sources that only exist at t1 (undefined at t0) are removed and freed;
  * sources that changed have their t1 state reset to the t0 state.
  * The filter mode at t1 is reset to the mode at t0.
  */
 static void
 imf_rollback(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *next;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, next) {
 		lims = (struct in_msource *)ims;
 
 		/* no change at t1 */
 		if (lims->imsl_st[0] == lims->imsl_st[1])
 			continue;
 
 		if (lims->imsl_st[0] == MCAST_UNDEFINED) {
 			/* revert source added t1 */
 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 			free(ims, M_INMFILTER);
 			imf->imf_nsrc--;
 		} else {
 			/* revert change to existing source at t1 */
 			lims->imsl_st[1] = lims->imsl_st[0];
 		}
 	}
 	imf->imf_st[1] = imf->imf_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 imf_leave(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		lims->imsl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->imf_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Mark socket-layer filter set deltas as committed: the t1 state of
  * every source, and of the filter mode, is copied to t0.
  */
 static void
 imf_commit(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims;
 	struct in_msource	*entry;
 
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		entry = (struct in_msource *)ims;
 		entry->imsl_st[0] = entry->imsl_st[1];
 	}
 
 	imf->imf_st[0] = imf->imf_st[1];
 }
 
 /*
  * Reap unreferenced sources from socket-layer filter set: entries whose
  * state is undefined at both t0 and t1 are removed and freed.
  */
 static void
 imf_reap(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *next;
 	struct in_msource	*lims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, next) {
 		lims = (struct in_msource *)ims;
 
 		/* Keep anything still defined at either point in time. */
 		if (lims->imsl_st[0] != MCAST_UNDEFINED ||
 		    lims->imsl_st[1] != MCAST_UNDEFINED)
 			continue;
 
 		CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 imf_purge(struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
 		free(ims, M_INMFILTER);
 		imf->imf_nsrc--;
 	}
 	imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->imf_sources),
 	    ("%s: imf_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * haddr is the host-byte-order IPv4 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 inm_get_source(struct in_multi *inm, const in_addr_t haddr,
     const int noalloc, struct ip_msource **pims)
 {
 	struct ip_msource	 find;
 	struct ip_msource	*ims, *nims;
 
 	find.ims_haddr = haddr;
 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->ims_haddr = haddr;
 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
 		++inm->inm_nsrc;
 		ims = nims;
 #ifdef KTR
 		CTR3(KTR_IGMPV3, "%s: allocated 0x%08x as %p", __func__,
 		    haddr, ims);
 #endif
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Merge socket-layer source into IGMP-layer source.
  * If rollback is non-zero, perform the inverse of the merge.
  *
  * The socket's t0 state is subtracted from, and its t1 state added to,
  * the matching ex/in counter of the IGMP-layer source at t1.  States
  * other than MCAST_EXCLUDE and MCAST_INCLUDE contribute nothing.
  */
 static void
 ims_merge(struct ip_msource *ims, const struct in_msource *lims,
     const int rollback)
 {
 	const int delta = rollback ? -1 : 1;
 
 	/* Retire the contribution of the socket's t0 state. */
 	switch (lims->imsl_st[0]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on 0x%08x",
 		    __func__, delta, ims->ims_haddr);
 		ims->ims_st[1].ex -= delta;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 in -= %d on 0x%08x",
 		    __func__, delta, ims->ims_haddr);
 		ims->ims_st[1].in -= delta;
 		break;
 	default:
 		break;
 	}
 
 	/* Apply the contribution of the socket's t1 state. */
 	switch (lims->imsl_st[1]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 ex += %d on 0x%08x",
 		    __func__, delta, ims->ims_haddr);
 		ims->ims_st[1].ex += delta;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_IGMPV3, "%s: t1 in += %d on 0x%08x",
 		    __func__, delta, ims->ims_haddr);
 		ims->ims_st[1].in += delta;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Atomically update the global in_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * The caller must hold the IN_MULTI list lock (asserted).
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value (the error from inm_get_source()).
  * On success it returns 0.  In both cases, if any source state was
  * touched, unreferenced source nodes are reaped from inm before
  * returning.
  */
 static int
 inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct ip_msource	*ims, *nims;
 	struct in_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == imf->imf_st[0])
 			nsrc0++;
 		if (lims->imsl_st[1] == imf->imf_st[1])
 			nsrc1++;
 		if (lims->imsl_st[0] == lims->imsl_st[1])
 			continue;
 		error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		ims_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip_msource *bims, *pims;
 
 		/*
 		 * Undo the merges performed so far by walking the socket
 		 * filter's tree backwards from the source that failed.
 		 * The walk must start from ims, not nims: nims is a node
 		 * of the group's tree (or was never assigned if the very
 		 * first changed source failed).  The failing source has
 		 * no node in inm, so the NULL check below skips it.
 		 */
 		pims = ims;
 		RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, pims) {
 			lims = (struct in_msource *)ims;
 			if (lims->imsl_st[0] == lims->imsl_st[1])
 				continue;
 			(void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			ims_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->imf_st[0] == imf->imf_st[1] &&
 	    imf->imf_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->imf_st[0] != imf->imf_st[1]) {
 		CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
 		    __func__, imf->imf_st[0], imf->imf_st[1]);
 
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
 			--inm->inm_st[1].iss_ex;
 		} else if (imf->imf_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
 			--inm->inm_st[1].iss_in;
 		}
 
 		if (imf->imf_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
 			inm->inm_st[1].iss_ex++;
 		} else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
 			inm->inm_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the IGMP lifecycle for this group should finish.
 	 */
 	if (inm->inm_st[1].iss_ex > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->inm_st[1].iss_in > 0) {
 		CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
 		    (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
 			--inm->inm_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
 		inm->inm_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	inm_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
 		inm_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in_multi's filter set deltas as committed.
  * Called by IGMP after a state change has been enqueued.
  */
 void
 inm_commit(struct in_multi *inm)
 {
 	struct ip_msource	*ims;
 
 	CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
 	inm_print(inm);
 
 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 		ims->ims_st[0] = ims->ims_st[1];
 	}
 	inm->inm_st[0] = inm->inm_st[1];
 }
 
 /*
  * Reap unreferenced nodes from an in_multi's filter set.
  * A node is kept while any of its ex/in counts at t0 or t1 is positive
  * or its recorded mark is set; all other nodes are removed and freed.
  */
 static void
 inm_reap(struct in_multi *inm)
 {
 	struct ip_msource	*ims, *next;
 	int			 inuse;
 
 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, next) {
 		inuse = ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
 		    ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
 		    ims->ims_stp != 0;
 		if (inuse)
 			continue;
 
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
 		free(ims, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Purge all source nodes from an in_multi's filter set, freeing each
  * node and decrementing the group's source count as it goes.
  */
 static void
 inm_purge(struct in_multi *inm)
 {
 	struct ip_msource	*cur, *next;
 
 	RB_FOREACH_SAFE(cur, ip_msource_tree, &inm->inm_srcs, next) {
 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, cur);
 
 		/* Unlink first, then release the node's memory. */
 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, cur);
 		free(cur, M_IPMSOURCE);
 		inm->inm_nsrc--;
 	}
 }
 
 /*
  * Join a multicast group; unlocked entry point.
  *
  * Takes the IN_MULTI lock, calls in_joingroup_locked() with the same
  * arguments, releases the lock and returns that function's result.
  *
  * SMPng: XXX: in_joingroup() is called from in_control() when Giant
  * is not held. Fortunately, ifp is unlikely to have been detached
  * at this point, so we assume it's OK to recurse.
  */
 int
 in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_joingroup_locked(ifp, gina, imf, pinm);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * The IN_MULTI lock must be held and the IN_MULTI list lock must not be
  * held on entry (both asserted); the list lock is taken and released
  * internally.
  *
  * If the IGMP downcall fails, the group is not joined, and an error
  * code is returned.  The same applies when in_getmulti() or the state
  * merge fails.  *pinm is written only on success.
  */
 int
 in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
 {
 	struct in_mfilter	 timf;
 	struct in_multi		*inm;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_IGMPV3, "%s: join 0x%08x on %p(%s))", __func__,
 	    ntohl(gina->s_addr), ifp, ifp->if_xname);
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 
 	/* Obtain a referenced in_multi; nothing to undo if this fails. */
 	error = in_getmulti(ifp, gina, &inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
 		return (error);
 	}
 	IN_MULTI_LIST_LOCK();
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		goto out_inm_release;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
 		goto out_inm_release;
 	}
 
  out_inm_release:
 	if (error) {
 		/* Give back the reference obtained from in_getmulti(). */
 		CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 		IF_ADDR_WLOCK(ifp);
 		inm_release_deferred(inm);
 		IF_ADDR_WUNLOCK(ifp);
 	} else {
 		*pinm = inm;
 	}
 	IN_MULTI_LIST_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  *
  * Takes the IN_MULTI lock, calls in_leavegroup_locked() with the same
  * arguments, releases the lock and returns that function's result.
  */
 int
 in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	int error;
 
 	IN_MULTI_LOCK();
 	error = in_leavegroup_locked(inm, imf);
 	IN_MULTI_UNLOCK();
 
 	return (error);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * Only preserves atomicity at inm level.
  *
  * The IN_MULTI lock must be held and the IN_MULTI list lock must not be
  * held on entry (both asserted); the list lock is taken and released
  * internally.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as inm_release(*) as this function also
  * makes a state change downcall into IGMP.
  *
  * Returns the result of igmp_change_state(); the caller's reference on
  * inm is dropped regardless of that result.
  */
 int
 in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
 {
 	struct in_mfilter	 timf;
 	int			 error;
 
 	IN_MULTI_LOCK_ASSERT();
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 
 	error = 0;
 
 	CTR5(KTR_IGMPV3, "%s: leave inm %p, 0x%08x/%s, imf %p", __func__,
 	    inm, ntohl(inm->inm_addr.s_addr),
 	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	/*
 	 * NOTE(review): the trace above tolerates a detached ifp, but
 	 * inm->inm_ifp is dereferenced unconditionally from here on --
 	 * confirm callers never pass an inm whose ifp is detached.
 	 */
 	CURVNET_SET(inm->inm_ifp->if_vnet);
 	error = igmp_change_state(inm);
 	IF_ADDR_WLOCK(inm->inm_ifp);
 	inm_release_deferred(inm);
 	IF_ADDR_WUNLOCK(inm->inm_ifp);
 	IN_MULTI_LIST_UNLOCK();
 	CURVNET_RESTORE();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 	/* The reference was already dropped above; only the pointer is traced. */
 	CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
 
 	return (error);
 }
 
 /*#ifndef BURN_BRIDGES*/
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An IGMP downcall will be performed.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker		 et;
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	uint16_t			 fmode;
 	int				 error, doblock;
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs,
 		    sizeof(struct ip_mreq_source),
 		    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		ssa->sin.sin_family = AF_INET;
 		ssa->sin.sin_len = sizeof(struct sockaddr_in);
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		if (!in_nullhost(mreqs.imr_interface)) {
 			NET_EPOCH_ENTER(et);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			/* XXXGL: ifref? */
 			NET_EPOCH_EXIT(et);
 		}
 		if (sopt->sopt_name == IP_BLOCK_SOURCE)
 			doblock = 1;
 
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	    }
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (ssa->sin.sin_family != AF_INET ||
 		    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Check if we are actually a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imf->imf_inm;
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->imf_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_inp_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = imo_match_source(imf, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent", __func__,
 		    ntohl(ssa->sin.sin_addr.s_addr), doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	if (doblock) {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		ims = imf_graft(imf, fmode, &ssa->sin);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		error = imf_prune(imf, &ssa->sin);
 	}
 
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
 		goto out_imf_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
 		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Return the multicast options block attached to an inpcb, allocating and
  * attaching a default-initialised block when the inpcb has none yet.
  *
  * The inpcb is passed in unlocked; on return the INP write lock is held.
  * The allocation uses M_WAITOK, so the lock is dropped around it and the
  * attachment is checked a second time once the lock has been retaken.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip_moptions *
 inp_findmoptions(struct inpcb *inp)
 {
 	struct ip_moptions	*fresh;
 
 	INP_WLOCK(inp);
 	if (inp->inp_moptions == NULL) {
 		/* Do not hold the INP lock across a sleeping allocation. */
 		INP_WUNLOCK(inp);
 
 		fresh = malloc(sizeof(*fresh), M_IPMOPTS, M_WAITOK);
 
 		/* Defaults for a socket that has set no multicast options. */
 		STAILQ_INIT(&fresh->imo_head);
 		fresh->imo_multicast_loop = in_mcast_loop;
 		fresh->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
 		fresh->imo_multicast_vif = -1;
 		fresh->imo_multicast_addr.s_addr = INADDR_ANY;
 		fresh->imo_multicast_ifp = NULL;
 
 		INP_WLOCK(inp);
 		if (inp->inp_moptions == NULL) {
 			inp->inp_moptions = fresh;
 		} else {
 			/* Another thread attached a block meanwhile. */
 			free(fresh, M_IPMOPTS);
 		}
 	}
 	return (inp->inp_moptions);
 }
 
 /*
  * Dispose of a multicast options block: every membership still on its
  * list is unlinked, left and freed, and then the block itself is freed.
  * A NULL imo is accepted and ignored.
  */
 void
 inp_freemoptions(struct ip_moptions *imo)
 {
 	struct in_mfilter *filt;
 	struct in_multi *grp;
 	struct ifnet *grp_ifp;
 
 	if (imo == NULL)
 		return;
 
 	for (;;) {
 		filt = ip_mfilter_first(&imo->imo_head);
 		if (filt == NULL)
 			break;
 		ip_mfilter_remove(&imo->imo_head, filt);
 
 		imf_leave(filt);
 		grp = filt->imf_inm;
 		if (grp != NULL) {
 			grp_ifp = grp->inm_ifp;
 			if (grp_ifp == NULL) {
 				(void)in_leavegroup(grp, filt);
 			} else {
 				/* Leave with curvnet set to the ifnet's vnet. */
 				CURVNET_SET(grp_ifp->if_vnet);
 				(void)in_leavegroup(grp, filt);
 				CURVNET_RESTORE();
 			}
 		}
 		ip_mfilter_free(filt);
 	}
 	free(imo, M_IPMOPTS);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv4 multicast group.
  * Called with INP lock held; returns with lock released.
  */
 static int
 inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	struct in_mfilter	*imf;
 	struct ip_msource	*ims;
 	struct in_msource	*lims;
 	struct sockaddr_in	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->inp_moptions;
 	KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
 
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifnet pointer left */
 	if (ifp == NULL)
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->imf_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->imf_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
 		lims = (struct in_msource *)ims;
 		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
 		    lims->imsl_st[0] != imf->imf_st[0])
 			continue;
 		++ncsrcs;
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in *)ptss;
 			psin->sin_family = AF_INET;
 			psin->sin_len = sizeof(struct sockaddr_in);
 			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
 			psin->sin_port = 0;
 			++ptss;
 			--nsrcs;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  */
 int
 inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_mreqn		 mreqn;
 	struct ip_moptions	*imo;
 	struct ifnet		*ifp;
 	struct in_ifaddr	*ia;
 	int			 error, optval;
 	u_char			 coptval;
 
 	INP_WLOCK(inp);
 	imo = inp->inp_moptions;
 	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM reject it. */
 	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF:
 		if (imo != NULL)
 			optval = imo->imo_multicast_vif;
 		else
 			optval = -1;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_IF:
 		memset(&mreqn, 0, sizeof(struct ip_mreqn));
 		if (imo != NULL) {
 			ifp = imo->imo_multicast_ifp;
 			if (!in_nullhost(imo->imo_multicast_addr)) {
 				mreqn.imr_address = imo->imo_multicast_addr;
 			} else if (ifp != NULL) {
 				struct epoch_tracker et;
 
 				mreqn.imr_ifindex = ifp->if_index;
 				NET_EPOCH_ENTER(et);
 				IFP_TO_IA(ifp, ia);
 				if (ia != NULL)
 					mreqn.imr_address =
 					    IA_SIN(ia)->sin_addr;
 				NET_EPOCH_EXIT(et);
 			}
 		}
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
 			error = sooptcopyout(sopt, &mreqn,
 			    sizeof(struct ip_mreqn));
 		} else {
 			error = sooptcopyout(sopt, &mreqn.imr_address,
 			    sizeof(struct in_addr));
 		}
 		break;
 
 	case IP_MULTICAST_TTL:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
 		else
 			optval = coptval = imo->imo_multicast_ttl;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MULTICAST_LOOP:
 		if (imo == NULL)
 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
 		else
 			optval = coptval = imo->imo_multicast_loop;
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize == sizeof(u_char))
 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
 		else
 			error = sooptcopyout(sopt, &optval, sizeof(int));
 		break;
 
 	case IP_MSFILTER:
 		if (imo == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			error = inp_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Choose the ifnet for a multicast group membership from an IPv4
  * interface address and the IPv4 group address.
  *
  * This exists for legacy multicast applications which are unaware that
  * memberships are scoped to specific physical links, or which need to
  * join link-scope groups before any IPv4 address has been configured.
  *
  * Selection order:
  *  1. If ina is not INADDR_ANY, the interface that owns that address.
  *  2. Otherwise, the interface of the unicast route to the group address
  *     in this socket's FIB; usually this is the default next-hop.
  *  3. If that lookup fails, the first address-list entry whose interface
  *     is multicast capable and not a loopback.  The legacy IPv4 ASM API
  *     needs this last resort so that groups can be joined during boot,
  *     before the routing table has been populated.
  *
  * Must be called inside the network epoch.  Returns NULL if no ifp could
  * be found, otherwise returns a referenced ifp.
  *
  * FUTURE: Implement IPv4 source-address selection.
  */
 static struct ifnet *
 inp_lookup_mcast_ifp(const struct inpcb *inp,
     const struct sockaddr_in *gsin, const struct in_addr ina)
 {
 	struct in_ifaddr *cand_ia;
 	struct nhop_object *nh;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(inp != NULL, ("%s: inp must not be NULL", __func__));
 	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
 	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
 	    ("%s: not multicast", __func__));
 
 	/* 1. An explicit interface address was supplied. */
 	if (!in_nullhost(ina)) {
 		INADDR_TO_IFP(ina, ifp);
 		if (ifp != NULL)
 			if_ref(ifp);
 		return (ifp);
 	}
 
 	/* 2. Follow the unicast route towards the group address. */
 	nh = fib4_lookup(inp->inp_inc.inc_fibnum, gsin->sin_addr, 0, NHR_NONE, 0);
 	if (nh != NULL) {
 		ifp = nh->nh_ifp;
 		if_ref(ifp);
 		return (ifp);
 	}
 
 	/* 3. Last resort: first usable interface on the address list. */
 	CK_STAILQ_FOREACH(cand_ia, &V_in_ifaddrhead, ia_link) {
 		ifp = cand_ia->ia_ifp;
 		if ((ifp->if_flags & IFF_LOOPBACK) != 0 ||
 		    (ifp->if_flags & IFF_MULTICAST) == 0)
 			continue;
 		if_ref(ifp);
 		return (ifp);
 	}
 
 	return (NULL);
 }
 
 /*
  * Join an IPv4 multicast group, possibly with a source.
  *
  * Serves IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, MCAST_JOIN_GROUP and
  * MCAST_JOIN_SOURCE_GROUP; any other option name yields EOPNOTSUPP.
  * Returns 0 on success or an errno value.
  *
  * The inpcb is passed in unlocked.  inp_findmoptions() takes the INP write
  * lock, and it is released again before returning.
  *
  * Every lookup path below yields either NULL or a referenced ifp; the
  * reference is dropped with if_rele() before returning.
  */
 static int
 inp_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_multi			*inm;
 	struct in_msource		*lims;
 	struct epoch_tracker		 et;
 	int				 error, is_new;
 
 	ifp = NULL;
 	lims = NULL;
 	error = 0;
 
 	/*
 	 * gsa and ssa alias the group and source slots of gsr.  A family of
 	 * AF_UNSPEC left in ssa means "no source was supplied"; the code
 	 * below tests for exactly that.
 	 */
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_ADD_MEMBERSHIP: {
 		struct ip_mreqn mreqn;
 
 		/*
 		 * The argument may be a struct ip_mreqn or the shorter
 		 * struct ip_mreq; the option length tells them apart.
 		 */
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn))
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreqn), sizeof(struct ip_mreqn));
 		else
 			error = sooptcopyin(sopt, &mreqn,
 			    sizeof(struct ip_mreq), sizeof(struct ip_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqn.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		/*
 		 * imr_ifindex is only consulted when a full ip_mreqn was
 		 * copied in; otherwise fall back to address-based lookup.
 		 */
 		NET_EPOCH_ENTER(et);
 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn) &&
 		    mreqn.imr_ifindex != 0)
 			ifp = ifnet_byindex_ref(mreqn.imr_ifindex);
 		else
 			ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 			    mreqn.imr_address);
 		NET_EPOCH_EXIT(et);
 		break;
 	}
 	case IP_ADD_SOURCE_MEMBERSHIP: {
 		struct ip_mreq_source	 mreqs;
 
 		error = sooptcopyin(sopt, &mreqs, sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = ssa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = ssa->sin.sin_len =
 		    sizeof(struct sockaddr_in);
 
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 
 		NET_EPOCH_ENTER(et);
 		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
 		    mreqs.imr_interface);
 		NET_EPOCH_EXIT(et);
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 		break;
 	}
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		/*
 		 * Both requests are copied into gsr; MCAST_JOIN_GROUP only
 		 * copies sizeof(struct group_req), so the source slot keeps
 		 * its AF_UNSPEC marker in that case.
 		 */
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		/*
 		 * Overwrite the port field if present, as the sockaddr
 		 * being copied in may be matched with a binary comparison.
 		 */
 		gsa->sin.sin_port = 0;
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 			ssa->sin.sin_port = 0;
 		}
 
 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 			return (EINVAL);
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex_ref(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	/*
 	 * No interface, or one without multicast support.  Drop the
 	 * reference taken by the lookup before bailing out.
 	 */
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
 		if (ifp != NULL)
 			if_rele(ifp);
 		return (EADDRNOTAVAIL);
 	}
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 * inp_findmoptions() returns with the INP write lock held.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		is_new = 1;
 		inm = NULL;
 
 		/* Enforce the per-socket membership limit. */
 		if (ip_mfilter_count(&imo->imo_head) >= IP_MAX_MEMBERSHIPS) {
 			error = ENOMEM;
 			goto out_inp_locked;
 		}
 	} else {
 		is_new = 0;
 		inm = imf->imf_inm;
 
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->imf_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_inp_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of inm_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = imo_match_source(imf, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->imsl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_inp_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP on an existing exclusive
 			 * membership is an error; return EADDRINUSE
 			 * to preserve 4.4BSD API idempotence, and
 			 * avoid tedious detour to code below.
 			 * NOTE: This is bending RFC 3678 a bit.
 			 *
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EINVAL;
 			if (imf->imf_st[1] == MCAST_EXCLUDE)
 				error = EADDRINUSE;
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
 			imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_inp_locked;
 			}
 		} else {
 			CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
 		}
 		lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
 		if (lims == NULL) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_inp_locked;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
 			imf = ip_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_inp_locked;
 			}
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	if (is_new) {
 		/*
 		 * The INP lock is dropped around in_joingroup_locked().
 		 * An inpcb reference is taken first and released once the
 		 * lock has been retaken.
 		 */
 		in_pcbref(inp);
 		INP_WUNLOCK(inp);
 
 		error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
 		    &imf->imf_inm);
 
 		INP_WLOCK(inp);
 		/*
 		 * A true return here is handled as "inpcb is gone": fail
 		 * with ENXIO and skip the INP unlock at out_inp_locked.
 		 */
 		if (in_pcbrele_wlocked(inp)) {
 			error = ENXIO;
 			goto out_inp_unlocked;
 		}
 		if (error) {
                         CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed",
                             __func__);
 			goto out_inp_locked;
 		}
 		/*
 		 * NOTE: Refcount from in_joingroup_locked()
 		 * is protecting membership.
 		 */
 		ip_mfilter_insert(&imo->imo_head, imf);
 	} else {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 				 __func__);
 			IN_MULTI_LIST_UNLOCK();
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Success.  Clearing imf tells the cleanup code below that there
 	 * is no half-built filter left to free.
 	 */
 	imf_commit(imf);
 	imf = NULL;
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 out_inp_unlocked:
 	IN_MULTI_UNLOCK();
 
 	/*
 	 * A filter allocated above for a new membership was never inserted
 	 * into the socket's list if we get here with imf still set.
 	 * Release the in_multi it may point at, then free the filter.
 	 */
 	if (is_new && imf) {
 		if (imf->imf_inm != NULL) {
 			IN_MULTI_LIST_LOCK();
 			IF_ADDR_WLOCK(ifp);
 			inm_release_deferred(imf->imf_inm);
 			IF_ADDR_WUNLOCK(ifp);
 			IN_MULTI_LIST_UNLOCK();
 		}
 		ip_mfilter_free(imf);
 	}
 	/* Drop the interface reference taken by the lookup above. */
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Leave an IPv4 multicast group on an inpcb, possibly with a source.
  *
  * Serves IP_DROP_MEMBERSHIP, IP_DROP_SOURCE_MEMBERSHIP, MCAST_LEAVE_GROUP
  * and MCAST_LEAVE_SOURCE_GROUP; any other option name yields EOPNOTSUPP.
  * Returns 0 on success or an errno value.
  *
  * When no source is supplied the whole membership is dropped ("final"
  * leave).  When a source is supplied only that source is pruned from the
  * membership's filter list.
  *
  * The inpcb is passed in unlocked.  inp_findmoptions() takes the INP write
  * lock, and it is released again before returning.
  */
 static int
 inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker		 et;
 	struct group_source_req		 gsr;
 	struct ip_mreq_source		 mreqs;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in_mfilter		*imf;
 	struct ip_moptions		*imo;
 	struct in_msource		*ims;
 	struct in_multi			*inm;
 	int				 error;
 	bool				 is_final;
 
 	ifp = NULL;
 	error = 0;
 	is_final = true;
 
 	/*
 	 * gsa and ssa alias the group and source slots of gsr.  A family of
 	 * AF_UNSPEC left in ssa means "no source was supplied".
 	 */
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	switch (sopt->sopt_name) {
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 		if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq),
 			    sizeof(struct ip_mreq));
 			/*
 			 * Swap interface and sourceaddr arguments,
 			 * as ip_mreq and ip_mreq_source are laid
 			 * out differently.
 			 *
 			 * Only do so once the copy-in has succeeded;
 			 * on failure mreqs is uninitialised stack
 			 * memory and must not be read.
 			 */
 			if (error == 0) {
 				mreqs.imr_interface = mreqs.imr_sourceaddr;
 				mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
 			}
 		} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			error = sooptcopyin(sopt, &mreqs,
 			    sizeof(struct ip_mreq_source),
 			    sizeof(struct ip_mreq_source));
 		}
 		if (error)
 			return (error);
 
 		gsa->sin.sin_family = AF_INET;
 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
 
 		if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
 			ssa->sin.sin_family = AF_INET;
 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
 		}
 
 		/*
 		 * Attempt to look up hinted ifp from interface address.
 		 * Fallthrough with null ifp iff lookup fails, to
 		 * preserve 4.4BSD mcast API idempotence.
 		 * XXX NOTE WELL: The RFC 3678 API is preferred because
 		 * using an IPv4 address as a key is racy.
 		 */
 		if (!in_nullhost(mreqs.imr_interface)) {
 			NET_EPOCH_ENTER(et);
 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
 			/* XXXGL ifref? */
 			NET_EPOCH_EXIT(et);
 		}
 		CTR3(KTR_IGMPV3, "%s: imr_interface = 0x%08x, ifp = %p",
 		    __func__, ntohl(mreqs.imr_interface.s_addr), ifp);
 
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		/*
 		 * Both requests are copied into gsr; MCAST_LEAVE_GROUP only
 		 * copies sizeof(struct group_req), so the source slot keeps
 		 * its AF_UNSPEC marker in that case.
 		 */
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin.sin_family != AF_INET ||
 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin.sin_family != AF_INET ||
 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
 				return (EINVAL);
 		}
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 * inp_findmoptions() returns with the INP write lock held.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imf->imf_inm;
 
 	/* A supplied source turns this into a per-source leave. */
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = false;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		ip_mfilter_remove(&imo->imo_head, imf);
 		imf_leave(imf);
 
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void) in_leavegroup_locked(imf->imf_inm, imf);
 	} else {
 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		ims = imo_match_source(imf, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_IGMPV3, "%s: source 0x%08x %spresent",
 			    __func__, ntohl(ssa->sin.sin_addr.s_addr), "not ");
 			error = EADDRNOTAVAIL;
 			goto out_inp_locked;
 		}
 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
 		error = imf_prune(imf, &ssa->sin);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
 			    __func__);
 			goto out_inp_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	if (!is_final) {
 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 		IN_MULTI_LIST_LOCK();
 		error = inm_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
 			    __func__);
 			IN_MULTI_LIST_UNLOCK();
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 
 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 		error = igmp_change_state(inm);
 		IN_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
 			    __func__);
 			imf_rollback(imf);
 			imf_reap(imf);
 			goto out_inp_locked;
 		}
 	}
 	imf_commit(imf);
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 
 	/*
 	 * On a final leave the filter was unlinked from the socket's list
 	 * above, so it is freed here.  imf is NULL when no membership
 	 * matched.
 	 */
 	if (is_final && imf)
 		ip_mfilter_free(imf);
 
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Choose the interface used to transmit IPv4 multicast datagrams from
  * this socket.
  *
  * The option value is either a struct ip_mreqn (interface named by
  * index) or a bare struct in_addr (interface named by one of its IPv4
  * addresses); the option length tells the two apart.  An interface index
  * of 0 or an address of INADDR_ANY removes a previous selection, in which
  * case an interface is chosen for every send.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct in_addr		 addr;
 	struct ip_mreqn		 mreqn;
 	struct ifnet		*ifp;
 	struct ip_moptions	*imo;
 	int			 error;
 
 	/* NULL means "no interface selected" unless a lookup succeeds. */
 	ifp = NULL;
 
 	if (sopt->sopt_valsize != sizeof(struct ip_mreqn)) {
 		/*
 		 * Traditional BSD usage: the interface is identified by
 		 * an IPv4 address.
 		 */
 		error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
 		    sizeof(struct in_addr));
 		if (error)
 			return (error);
 		if (!in_nullhost(addr)) {
 			NET_EPOCH_ENTER(et);
 			INADDR_TO_IFP(addr, ifp);
 			/* XXXGL ifref? */
 			NET_EPOCH_EXIT(et);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = 0x%08x", __func__, ifp,
 		    ntohl(addr.s_addr));
 	} else {
 		/*
 		 * Linux-derived usage: the interface is identified by the
 		 * index carried in a struct ip_mreqn.
 		 */
 		error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
 		    sizeof(struct ip_mreqn));
 		if (error)
 			return (error);
 
 		if (mreqn.imr_ifindex < 0)
 			return (EINVAL);
 
 		if (mreqn.imr_ifindex != 0) {
 			NET_EPOCH_ENTER(et);
 			ifp = ifnet_byindex(mreqn.imr_ifindex);
 			NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 	}
 
 	/* An interface without multicast support cannot be selected. */
 	if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EOPNOTSUPP);
 
 	/* inp_findmoptions() hands the inpcb back write-locked. */
 	imo = inp_findmoptions(inp);
 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
 	imo->imo_multicast_ifp = ifp;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv4 multicast group.
  *
  * The request names the group, the interface index, the new filter mode
  * (MCAST_INCLUDE or MCAST_EXCLUDE) and an optional vector of sources.
  * The socket must already be a member of the group on that interface.
  * Returns 0 on success or an errno value.  If a failure occurs once the
  * membership has been found, the pending filter state is rolled back.
  *
  * The inpcb is passed in unlocked.  inp_findmoptions() takes the INP write
  * lock, and it is released again before returning.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  */
 static int
 inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in_mfilter	*imf;
 	struct ip_moptions	*imo;
 	struct in_multi		*inm;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	/* Validate the request before taking any lock. */
 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
 	     msfr.msfr_fmode != MCAST_INCLUDE))
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
 		return (EINVAL);
 
 	gsa->sin.sin_port = 0;	/* ignore port */
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 
 	IN_MULTI_LOCK();
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = inp_findmoptions(inp);
 	imf = imo_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_inp_locked;
 	}
 	inm = imf->imf_inm;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->imf_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in_msource	*lims;
 		struct sockaddr_in	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		/* The INP lock is not held across malloc() and copyin(). */
 		INP_WUNLOCK(inp);
 
 		CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			free(kss, M_TEMP);
 			/*
 			 * Do not return directly from here: IN_MULTI_LOCK()
 			 * is still held and imf_st[1] was already changed
 			 * above.  Retake the INP lock and leave through
 			 * the rollback path so that the pending mode is
 			 * restored and both locks are released.
 			 */
 			INP_WLOCK(inp);
 			goto out_imf_rollback;
 		}
 
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as imf_leave()
 		 * will set it to INCLUDE.
 		 */
 		imf_leave(imf);
 		imf->imf_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in *)pkss;
 			if (psin->sin_family != AF_INET) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				break;
 			}
 			error = imf_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			lims->imsl_st[1] = imf->imf_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	if (error)
 		goto out_imf_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at IGMP layer.
 	 */
 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
 	IN_MULTI_LIST_LOCK();
 	error = inm_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
 		IN_MULTI_LIST_UNLOCK();
 		goto out_imf_rollback;
 	}
 
 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
 	error = igmp_change_state(inm);
 	IN_MULTI_LIST_UNLOCK();
 	if (error)
 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
 
 	/* Reached with the INP write lock held, on success or failure. */
 out_imf_rollback:
 	if (error)
 		imf_rollback(imf);
 	else
 		imf_commit(imf);
 
 	imf_reap(imf);
 
 out_inp_locked:
 	INP_WUNLOCK(inp);
 	IN_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv4 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
  * is refactored to no longer use vifs.
  */
 int
 inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip_moptions	*imo;
 	int			 error;
 
 	error = 0;
 
 	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
 	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)
 		return (EOPNOTSUPP);
 
 	switch (sopt->sopt_name) {
 	case IP_MULTICAST_VIF: {
 		int vifi;
 		/*
 		 * Select a multicast VIF for transmission.
 		 * Only useful if multicast forwarding is active.
 		 */
 		if (legal_vif_num == NULL) {
 			error = EOPNOTSUPP;
 			break;
 		}
 		error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
 		if (error)
 			break;
 		if (!legal_vif_num(vifi) && (vifi != -1)) {
 			error = EINVAL;
 			break;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_vif = vifi;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_IF:
 		error = inp_set_multicast_if(inp, sopt);
 		break;
 
 	case IP_MULTICAST_TTL: {
 		u_char ttl;
 
 		/*
 		 * Set the IP time-to-live for outgoing multicast packets.
 		 * The original multicast API required a char argument,
 		 * which is inconsistent with the rest of the socket API.
 		 * We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &ttl, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int ittl;
 
 			error = sooptcopyin(sopt, &ittl, sizeof(u_int),
 			    sizeof(u_int));
 			if (error)
 				break;
 			if (ittl > 255) {
 				error = EINVAL;
 				break;
 			}
 			ttl = (u_char)ittl;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_ttl = ttl;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_MULTICAST_LOOP: {
 		u_char loop;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.  The original multicast API required a
 		 * char argument, which is inconsistent with the rest
 		 * of the socket API.  We allow either a char or an int.
 		 */
 		if (sopt->sopt_valsize == sizeof(u_char)) {
 			error = sooptcopyin(sopt, &loop, sizeof(u_char),
 			    sizeof(u_char));
 			if (error)
 				break;
 		} else {
 			u_int iloop;
 
 			error = sooptcopyin(sopt, &iloop, sizeof(u_int),
 					    sizeof(u_int));
 			if (error)
 				break;
 			loop = (u_char)iloop;
 		}
 		imo = inp_findmoptions(inp);
 		imo->imo_multicast_loop = !!loop;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IP_ADD_MEMBERSHIP:
 	case IP_ADD_SOURCE_MEMBERSHIP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		error = inp_join_group(inp, sopt);
 		break;
 
 	case IP_DROP_MEMBERSHIP:
 	case IP_DROP_SOURCE_MEMBERSHIP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		error = inp_leave_group(inp, sopt);
 		break;
 
 	case IP_BLOCK_SOURCE:
 	case IP_UNBLOCK_SOURCE:
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = inp_block_unblock_source(inp, sopt);
 		break;
 
 	case IP_MSFILTER:
 		error = inp_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Expose IGMP's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in_addr.
  * For use by ifmcstat(8).
  *
  * The node is read-only (EPERM on a write attempt) and requires exactly
  * two name components: name[0] is the interface index and name[1] the
  * group address, which must be a multicast address (EINVAL otherwise).
  * ENOENT is returned when no interface has the given index.
  *
  * SMPng: NOTE: unlocked read of ifindex space.
  */
 static int
 sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in_addr			 src, group;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in_multi			*inm;
 	struct ip_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 2)
 		return (EINVAL);
 
 	group.s_addr = name[1];
 	if (!IN_MULTICAST(ntohl(group.s_addr))) {
 		CTR2(KTR_IGMPV3, "%s: group 0x%08x is not multicast",
 		    __func__, ntohl(group.s_addr));
 		return (EINVAL);
 	}
 
 	ifindex = name[0];
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		NET_EPOCH_EXIT(et);
 		CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 
 	/* Size the wired buffer for the mode word plus the maximum sources. */
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
 	if (retval) {
 		NET_EPOCH_EXIT(et);
 		return (retval);
 	}
 
 	IN_MULTI_LIST_LOCK();
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = inm_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		if (!in_hosteq(inm->inm_addr, group))
 			continue;
 		fmode = inm->inm_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
 			CTR2(KTR_IGMPV3, "%s: visit node 0x%08x", __func__,
 			    ims->ims_haddr);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != ims_get_mode(inm, ims, 1)) {
 				CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src.s_addr = htonl(ims->ims_haddr);
 			retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
 			if (retval != 0)
 				break;
 		}
 		/*
 		 * A copy-out failure inside the source walk must also end
 		 * the membership scan; otherwise a later matching entry
 		 * could overwrite the error with a successful status.
 		 */
 		if (retval != 0)
 			break;
 	}
 
 	IN_MULTI_LIST_UNLOCK();
 	NET_EPOCH_EXIT(et);
 
 	return (retval);
 }
 
 #if defined(KTR) && (KTR_COMPILE & KTR_IGMPV3)
 
 /*
  * Short printable names for the source filter modes, indexed by the
  * MCAST_* mode value.  The static assertion below fails the build if the
  * table no longer spans exactly MCAST_UNDEFINED..MCAST_EXCLUDE.
  */
 static const char *inm_modestrs[] = {
 	[MCAST_UNDEFINED] = "un",
 	[MCAST_INCLUDE] = "in",
 	[MCAST_EXCLUDE] = "ex",
 };
 _Static_assert(MCAST_UNDEFINED == 0 &&
 	       MCAST_EXCLUDE + 1 == nitems(inm_modestrs),
 	       "inm_modestrs: no longer matches #defines");
 
 /*
  * Map a filter mode to its short name from inm_modestrs[].
  * Values outside MCAST_UNDEFINED..MCAST_EXCLUDE yield "??".
  */
 static const char *
 inm_mode_str(const int mode)
 {
 
 	if (mode < MCAST_UNDEFINED || mode > MCAST_EXCLUDE)
 		return ("??");
 	return (inm_modestrs[mode]);
 }
 
 /*
  * Printable names for the IGMP per-group membership states, indexed by
  * the IGMP_*_MEMBER state value.  The static assertion below fails the
  * build if the table no longer spans exactly
  * IGMP_NOT_MEMBER..IGMP_LEAVING_MEMBER.
  */
 static const char *inm_statestrs[] = {
 	[IGMP_NOT_MEMBER] = "not-member",
 	[IGMP_SILENT_MEMBER] = "silent",
 	[IGMP_REPORTING_MEMBER] = "reporting",
 	[IGMP_IDLE_MEMBER] = "idle",
 	[IGMP_LAZY_MEMBER] = "lazy",
 	[IGMP_SLEEPING_MEMBER] = "sleeping",
 	[IGMP_AWAKENING_MEMBER] = "awakening",
 	[IGMP_G_QUERY_PENDING_MEMBER] = "query-pending",
 	[IGMP_SG_QUERY_PENDING_MEMBER] = "sg-query-pending",
 	[IGMP_LEAVING_MEMBER] = "leaving",
 };
 _Static_assert(IGMP_NOT_MEMBER == 0 &&
 	       IGMP_LEAVING_MEMBER + 1 == nitems(inm_statestrs),
 	       "inm_statestrs: no longer matches #defines");
 
 /*
  * Map an IGMP membership state to its name from inm_statestrs[].
  * Values outside IGMP_NOT_MEMBER..IGMP_LEAVING_MEMBER yield "??".
  */
 static const char *
 inm_state_str(const int state)
 {
 
 	if (state < IGMP_NOT_MEMBER || state > IGMP_LEAVING_MEMBER)
 		return ("??");
 	return (inm_statestrs[state]);
 }
 
 /*
  * Dump an in_multi structure to the console.
  *
  * Nothing is printed unless KTR_IGMPV3 is set in ktr_mask.  The output
  * covers the group address and interface, timer/state/refcount fields,
  * and the filter-mode counters of both inm_st[] slots.
  */
 void
 inm_print(const struct in_multi *inm)
 {
 	char addrbuf[INET_ADDRSTRLEN];
 	int t;
 
 	if (!(ktr_mask & KTR_IGMPV3))
 		return;
 
 	printf("%s: --- begin inm %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    inet_ntoa_r(inm->inm_addr, addrbuf), inm->inm_ifp,
 	    inm->inm_ifp->if_xname, inm->inm_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->inm_timer, inm_state_str(inm->inm_state),
 	    inm->inm_refcount, inm->inm_scq.mq_len);
 	printf("igi %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->inm_igi, inm->inm_nsrc, inm->inm_sctimer, inm->inm_scrv);
 
 	/* One line per inm_st[] slot (index 0, then index 1). */
 	t = 0;
 	while (t < 2) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
 		    inm_mode_str(inm->inm_st[t].iss_fmode),
 		    inm->inm_st[t].iss_asm, inm->inm_st[t].iss_ex,
 		    inm->inm_st[t].iss_in, inm->inm_st[t].iss_rec);
 		t++;
 	}
 	printf("%s: --- end inm %p ---\n", __func__, inm);
 }
 
 #else /* !KTR || !(KTR_COMPILE & KTR_IGMPV3) */
 
 /*
  * Stub used when KTR IGMPv3 tracing is not compiled in: does nothing, so
  * callers need no conditional compilation of their own.
  */
 void
 inm_print(const struct in_multi *inm)
 {
 
 }
 
 #endif /* KTR && (KTR_COMPILE & KTR_IGMPV3) */
 
 /*
  * Emit the red-black tree routines for ip_msource_tree: nodes are
  * struct ip_msource linked through ims_link and ordered by
  * ip_msource_cmp().
  */
 RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index e7f425f8593a..0a99981d059b 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1,3339 +1,3340 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2007-2009 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/hash.h>
 #include <sys/systm.h>
 #include <sys/libkern.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_pcb_var.h>
 #include <netinet/tcp.h>
 #ifdef INET
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 #endif
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #endif /* INET6 */
 #include <net/route/nhop.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <security/mac/mac_framework.h>
 
 #define	INPCBLBGROUP_SIZMIN	8
 #define	INPCBLBGROUP_SIZMAX	256
 #define	INP_FREED	0x00000200	/* See in_pcb.h. */
 
 /*
  * These configure the range of local port addresses assigned to
  * "unspecified" outgoing connections/packets/whatever.
  *
  * Note that the "low" range is given with first > last; the allocator
  * in in_pcb_lport_dest() swaps the bounds when first > last, so either
  * ordering is accepted.
  */
 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */
 
 /*
  * Reserved ports accessible only to root. There are significant
  * security considerations that must be accounted for when changing these,
  * but the security benefits can be great. Please be careful.
  */
 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
 /* No explicit initializer is given for the lower bound. */
 VNET_DEFINE(int, ipport_reservedlow);
 
 /* Enable random ephemeral port allocation by default. */
 VNET_DEFINE(int, ipport_randomized) = 1;
 
 #ifdef INET
 /*
  * Forward declaration: the definition appears later in the file, but
  * in_pcb_lport_dest() calls it before that point.
  */
 static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
 			    struct in_addr faddr, u_int fport_arg,
 			    struct in_addr laddr, u_int lport_arg,
 			    int lookupflags, struct ifnet *ifp,
 			    uint8_t numa_domain);
 
 /*
  * Clamp the lvalue 'var' into the closed interval [min, max].
  *
  * Wrapped in do { } while (0) so the expansion is a single statement:
  * "RANGECHK(...);" is then safe as the body of an unbraced if/else,
  * which a bare if/else-if expansion is not.
  */
 #define RANGECHK(var, min, max) do {					\
 	if ((var) < (min))						\
 		(var) = (min);						\
 	else if ((var) > (max))						\
 		(var) = (max);						\
 } while (0)
 
 /*
  * Sysctl handler shared by the integer port-range knobs declared below.
  * The value is processed by sysctl_handle_int(); when that succeeds,
  * all six automatic port-range bounds are clamped back into their
  * permitted intervals, not only the one that was touched.
  */
 static int
 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
 {
 	int rc;
 
 	rc = sysctl_handle_int(oidp, arg1, arg2, req);
 	if (rc != 0)
 		return (rc);
 
 	/* Low (reserved) range: 1 .. IPPORT_RESERVED - 1. */
 	RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
 	RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
 
 	/* Default and high ranges: IPPORT_RESERVED .. IPPORT_MAX. */
 	RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
 	RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
 
 	return (0);
 }
 
 #undef RANGECHK
 
 /*
  * net.inet.ip.portrange: tunables for automatic local port selection.
  * The six range bounds go through sysctl_net_ipport_check() so that they
  * are clamped after every write; the remaining knobs are plain integers.
  * Every variable here is a VNET_DEFINE()d per-vnet variable, so every
  * OID carries CTLFLAG_VNET.
  */
 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IP Ports");
 
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
     "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedhigh), 0, "");
 /*
  * ipport_reservedlow is per-vnet like reservedhigh, so it needs
  * CTLFLAG_VNET too; without it the OID would not be resolved against
  * the current vnet's copy of the variable.
  */
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
 	CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
 	&VNET_NAME(ipport_reservedlow), 0, "");
 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
 	CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
 
 #ifdef RATELIMIT
 /*
  * Rate-limiting statistics, exported read-only under net.inet.ip.rl.
  * These are only declared and exported here; nothing in this part of
  * the file allocates or updates them.
  */
 counter_u64_t rate_limit_new;
 counter_u64_t rate_limit_chg;
 counter_u64_t rate_limit_active;
 counter_u64_t rate_limit_alloc_fail;
 counter_u64_t rate_limit_set_ok;
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "IP Rate Limiting");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
     &rate_limit_active, "Active rate limited connections");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail, "Rate limited connection failures");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok, "Rate limited setting succeeded");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new, "Total Rate limit new attempts");
 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg, "Total Rate limited change attempts");
 
 #endif /* RATELIMIT */
 
 #endif /* INET */
 
 /*
  * Per-vnet seed variable, filled with a random value by the
  * VNET_SYSINIT hook below (SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST).
  */
 VNET_DEFINE(uint32_t, in_pcbhashseed);
 static void
 in_pcbhashseed_init(void)
 {
 
 	V_in_pcbhashseed = arc4random();
 }
 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
     in_pcbhashseed_init, 0);
 
 static void in_pcbremhash(struct inpcb *);
 
 /*
  * in_pcb.c: manage the Protocol Control Blocks.
  *
  * NOTE: It is assumed that most of these functions will be called with
  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  * functions often modify hash chains or addresses in pcbs.
  */
 
 /*
  * Allocate a zeroed load-balance group with room for 'size' inpcb slots,
  * record its identity (credential, vflag, local port, NUMA domain and
  * local address) and link it at the head of 'hdr'.  A reference is taken
  * on 'cred'.  The allocation does not sleep; NULL is returned when it
  * fails.
  */
 static struct inpcblbgroup *
 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
     u_char vflag, uint16_t port, const union in_dependaddr *addr, int size,
     uint8_t numa_domain)
 {
 	struct inpcblbgroup *newgrp;
 
 	/* The slot array is the trailing member; size up to il_inp[size]. */
 	newgrp = malloc(__offsetof(struct inpcblbgroup, il_inp[size]), M_PCB,
 	    M_ZERO | M_NOWAIT);
 	if (newgrp != NULL) {
 		newgrp->il_cred = crhold(cred);
 		newgrp->il_vflag = vflag;
 		newgrp->il_lport = port;
 		newgrp->il_numa_domain = numa_domain;
 		newgrp->il_dependladdr = *addr;
 		newgrp->il_inpsiz = size;
 		/* Publish only once every field has been filled in. */
 		CK_LIST_INSERT_HEAD(hdr, newgrp, il_list);
 	}
 	return (newgrp);
 }
 
 /*
  * Epoch callback scheduled by in_pcblbgroup_free(): recover the group
  * from its embedded epoch context, drop the credential reference and
  * release the memory.
  */
 static void
 in_pcblbgroup_free_deferred(epoch_context_t ctx)
 {
 	struct inpcblbgroup *dead;
 
 	dead = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
 
 	crfree(dead->il_cred);
 	free(dead, M_PCB);
 }
 
 /*
  * Unlink a load-balance group from its hash chain and hand it to
  * in_pcblbgroup_free_deferred() through NET_EPOCH_CALL(), so the memory
  * is released by the callback rather than here.
  */
 static void
 in_pcblbgroup_free(struct inpcblbgroup *grp)
 {
 
 	/* Unlink first so the group is off the chain before it is freed. */
 	CK_LIST_REMOVE(grp, il_list);
 	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
 }
 
 /*
  * Replace 'old_grp' with a freshly allocated group of capacity 'size'
  * carrying the same identity and the same members.  On success the old
  * group is unlinked and queued for deferred free, and the new group is
  * returned.  On allocation failure NULL is returned and the old group
  * is left untouched.
  */
 static struct inpcblbgroup *
 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
     struct inpcblbgroup *old_grp, int size)
 {
 	struct inpcblbgroup *new_grp;
 	int n;
 
 	new_grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred,
 	    old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr,
 	    size, old_grp->il_numa_domain);
 	if (new_grp == NULL)
 		return (NULL);
 
 	KASSERT(old_grp->il_inpcnt < new_grp->il_inpsiz,
 	    ("invalid new local group size %d and old local group count %d",
 	     new_grp->il_inpsiz, old_grp->il_inpcnt));
 
 	/* Copy the members first, then set the count. */
 	n = 0;
 	while (n < old_grp->il_inpcnt) {
 		new_grp->il_inp[n] = old_grp->il_inp[n];
 		n++;
 	}
 	new_grp->il_inpcnt = old_grp->il_inpcnt;
 
 	in_pcblbgroup_free(old_grp);
 	return (new_grp);
 }
 
 /*
  * Remove the PCB at index 'i' from the group: slide every later entry
  * down by one slot and decrement the member count.  If the group is
  * larger than INPCBLBGROUP_SIZMIN and is now at most a quarter full, try
  * to replace it with one of half the capacity; *grpp is updated only
  * when that resize succeeds.
  */
 static void
 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
     int i)
 {
 	struct inpcblbgroup *cur, *shrunk;
 	int slot;
 
 	cur = *grpp;
 	for (slot = i; slot + 1 < cur->il_inpcnt; slot++)
 		cur->il_inp[slot] = cur->il_inp[slot + 1];
 	cur->il_inpcnt--;
 
 	/* Nothing more to do unless the group qualifies for shrinking. */
 	if (cur->il_inpsiz <= INPCBLBGROUP_SIZMIN ||
 	    cur->il_inpcnt > cur->il_inpsiz / 4)
 		return;
 
 	shrunk = in_pcblbgroup_resize(hdr, cur, cur->il_inpsiz / 2);
 	if (shrunk != NULL)
 		*grpp = shrunk;
 }
 
 /*
  * Add PCB to load balance group for SO_REUSEPORT_LB option.
  *
  * The group is looked up by prison, vflag, local port, NUMA domain and
  * local address; it is created on first use and doubled when full.  Both
  * the inpcb write lock and the pcbinfo hash write lock must be held.
  *
  * Returns 0 on success, and also when the PCB is deliberately left out
  * (IPv4-mapped INET6 wildcard socket, or group already at
  * INPCBLBGROUP_SIZMAX); returns ENOBUFS when a group cannot be allocated.
  */
 static int
 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
 {
 	static const struct timeval interval = { 60, 0 };
 	static struct timeval lastprint;
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	uint32_t idx;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 #ifdef INET6
 	/*
 	 * Don't allow IPv4 mapped INET6 wild socket.
 	 */
 	if ((inp->inp_vflag & INP_IPV4) &&
 	    inp->inp_laddr.s_addr == INADDR_ANY &&
 	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
 		return (0);
 	}
 #endif
 
 	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
 
 	/* Find the matching group; grp is NULL if the chain is exhausted. */
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		if (grp->il_cred->cr_prison != inp->inp_cred->cr_prison)
 			continue;
 		if (grp->il_vflag != inp->inp_vflag)
 			continue;
 		if (grp->il_lport != inp->inp_lport)
 			continue;
 		if (grp->il_numa_domain != numa_domain)
 			continue;
 		if (memcmp(&grp->il_dependladdr,
 		    &inp->inp_inc.inc_ie.ie_dependladdr,
 		    sizeof(grp->il_dependladdr)) == 0)
 			break;
 	}
 
 	if (grp == NULL) {
 		/* Create new load balance group. */
 		grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag,
 		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
 		    INPCBLBGROUP_SIZMIN, numa_domain);
 		if (grp == NULL)
 			return (ENOBUFS);
 	} else if (grp->il_inpcnt == grp->il_inpsiz) {
 		/* Full: either give up at the hard cap or double the size. */
 		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
 			/* Complain at most once per 'interval'. */
 			if (ratecheck(&lastprint, &interval))
 				printf("lb group port %d, limit reached\n",
 				    ntohs(grp->il_lport));
 			return (0);
 		}
 
 		/* Expand this local group. */
 		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
 		if (grp == NULL)
 			return (ENOBUFS);
 	}
 
 	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
 	    ("invalid local group size %d and count %d", grp->il_inpsiz,
 	    grp->il_inpcnt));
 
 	/* Store the member before bumping the count. */
 	grp->il_inp[grp->il_inpcnt] = inp;
 	grp->il_inpcnt++;
 	return (0);
 }
 
 /*
  * Remove PCB from load balance group.
  *
  * Scans the chain selected by the PCB's local port for the group that
  * holds 'inp'.  The group is freed if 'inp' was its only member;
  * otherwise the slot array is compacted (and possibly shrunk).  Nothing
  * happens if the PCB is in no group.  Both the inpcb write lock and the
  * pcbinfo hash write lock must be held.
  */
 static void
 in_pcbremlbgrouphash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	int slot;
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		for (slot = 0; slot < grp->il_inpcnt; slot++) {
 			if (grp->il_inp[slot] == inp) {
 				if (grp->il_inpcnt != 1) {
 					/* Pull up inpcbs, shrink if possible. */
 					in_pcblbgroup_reorder(hdr, &grp, slot);
 				} else {
 					/* We are the last, free this group. */
 					in_pcblbgroup_free(grp);
 				}
 				return;
 			}
 		}
 	}
 }
 
 /*
  * Move a PCB that is already in a load-balance group to the group for
  * the requested NUMA domain.
  *
  * 'arg' is TCP_REUSPORT_LB_NUMA_NODOM, TCP_REUSPORT_LB_NUMA_CURDOM (the
  * domain of the current CPU) or an explicit domain in
  * [0, vm_ndomains).  The caller must hold the inpcb write lock; the
  * pcbinfo hash lock is taken and released here.
  *
  * Returns 0 on success (including when the PCB is already in the
  * requested domain), EINVAL for an out-of-range domain, ENOENT if the
  * PCB is in no load-balance group, or the error from
  * in_pcbinslbgrouphash() if re-insertion fails.  In that last case the
  * PCB has already been removed from its old group.
  */
 int
 in_pcblbgroup_numa(struct inpcb *inp, int arg)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	int err, i;
 	uint8_t numa_domain;
 
 	switch (arg) {
 	case TCP_REUSPORT_LB_NUMA_NODOM:
 		numa_domain = M_NODOM;
 		break;
 	case TCP_REUSPORT_LB_NUMA_CURDOM:
 		numa_domain = PCPU_GET(domain);
 		break;
 	default:
 		if (arg < 0 || arg >= vm_ndomains)
 			return (EINVAL);
 		numa_domain = arg;
 		break;
 	}
 
 	err = 0;
 	pcbinfo = inp->inp_pcbinfo;
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		for (i = 0; i < grp->il_inpcnt; ++i) {
 			if (grp->il_inp[i] != inp)
 				continue;
 
 			if (grp->il_numa_domain == numa_domain) {
 				goto abort_with_hash_wlock;
 			}
 
 			/* Remove it from the old group. */
 			in_pcbremlbgrouphash(inp);
 
 			/*
 			 * Add it to the new group based on numa domain.
 			 * Report a failed insertion rather than claiming
 			 * success while the PCB is in no group at all.
 			 */
 			err = in_pcbinslbgrouphash(inp, numa_domain);
 			goto abort_with_hash_wlock;
 		}
 	}
 	err = ENOENT;
 abort_with_hash_wlock:
 	INP_HASH_WUNLOCK(pcbinfo);
 	return (err);
 }
 
 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
 
 /*
  * Initialize an inpcbinfo - a per-VNET instance of connections db.
  *
  * Takes the UMA zones and lock names from 'pcbstor', initializes both
  * mutexes and the (empty) PCB list, and creates three hash tables: the
  * connection hash with 'hash_nelements' requested buckets, and the port
  * and load-balance-group hashes, which share a requested size of
  * 'porthash_nelements' capped at IPPORT_MAX + 1.
  */
 void
 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
     u_int hash_nelements, u_int porthash_nelements)
 {
 
 #ifdef VIMAGE
 	pcbinfo->ipi_vnet = curvnet;
 #endif
 
 	/* Backing storage comes from the per-protocol pcbstorage. */
 	pcbinfo->ipi_zone = pcbstor->ips_zone;
 	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
 	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
 
 	/* Locks and the global list of PCBs. */
 	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
 	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
 	    NULL, MTX_DEF);
 	CK_LIST_INIT(&pcbinfo->ipi_listhead);
 	pcbinfo->ipi_count = 0;
 
 	/* Hash tables; the two port-keyed tables share a requested size. */
 	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
 	    &pcbinfo->ipi_hashmask);
 	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
 	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_lbgrouphashmask);
 }
 
 /*
  * Destroy an inpcbinfo.
  *
  * Releases the three hash tables and both mutexes set up by
  * in_pcbinfo_init().  The caller must already have removed every PCB:
  * a non-zero ipi_count trips the assertion below.
  */
 void
 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 {
 
 	KASSERT(pcbinfo->ipi_count == 0,
 	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
 
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
 	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
 	    pcbinfo->ipi_lbgrouphashmask);
 	mtx_destroy(&pcbinfo->ipi_hash_lock);
 	mtx_destroy(&pcbinfo->ipi_lock);
 }
 
 /*
  * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
  *
  * 'arg' is the struct inpcbstorage to set up.  Two UMA zones are
  * created: an SMR-enabled, cache-aligned zone for the inpcbs themselves
  * and a zone for struct inpcbport, which is attached to the same SMR
  * state as the inpcb zone.
  */
 static void inpcb_dtor(void *, int, void *);
 static void inpcb_fini(void *, int);
 void
 in_pcbstorage_init(void *arg)
 {
 	struct inpcbstorage *pcbstor = arg;
 
 	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
 	    pcbstor->ips_size, NULL, inpcb_dtor, pcbstor->ips_pcbinit,
 	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
 	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
 	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/* Share the inpcb zone's SMR state with the port zone. */
 	uma_zone_set_smr(pcbstor->ips_portzone,
 	    uma_zone_get_smr(pcbstor->ips_zone));
 }
 
 /*
  * Destroy a pcbstorage - used by unloadable protocols.
  *
  * 'arg' is the struct inpcbstorage whose inpcb and port zones, created
  * by in_pcbstorage_init(), are to be destroyed.
  */
 void
 in_pcbstorage_destroy(void *arg)
 {
 	struct inpcbstorage *pcbstor = arg;
 
 	uma_zdestroy(pcbstor->ips_zone);
 	uma_zdestroy(pcbstor->ips_portzone);
 }
 
 /*
  * Allocate a PCB and associate it with the socket.
  * On success return with the PCB locked.
  *
  * Returns 0 on success, ENOBUFS if the zone allocation fails, or the
  * error reported by mac_inpcb_init() / ipsec_init_pcbpolicy() when those
  * subsystems are compiled in.  On failure the PCB is returned to the
  * zone and so->so_pcb is not set.
  */
 int
 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
 {
 	struct inpcb *inp;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 	int error;
 #endif
 
 	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
 	if (inp == NULL)
 		return (ENOBUFS);
 	/* Clear only the region starting at inp_start_zero. */
 	bzero(&inp->inp_start_zero, inp_zero_size);
 #ifdef NUMA
 	inp->inp_numa_domain = M_NODOM;
 #endif
 	inp->inp_pcbinfo = pcbinfo;
 	inp->inp_socket = so;
 	inp->inp_cred = crhold(so->so_cred);
 	inp->inp_inc.inc_fibnum = so->so_fibnum;
 #ifdef MAC
 	error = mac_inpcb_init(inp, M_NOWAIT);
 	if (error != 0)
 		goto out;
 	mac_inpcb_create(so, inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	error = ipsec_init_pcbpolicy(inp);
 	if (error != 0) {
 #ifdef MAC
 		/* Undo the MAC label set up above before bailing out. */
 		mac_inpcb_destroy(inp);
 #endif
 		goto out;
 	}
 #endif /*IPSEC*/
 #ifdef INET6
 	if (INP_SOCKAF(so) == AF_INET6) {
 		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
 		if (V_ip6_v6only)
 			inp->inp_flags |= IN6P_IPV6_V6ONLY;
 #ifdef INET
 		else
 			inp->inp_vflag |= INP_IPV4;
 #endif
 		if (V_ip6_auto_flowlabel)
 			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
 		inp->in6p_hops = -1;	/* use kernel default */
 	}
 #endif
 	/*
 	 * With both INET and INET6 the assignment below is the else-arm of
 	 * the AF_INET6 test above; with INET alone it is unconditional.
 	 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		inp->inp_vflag |= INP_IPV4;
 #endif
 	/*
 	 * Routes in inpcb's can cache L2 as well; they are guaranteed
 	 * to be cleaned up.
 	 */
 	inp->inp_route.ro_flags = RT_LLE_CACHE;
 	refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
 	/* Lock the new PCB before it is linked onto the global list. */
 	INP_WLOCK(inp);
 	INP_INFO_WLOCK(pcbinfo);
 	pcbinfo->ipi_count++;
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
 	INP_INFO_WUNLOCK(pcbinfo);
 	so->so_pcb = inp;
 
 	return (0);
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
 out:
 	/*
 	 * NOTE(review): the credential reference taken above is not dropped
 	 * here; presumably the zone destructor (inpcb_dtor) does that --
 	 * confirm.
 	 */
 	uma_zfree_smr(pcbinfo->ipi_zone, inp);
 	return (error);
 #endif
 }
 
 #ifdef INET
 /*
  * Bind an unbound IPv4 PCB to the address/port in 'nam' (which may be
  * NULL) and insert it into the hash.  The caller holds both the inpcb
  * write lock and the pcbinfo hash write lock.
  *
  * Returns EINVAL if the PCB already has a local port or address, the
  * error from in_pcbbind_setup() if address selection fails, or EAGAIN
  * if hash insertion fails (the local address and port are reset in that
  * case).  INP_ANONPORT is set when no explicit port was requested.
  */
 int
 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 {
 	int anonport, error;
 
 	KASSERT(nam == NULL || nam->sa_family == AF_INET,
 	    ("%s: invalid address family for %p", __func__, nam));
 	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
 	    ("%s: invalid address length for %p", __func__, nam));
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	/* Only a completely unbound PCB may be bound. */
 	if (!(inp->inp_lport == 0 && inp->inp_laddr.s_addr == INADDR_ANY))
 		return (EINVAL);
 
 	/* Remember whether the caller left the port choice to us. */
 	if (nam == NULL)
 		anonport = 1;
 	else
 		anonport = (((struct sockaddr_in *)nam)->sin_port == 0);
 
 	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
 	    &inp->inp_lport, cred);
 	if (error != 0)
 		return (error);
 
 	if (in_pcbinshash(inp) != 0) {
 		/* Roll back to the unbound state. */
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		return (EAGAIN);
 	}
 
 	if (anonport != 0)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
 }
 #endif
 
 #if defined(INET) || defined(INET6)
 /*
  * Assign a local port like in_pcb_lport(), but also used with connect()
  * and a foreign address and port.  If fsa is non-NULL, choose a local port
  * that is unused with those, otherwise one that is completely unused.
  * lsa can be NULL for IPv6.
  *
  * The candidate range is picked from the high, low (privileged) or
  * default sysctl bounds according to the PCB's INP_HIGHPORT / INP_LOWPORT
  * flags.  On success the chosen port is stored, in network byte order,
  * through lportp and 0 is returned.  Returns the priv_check_cred() error
  * if the low range is requested without privilege, or EADDRNOTAVAIL when
  * every port in the range is in use.
  */
 int
 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
 {
 	struct inpcbinfo *pcbinfo;
 	struct inpcb *tmpinp;
 	unsigned short *lastport;
 	int count, error;
 	u_short aux, first, last, lport;
 #ifdef INET
 	struct in_addr laddr, faddr;
 #endif
 #ifdef INET6
 	struct in6_addr *laddr6, *faddr6;
 #endif
 
 	pcbinfo = inp->inp_pcbinfo;
 
 	/*
 	 * Because no actual state changes occur here, a global write lock on
 	 * the pcbinfo isn't required.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if (inp->inp_flags & INP_HIGHPORT) {
 		first = V_ipport_hifirstauto;	/* sysctl */
 		last  = V_ipport_hilastauto;
 		lastport = &pcbinfo->ipi_lasthi;
 	} else if (inp->inp_flags & INP_LOWPORT) {
 		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
 		if (error)
 			return (error);
 		first = V_ipport_lowfirstauto;	/* 1023 */
 		last  = V_ipport_lowlastauto;	/* 600 */
 		lastport = &pcbinfo->ipi_lastlow;
 	} else {
 		first = V_ipport_firstauto;	/* sysctl */
 		last  = V_ipport_lastauto;
 		lastport = &pcbinfo->ipi_lastport;
 	}
 
 	/*
 	 * Instead of having two loops further down counting up or down
 	 * make sure that first is always <= last and go with only one
 	 * code path implementing all logic.
 	 */
 	if (first > last) {
 		aux = first;
 		first = last;
 		last = aux;
 	}
 
 #ifdef INET
 	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
 	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
 		if (lsa != NULL)
 			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
 		if (fsa != NULL)
 			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
 	}
 #endif
 #ifdef INET6
 	laddr6 = NULL;
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		if (lsa != NULL)
 			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
 		if (fsa != NULL)
 			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
 	}
 #endif
 
 	tmpinp = NULL;
 	lport = *lportp;
 
 	/*
 	 * Pick a random starting point within the range.  A single-port
 	 * range (first == last) has nothing to randomize, and the modulus
 	 * would be zero, so skip it; the loop below then clamps *lastport
 	 * to 'first' on its own.
 	 */
 	if (V_ipport_randomized && first != last)
 		*lastport = first + (arc4random() % (last - first));
 
 	count = last - first;
 
 	do {
 		if (count-- < 0)	/* completely used? */
 			return (EADDRNOTAVAIL);
 		++*lastport;
 		if (*lastport < first || *lastport > last)
 			*lastport = first;
 		lport = htons(*lastport);
 
 		if (fsa != NULL) {
 			/*
 			 * NOTE(review): lsa is dereferenced here although the
 			 * header comment allows it to be NULL; callers that
 			 * pass fsa presumably always pass lsa -- confirm.
 			 */
 #ifdef INET
 			if (lsa->sa_family == AF_INET) {
 				tmpinp = in_pcblookup_hash_locked(pcbinfo,
 				    faddr, fport, laddr, lport, lookupflags,
 				    NULL, M_NODOM);
 			}
 #endif
 #ifdef INET6
 			if (lsa->sa_family == AF_INET6) {
 				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
 				    faddr6, fport, laddr6, lport, lookupflags,
 				    NULL, M_NODOM);
 			}
 #endif
 		} else {
 #ifdef INET6
 			if ((inp->inp_vflag & INP_IPV6) != 0) {
 				tmpinp = in6_pcblookup_local(pcbinfo,
 				    &inp->in6p_laddr, lport, lookupflags, cred);
 #ifdef INET
 				if (tmpinp == NULL &&
 				    (inp->inp_vflag & INP_IPV4))
 					tmpinp = in_pcblookup_local(pcbinfo,
 					    laddr, lport, lookupflags, cred);
 #endif
 			}
 #endif
 #if defined(INET) && defined(INET6)
 			else
 #endif
 #ifdef INET
 				tmpinp = in_pcblookup_local(pcbinfo, laddr,
 				    lport, lookupflags, cred);
 #endif
 		}
 	} while (tmpinp != NULL);
 
 	*lportp = lport;
 
 	return (0);
 }
 
 /*
  * Select a local port (number) to use.
  *
  * Convenience wrapper around in_pcb_lport_dest() for callers that have
  * no foreign address: an optional local address is wrapped in a zeroed
  * AF_INET sockaddr_in, and NULL/0 is passed for the foreign side.
  */
 int
 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
     struct ucred *cred, int lookupflags)
 {
 	struct sockaddr_in sin;
 	struct sockaddr *lsa;
 
 	lsa = NULL;
 	if (laddrp != NULL) {
 		bzero(&sin, sizeof(sin));
 		sin.sin_family = AF_INET;
 		sin.sin_addr = *laddrp;
 		lsa = (struct sockaddr *)&sin;
 	}
 
 	return (in_pcb_lport_dest(inp, lsa, lportp, NULL, 0, cred,
 	    lookupflags));
 }
 
 /*
  * Rebuild the SO_REUSE* socket option bits from the copies cached in
  * inp_flags2.  Only SO_REUSEPORT_LB, SO_REUSEPORT and SO_REUSEADDR are
  * reported.
  */
 int
 inp_so_options(const struct inpcb *inp)
 {
 	int opts = 0;
 
 	if (inp->inp_flags2 & INP_REUSEPORT_LB)
 		opts |= SO_REUSEPORT_LB;
 	if (inp->inp_flags2 & INP_REUSEPORT)
 		opts |= SO_REUSEPORT;
 	if (inp->inp_flags2 & INP_REUSEADDR)
 		opts |= SO_REUSEADDR;
 
 	return (opts);
 }
 #endif /* INET || INET6 */
 
 /*
  * Decide whether a new socket may share a binding with an existing one
  * under the BINDMULTI rules.
  *
  * ni is the new inp, oi the existing one.
  *
  * Returns 1 when the bind is acceptable: either ni does not request
  * INP_BINDMULTI at all, or it does and oi has INP_BINDMULTI set too and
  * both inps have the same cr_uid.  Returns 0 otherwise.
  */
 int
 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
 {
 	/* Nothing to verify unless the new inp asks for BINDMULTI. */
 	if ((ni->inp_flags2 & INP_BINDMULTI) == 0)
 		return (1);
 
 	/* Both sockets must belong to the same user. */
 	if (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid)
 		return (0);
 
 	/* The existing inp must have been bound with BINDMULTI as well. */
 	if ((oi->inp_flags2 & INP_BINDMULTI) == 0)
 		return (0);
 
 	return (1);
 }
 
 #ifdef INET
 /*
  * Set up a bind operation on a PCB, performing port allocation
  * as required, but do not actually modify the PCB. Callers can
  * either complete the bind by setting inp_laddr/inp_lport and
  * calling in_pcbinshash(), or they can just use the resulting
  * port and address to authorise the sending of a once-off packet.
  *
  * nam, when non-NULL, must be an AF_INET sockaddr_in of the right length
  * (asserted below).  Note that nam itself may be written to: for a specific
  * non-multicast address, sin_port and sin_zero are cleared before the
  * local-address check.
  *
  * *laddrp and *lportp hold the current local address/port on entry and
  * receive the chosen values on success.
  *
  * Returns 0 on success; otherwise EINVAL, EACCES, EADDRNOTAVAIL,
  * EADDRINUSE, or whatever prison_local_ip4() / in_pcb_lport() returned.
  *
  * On error, the values of *laddrp and *lportp are not changed.
  */
 int
 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
     u_short *lportp, struct ucred *cred)
 {
 	struct socket *so = inp->inp_socket;
 	struct sockaddr_in *sin;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct in_addr laddr;
 	u_short lport = 0;
 	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
 	int error;
 
 	/*
 	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
 	 * so that we don't have to add to the (already messy) code below.
 	 */
 	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
 
 	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	laddr.s_addr = *laddrp;
 	/* An address may not be supplied if a local address is already set. */
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
 	/* With no reuse option set, the lookups below get INPLOOKUP_WILDCARD. */
 	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		/* No address given: only run the current laddr past the jail. */
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
 			return (error);
 	} else {
 		sin = (struct sockaddr_in *)nam;
 		KASSERT(sin->sin_family == AF_INET,
 		    ("%s: invalid family for address %p", __func__, sin));
 		KASSERT(sin->sin_len == sizeof(*sin),
 		    ("%s: invalid length for address %p", __func__, sin));
 
 		error = prison_local_ip4(cred, &sin->sin_addr);
 		if (error)
 			return (error);
 		if (sin->sin_port != *lportp) {
 			/* Don't allow the port to change. */
 			if (*lportp != 0)
 				return (EINVAL);
 			lport = sin->sin_port;
 		}
 		/* NB: lport is left as 0 if the port isn't being changed. */
 		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
 			/*
 			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
 			 * allow complete duplication of binding if
 			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
 			 * and a multicast address is bound on both
 			 * new and duplicated sockets.
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
 			/*
 			 * XXX: How to deal with SO_REUSEPORT_LB here?
 			 * Treat same as SO_REUSEPORT for now.
 			 */
 			if ((so->so_options &
 			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
 				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			/* Normalize nam so only family/len/address remain set. */
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
 			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
 			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
 		/* A specific port was requested: check privilege and conflicts. */
 		if (lport) {
 			struct inpcb *t;
 
 			/* GROSS */
 			/* Ports inside the reserved range need a privilege. */
 			if (ntohs(lport) <= V_ipport_reservedhigh &&
 			    ntohs(lport) >= V_ipport_reservedlow &&
 			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
 				return (EACCES);
 			/*
 			 * Cross-user check: done only for non-multicast
 			 * addresses and only when the inp's credential lacks
 			 * PRIV_NETINET_REUSEPORT.
 			 */
 			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
 			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
 				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 				    lport, INPLOOKUP_WILDCARD, cred);
 	/*
 	 * XXX
 	 * This entire block sorely needs a rewrite.
 	 */
 				if (t &&
 				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 				    (so->so_type != SOCK_STREAM ||
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 				     (t->inp_flags2 & INP_REUSEPORT) ||
 				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
 
 				/*
 				 * If the socket is a BINDMULTI socket, then
 				 * the credentials need to match and the
 				 * original socket also has to have been bound
 				 * with BINDMULTI.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 			/*
 			 * General conflict check, using the lookup flags derived
 			 * from the socket's reuse options above.
 			 */
 			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 			    lport, lookupflags, cred);
 			if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 			    (reuseport & inp_so_options(t)) == 0 &&
 			    (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
 				/*
 				 * With INET6 the EADDRINUSE below is skipped only
 				 * when both addresses are INADDR_ANY and both
 				 * pcbs have INP_IPV6PROTO set.
 				 */
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
 				    ntohl(t->inp_laddr.s_addr) !=
 				    INADDR_ANY ||
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
 						return (EADDRINUSE);
 				/*
 				 * NOTE(review): this branch is entered only when
 				 * inp lacks INP_BINDMULTI, in which case
 				 * in_pcbbind_check_bindmulti() returns 1, so this
 				 * check looks unreachable - confirm.
 				 */
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
 		}
 	}
 	/* Keep an already assigned port; otherwise have one selected. */
 	if (*lportp != 0)
 		lport = *lportp;
 	if (lport == 0) {
 		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 		if (error != 0)
 			return (error);
 	}
 	/* Success: only now publish the results to the caller. */
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	return (0);
 }
 
 /*
  * Connect a socket's pcb to the address given in nam.
  * nam must carry both an address and a port.  When the pcb has no local
  * address/port yet, in_pcbconnect_setup() picks them and they are
  * installed here.
  *
  * The caller holds the inpcb write lock and the pcbinfo hash write lock.
  * rehash selects in_pcbrehash() (true) or in_pcbinshash() (false) for the
  * final hash update; an entirely unbound pcb requires rehash == true.
  *
  * Returns 0 on success, the error from in_pcbconnect_setup(), or EAGAIN
  * when the initial hash insertion fails (the pcb is then left unbound).
  */
 int
 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
     bool rehash)
 {
 	in_addr_t new_laddr, new_faddr;
 	u_short new_lport, new_fport;
 	int error, was_anon;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	new_lport = inp->inp_lport;
 	new_laddr = inp->inp_laddr.s_addr;
 	/* Remember whether the port had to be chosen for the caller. */
 	was_anon = (new_lport == 0);
 
 	error = in_pcbconnect_setup(inp, nam, &new_laddr, &new_lport,
 	    &new_faddr, &new_fport, NULL, cred);
 	if (error != 0)
 		return (error);
 
 	/* A completely unbound pcb gets its local binding installed first. */
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		KASSERT(rehash == true,
 		    ("Rehashing required for unbound inps"));
 		inp->inp_lport = new_lport;
 		inp->inp_laddr.s_addr = new_laddr;
 		if (in_pcbinshash(inp) != 0) {
 			/* Roll the local binding back before failing. */
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
 		}
 	}
 
 	/* Install the full 4-tuple and update the hash accordingly. */
 	inp->inp_lport = new_lport;
 	inp->inp_laddr.s_addr = new_laddr;
 	inp->inp_faddr.s_addr = new_faddr;
 	inp->inp_fport = new_fport;
 	if (rehash)
 		in_pcbrehash(inp);
 	else
 		in_pcbinshash(inp);
 
 	if (was_anon)
 		inp->inp_flags |= INP_ANONPORT;
 
 	return (0);
 }
 
 /*
  * Do proper source address selection on an unbound socket in case
  * of connect. Take jails into account as well.
  *
  * inp   - pcb being connected; its socket options and FIB numbers are read.
  * faddr - destination address.
  * laddr - out: the selected source address (must not be NULL).
  * cred  - credential used for the jail (prison_*) checks.
  *
  * Returns 0 on success (including the early return when
  * prison_saddrsel_ip4() returns 0), ENETUNREACH when no suitable interface
  * address could be found, or the result of prison_get_ip4() when the jail's
  * default address is used as the last resort.
  *
  * Must be called inside the network epoch.
  */
 int
 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
     struct ucred *cred)
 {
 	struct ifaddr *ifa;
 	struct sockaddr *sa;
 	struct sockaddr_in *sin, dst;
 	struct nhop_object *nh;
 	int error;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (!prison_saddrsel_ip4(cred, laddr))
 		return (0);
 
 	error = 0;
 
 	/* Build a sockaddr_in for the destination; used by the ifa lookups. */
 	nh = NULL;
 	bzero(&dst, sizeof(dst));
 	sin = &dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = faddr->s_addr;
 
 	/*
 	 * If route is known our src addr is taken from the i/f,
 	 * else punt.
 	 *
 	 * Find out route to destination.
 	 */
 	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
 		    0, NHR_NONE, 0);
 
 	/*
 	 * If we found a route, use the address corresponding to
 	 * the outgoing interface.
 	 *
 	 * Otherwise assume faddr is reachable on a directly connected
 	 * network and try to find a corresponding interface to take
 	 * the source address from.
 	 */
 	if (nh == NULL || nh->nh_ifp == NULL) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* Try a destination-address match first, then a network match. */
 		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL) {
 			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 						inp->inp_socket->so_fibnum));
 		}
 		if (ia == NULL) {
 			error = ENETUNREACH;
 			goto done;
 		}
 
 		/* Not restricted by PR_IP4: take the address that was found. */
 		if (!prison_flag(cred, PR_IP4)) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * Jailed: use the first AF_INET address on that interface
 		 * that passes prison_check_ip4().
 		 */
 		ifp = ia->ia_ifp;
 		ia = NULL;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * If the outgoing interface on the route found is not
 	 * a loopback interface, use the address from that interface.
 	 * In case of jails do those three steps:
 	 * 1. check if the interface address belongs to the jail. If so use it.
 	 * 2. check if we have any address on the outgoing interface
 	 *    belonging to this jail. If so use it.
 	 * 3. as a last resort return the 'default' jail address.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
 		struct in_ifaddr *ia;
 		struct ifnet *ifp;
 
 		/* If not jailed, use the default returned. */
 		if (!prison_flag(cred, PR_IP4)) {
 			ia = (struct in_ifaddr *)nh->nh_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		/* 1. Check if the iface address belongs to the jail. */
 		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
 		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 			ia = (struct in_ifaddr *)nh->nh_ifa;
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/*
 		 * 2. Check if we have any address on the outgoing interface
 		 *    belonging to this jail.
 		 */
 		ia = NULL;
 		ifp = nh->nh_ifp;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			sin = (struct sockaddr_in *)sa;
 			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 				ia = (struct in_ifaddr *)ifa;
 				break;
 			}
 		}
 		if (ia != NULL) {
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 	/*
 	 * The outgoing interface is marked with 'loopback net', so a route
 	 * to ourselves is here.
 	 * Try to find the interface of the destination address and then
 	 * take the address from there. That interface is not necessarily
 	 * a loopback interface.
 	 * In case of jails, check that it is an address of the jail
 	 * and if we cannot find, fall back to the 'default' jail address.
 	 *
 	 * Every path through the non-loopback branch above ends in
 	 * 'goto done', so the condition below always holds when reached.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		struct in_ifaddr *ia;
 
 		/*
 		 * Use 'dst' rather than 'sin' here; 'sin' is reassigned by
 		 * the interface walks in this function.
 		 */
 		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
 					inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
 						inp->inp_socket->so_fibnum));
 		if (ia == NULL)
 			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
 
 		if (!prison_flag(cred, PR_IP4)) {
 			if (ia == NULL) {
 				error = ENETUNREACH;
 				goto done;
 			}
 			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 			goto done;
 		}
 
 		/* Jailed. */
 		if (ia != NULL) {
 			struct ifnet *ifp;
 
 			/* First jail-owned AF_INET address on that interface. */
 			ifp = ia->ia_ifp;
 			ia = NULL;
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				sa = ifa->ifa_addr;
 				if (sa->sa_family != AF_INET)
 					continue;
 				sin = (struct sockaddr_in *)sa;
 				if (prison_check_ip4(cred,
 				    &sin->sin_addr) == 0) {
 					ia = (struct in_ifaddr *)ifa;
 					break;
 				}
 			}
 			if (ia != NULL) {
 				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 				goto done;
 			}
 		}
 
 		/* 3. As a last resort return the 'default' jail address. */
 		error = prison_get_ip4(cred, laddr);
 		goto done;
 	}
 
 done:
 	return (error);
 }
 
 /*
  * Set up for a connect from a socket to the specified address.
  * On entry, *laddrp and *lportp should contain the current local
  * address and port for the PCB; these are updated to the values
  * that should be placed in inp_laddr and inp_lport to complete
  * the connect.
  *
  * On success, *faddrp and *fportp will be set to the remote address
  * and port. These are not updated in the error case.
  *
  * If the operation fails because the connection already exists,
  * *oinpp will be set to the PCB of that connection so that the
  * caller can decide to override it. In all other cases, *oinpp
  * is set to NULL.  oinpp itself may be NULL, in which case nothing
  * is reported through it.
  *
  * Returns 0 on success; EADDRNOTAVAIL when nam has port 0 or when the
  * selected multicast interface has no usable address; EADDRINUSE when the
  * resulting 4-tuple already exists; otherwise the error returned by
  * prison_get_ip4(), in_pcbladdr() or in_pcb_lport_dest().
  */
 int
 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
     struct inpcb **oinpp, struct ucred *cred)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	struct in_ifaddr *ia;
 	struct inpcb *oinp;
 	struct in_addr laddr, faddr;
 	u_short lport, fport;
 	int error;
 
 	KASSERT(sin->sin_family == AF_INET,
 	    ("%s: invalid address family for %p", __func__, sin));
 	KASSERT(sin->sin_len == sizeof(*sin),
 	    ("%s: invalid address length for %p", __func__, sin));
 
 	/*
 	 * Because a global state change doesn't actually occur here, a read
 	 * lock is sufficient.
 	 */
 	NET_EPOCH_ASSERT();
 	INP_LOCK_ASSERT(inp);
 	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 
 	if (oinpp != NULL)
 		*oinpp = NULL;
 	/* A foreign port of 0 cannot be connected to. */
 	if (sin->sin_port == 0)
 		return (EADDRNOTAVAIL);
 	/* Work on local copies; results are stored only on success. */
 	laddr.s_addr = *laddrp;
 	lport = *lportp;
 	faddr = sin->sin_addr;
 	fport = sin->sin_port;
 #ifdef ROUTE_MPATH
 	if (CALC_FLOWID_OUTBOUND) {
 		uint32_t hash_val, hash_type;
 
 		/*
 		 * The hash is computed from the addresses as they stand at
 		 * this point, i.e. before the faddr/laddr adjustments below.
 		 * Note this stores into the inp.
 		 */
 		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
 		    inp->inp_socket->so_proto->pr_protocol, &hash_type);
 
 		inp->inp_flowid = hash_val;
 		inp->inp_flowtype = hash_type;
 	}
 #endif
 	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 		/*
 		 * If the destination address is INADDR_ANY,
 		 * use the primary local address.
 		 * If the supplied address is INADDR_BROADCAST,
 		 * and the primary interface supports broadcast,
 		 * choose the broadcast address for that interface.
 		 */
 		if (faddr.s_addr == INADDR_ANY) {
 			faddr =
 			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 			if ((error = prison_get_ip4(cred, &faddr)) != 0)
 				return (error);
 		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 			    IFF_BROADCAST)
 				faddr = satosin(&CK_STAILQ_FIRST(
 				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 		}
 	}
 	if (laddr.s_addr == INADDR_ANY) {
 		/*
 		 * The result of in_pcbladdr() is examined only after the
 		 * multicast override below, which may replace it.
 		 */
 		error = in_pcbladdr(inp, &faddr, &laddr, cred);
 		/*
 		 * If the destination address is multicast and an outgoing
 		 * interface has been set as a multicast option, prefer the
 		 * address of that interface as our source address.
 		 */
 		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 		    inp->inp_moptions != NULL) {
 			struct ip_moptions *imo;
 			struct ifnet *ifp;
 
 			imo = inp->inp_moptions;
 			if (imo->imo_multicast_ifp != NULL) {
 				ifp = imo->imo_multicast_ifp;
 				/*
 				 * First address on ifp that passes
 				 * prison_check_ip4(); ia is NULL after the
 				 * loop when none was found.
 				 */
 				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 					if (ia->ia_ifp == ifp &&
 					    prison_check_ip4(cred,
 					    &ia->ia_addr.sin_addr) == 0)
 						break;
 				}
 				if (ia == NULL)
 					error = EADDRNOTAVAIL;
 				else {
 					laddr = ia->ia_addr.sin_addr;
 					error = 0;
 				}
 			}
 		}
 		if (error)
 			return (error);
 	}
 
 	if (lport != 0) {
 		/* Local port already known: the full 4-tuple must be unused. */
 		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
 		    fport, laddr, lport, 0, NULL, M_NODOM);
 		if (oinp != NULL) {
 			if (oinpp != NULL)
 				*oinpp = oinp;
 			return (EADDRINUSE);
 		}
 	} else {
 		struct sockaddr_in lsin, fsin;
 
 		/*
 		 * No local port yet: have in_pcb_lport_dest() select one,
 		 * passing both the local and the foreign endpoint.
 		 */
 		bzero(&lsin, sizeof(lsin));
 		bzero(&fsin, sizeof(fsin));
 		lsin.sin_family = AF_INET;
 		lsin.sin_addr = laddr;
 		fsin.sin_family = AF_INET;
 		fsin.sin_addr = faddr;
 		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
 		    &lport, (struct sockaddr *)& fsin, fport, cred,
 		    INPLOOKUP_WILDCARD);
 		if (error)
 			return (error);
 	}
 	/* Success: publish the selected endpoints to the caller. */
 	*laddrp = laddr.s_addr;
 	*lportp = lport;
 	*faddrp = faddr.s_addr;
 	*fportp = fport;
 	return (0);
 }
 
 /*
  * Undo a connect: reset the foreign address to INADDR_ANY and the foreign
  * port to 0, then rehash the pcb.  The caller holds the inpcb write lock
  * and the pcbinfo hash write lock.
  */
 void
 in_pcbdisconnect(struct inpcb *inp)
 {
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
 
 	in_pcbrehash(inp);
 }
 #endif /* INET */
 
 /*
  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
  * For most protocols, this will be invoked immediately prior to calling
  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
  * socket, in which case in_pcbfree() is deferred.
  */
 void
 in_pcbdetach(struct inpcb *inp)
 {
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
 #ifdef RATELIMIT
 	if (inp->inp_snd_tag != NULL)
 		in_pcbdetach_txrtlmt(inp);
 #endif
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
 
 /*
  * inpcb hash lookups are protected by SMR section.
  *
  * Once desired pcb has been found, switching from SMR section to a pcb
  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
  * here because SMR is a critical section.
  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
  */
 /*
  * Acquire the inpcb lock in the mode selected by 'lock': a read lock for
  * INPLOOKUP_RLOCKPCB, a write lock for anything else.
  */
 static inline void
 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
 {
 	if (lock == INPLOOKUP_RLOCKPCB)
 		rw_rlock(&inp->inp_lock);
 	else
 		rw_wlock(&inp->inp_lock);
 }
 
 /*
  * Release the inpcb lock in the mode selected by 'lock': a read unlock for
  * INPLOOKUP_RLOCKPCB, a write unlock for anything else.
  */
 static inline void
 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
 {
 	if (lock == INPLOOKUP_RLOCKPCB)
 		rw_runlock(&inp->inp_lock);
 	else
 		rw_wunlock(&inp->inp_lock);
 }
 
 /*
  * Try to take the inpcb lock in the mode selected by 'lock' and return the
  * result of the underlying rw_try_rlock() / rw_try_wlock() call.
  */
 static inline int
 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
 {
 	if (lock == INPLOOKUP_RLOCKPCB)
 		return (rw_try_rlock(&inp->inp_lock));
 
 	return (rw_try_wlock(&inp->inp_lock));
 }
 
 /*
  * Drop a reference on a locked inpcb, dispatching to the read- or
  * write-locked variant according to 'lock'.  Returns what that variant
  * returns.
  */
 static inline bool
 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
 {
 	if (lock == INPLOOKUP_RLOCKPCB)
 		return (in_pcbrele_rlocked(inp));
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Switch from the SMR section to holding the inpcb lock.
  *
  * Must be called inside the pcbinfo's SMR section (asserted); every path
  * exits that section before returning.  'lock' must be INPLOOKUP_RLOCKPCB
  * or INPLOOKUP_WLOCKPCB.  'ignflags' is a mask of inp_flags bits: an inpcb
  * with any of them set is rejected.
  *
  * Returns true with the inpcb locked in the requested mode, or false with
  * the inpcb not locked by us.
  */
 static inline bool
 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
 {
 
 	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
 	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
 
 	/* Fast path: the lock is free and can be taken without blocking. */
 	if (__predict_true(inp_trylock(inp, lock))) {
 		if (__predict_false(inp->inp_flags & ignflags)) {
 			smr_exit(inp->inp_pcbinfo->ipi_smr);
 			inp_unlock(inp, lock);
 			return (false);
 		}
 		smr_exit(inp->inp_pcbinfo->ipi_smr);
 		return (true);
 	}
 
 	/*
 	 * Slow path: take a reference first, leave the SMR section, and only
 	 * then take the lock with the blocking call.
 	 */
 	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
 		smr_exit(inp->inp_pcbinfo->ipi_smr);
 		inp_lock(inp, lock);
 		/*
 		 * Drop the temporary reference.  A true return means it was
 		 * the last one and the inpcb has been unlocked and freed.
 		 */
 		if (__predict_false(in_pcbrele(inp, lock)))
 			return (false);
 		/*
 		 * An inp acquired through refcount & lock definitely has not
 		 * gone through uma_zfree().  However, it may already have
 		 * gone through in_pcbfree() and still have another reference
 		 * that prevented its release by our in_pcbrele().
 		 */
 		if (__predict_false(inp->inp_flags & ignflags)) {
 			inp_unlock(inp, lock);
 			return (false);
 		}
 		return (true);
 	} else {
 		/* The reference count was already zero: treat as not found. */
 		smr_exit(inp->inp_pcbinfo->ipi_smr);
 		return (false);
 	}
 }
 
 /*
  * Public wrapper around _inp_smr_lock() used by the lookup code: both
  * freed and dropped inpcbs are rejected.
  */
 bool
 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
 {
 	/*
 	 * in_pcblookup() family of functions ignore not only freed entries,
 	 * that may be found due to lockless access to the hash, but dropped
 	 * entries, too.
 	 */
 	const int ignflags = INP_FREED | INP_DROPPED;
 
 	return (_inp_smr_lock(inp, lock, ignflags));
 }
 
 /*
  * inp_next() - inpcb hash/list traversal iterator
  *
  * Requires initialized struct inpcb_iterator for context.
  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
  *
  * - Iterator can have either write-lock or read-lock semantics, that can not
  *   be changed later.
  * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
  *   a single hash slot.  Note: only rip_input() does the latter.
  * - Iterator may have optional bool matching function.  The matching function
  *   will be executed for each inpcb in the SMR context, so it can not acquire
  *   locks and can safely access only immutable fields of inpcb.
  *
  * A fresh initialized iterator has NULL inpcb in its context and that
  * means that inp_next() call would return the very first inpcb on the list
  * locked with desired semantic.  In all following calls the context pointer
  * shall hold the current inpcb pointer.  The KPI user is not supposed to
  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
  * and write NULL to its context.  After end of traversal an iterator can be
  * reused.
  *
  * List traversals have the following features/constraints:
  * - New entries won't be seen, as they are always added to the head of a list.
  * - Removed entries won't stop traversal as long as they are not added to
  *   a different list. This is violated by in_pcbrehash().
  *
  * Freed inpcbs (INP_FREED) are skipped; dropped ones are not filtered here.
  */
 /* First element of the all-pcbs list or of the given hash slot. */
 #define	II_LIST_FIRST(ipi, hash)					\
 		(((hash) == INP_ALL_LIST) ?				\
 		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
 		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
 /* Successor of inp on whichever list is being traversed. */
 #define	II_LIST_NEXT(inp, hash)						\
 		(((hash) == INP_ALL_LIST) ?				\
 		    CK_LIST_NEXT((inp), inp_list) :			\
 		    CK_LIST_NEXT((inp), inp_hash))
 /* Assert that inp is locked in the mode the iterator was created with. */
 #define	II_LOCK_ASSERT(inp, lock)					\
 		rw_assert(&(inp)->inp_lock,				\
 		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
 struct inpcb *
 inp_next(struct inpcb_iterator *ii)
 {
 	const struct inpcbinfo *ipi = ii->ipi;
 	inp_match_t *match = ii->match;
 	void *ctx = ii->ctx;
 	inp_lookup_t lock = ii->lock;
 	int hash = ii->hash;
 	struct inpcb *inp;
 
 	if (ii->inp == NULL) {		/* First call. */
 		smr_enter(ipi->ipi_smr);
 		/* This is unrolled CK_LIST_FOREACH(). */
 		for (inp = II_LIST_FIRST(ipi, hash);
 		    inp != NULL;
 		    inp = II_LIST_NEXT(inp, hash)) {
 			if (match != NULL && (match)(inp, ctx) == false)
 				continue;
 			/* On success the inp is locked and SMR has been exited. */
 			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
 				break;
 			else {
 				/*
 				 * _inp_smr_lock() left the SMR section, so
 				 * re-enter it and go back to the list head.
 				 *
 				 * NOTE(review): the for-loop increment runs
 				 * after this assignment, so the scan appears
 				 * to resume at the element after the new
 				 * head - confirm this is intended.
 				 */
 				smr_enter(ipi->ipi_smr);
 				MPASS(inp != II_LIST_FIRST(ipi, hash));
 				inp = II_LIST_FIRST(ipi, hash);
 				if (inp == NULL)
 					break;
 			}
 		}
 
 		/* An empty or exhausted list leaves us still inside SMR. */
 		if (inp == NULL)
 			smr_exit(ipi->ipi_smr);
 		else
 			ii->inp = inp;
 
 		return (inp);
 	}
 
 	/* Not a first call. */
 	smr_enter(ipi->ipi_smr);
 restart:
 	/* ii->inp is still locked by us and anchors the traversal. */
 	inp = ii->inp;
 	II_LOCK_ASSERT(inp, lock);
 next:
 	inp = II_LIST_NEXT(inp, hash);
 	if (inp == NULL) {
 		/* End of list: 'found' stores NULL into the context. */
 		smr_exit(ipi->ipi_smr);
 		goto found;
 	}
 
 	if (match != NULL && (match)(inp, ctx) == false)
 		goto next;
 
 	if (__predict_true(inp_trylock(inp, lock))) {
 		if (__predict_false(inp->inp_flags & INP_FREED)) {
 			/*
 			 * Entries are never inserted in middle of a list, thus
 			 * as long as we are in SMR, we can continue traversal.
 			 * Jump to 'restart' should yield in the same result,
 			 * but could produce unnecessary looping.  Could this
 			 * looping be unbound?
 			 */
 			inp_unlock(inp, lock);
 			goto next;
 		} else {
 			smr_exit(ipi->ipi_smr);
 			goto found;
 		}
 	}
 
 	/*
 	 * Can't obtain lock immediately, thus going hard.  Once we exit the
 	 * SMR section we can no longer jump to 'next', and our only stable
 	 * anchoring point is ii->inp, which we keep locked for this case, so
 	 * we jump to 'restart'.
 	 */
 	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
 		smr_exit(ipi->ipi_smr);
 		inp_lock(inp, lock);
 		/* A true return means the inp was unlocked and freed. */
 		if (__predict_false(in_pcbrele(inp, lock))) {
 			smr_enter(ipi->ipi_smr);
 			goto restart;
 		}
 		/*
 		 * See comment in inp_smr_lock().
 		 */
 		if (__predict_false(inp->inp_flags & INP_FREED)) {
 			inp_unlock(inp, lock);
 			smr_enter(ipi->ipi_smr);
 			goto restart;
 		}
 	} else
 		goto next;
 
 found:
 	/* Release the previous current inp and advance the context. */
 	inp_unlock(ii->inp, lock);
 	ii->inp = inp;
 
 	return (ii->inp);
 }
 
 /*
  * in_pcbref() bumps the reference count on an inpcb in order to maintain
  * stability of an inpcb pointer despite the inpcb lock being released or
  * SMR section exited.
  *
  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
  */
 void
 in_pcbref(struct inpcb *inp)
 {
 	u_int old __diagused;
 
 	old = refcount_acquire(&inp->inp_refcount);
 	KASSERT(old > 0, ("%s: refcount 0", __func__));
 }
 
 /*
  * Drop a reference taken with in_pcbref() on a read-locked inpcb.
  *
  * Returns false when other references remain; the inpcb stays locked.
  * Returns true when this was the last reference: the inpcb has then been
  * unlocked and handed to uma_zfree_smr(), and must not be touched again.
  */
 bool
 in_pcbrele_rlocked(struct inpcb *inp)
 {
 	INP_RLOCK_ASSERT(inp);
 
 	if (!refcount_release(&inp->inp_refcount))
 		return (false);
 
 	/* Last reference: the pcb must already have been through in_pcbfree(). */
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
 	MPASS(inp->inp_in_hpts == 0);
 
 	INP_RUNLOCK(inp);
 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 
 	return (true);
 }
 
 /*
  * Drop a reference taken with in_pcbref() on a write-locked inpcb.
  *
  * Returns false when other references remain; the inpcb stays locked.
  * Returns true when this was the last reference: the inpcb has then been
  * unlocked and handed to uma_zfree_smr(), and must not be touched again.
  */
 bool
 in_pcbrele_wlocked(struct inpcb *inp)
 {
 	INP_WLOCK_ASSERT(inp);
 
 	if (!refcount_release(&inp->inp_refcount))
 		return (false);
 
 	/* Last reference: the pcb must already have been through in_pcbfree(). */
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
 	MPASS(inp->inp_in_hpts == 0);
 
 	INP_WUNLOCK(inp);
 	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 
 	return (true);
 }
 
 /*
  * Unconditionally schedule an inpcb to be freed by decrementing its
  * reference count, which should occur only after the inpcb has been detached
  * from its socket.  If another thread holds a temporary reference (acquired
  * using in_pcbref()) then the free is deferred until that reference is
  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
  * Almost all work, including removal from global lists, is done in this
  * context, where the pcbinfo lock is held.
  *
  * The caller passes the inpcb write-locked with inp_socket already NULL;
  * the lock is released on every path before this function returns.
  */
 void
 in_pcbfree(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 #ifdef INET
 	struct ip_moptions *imo;
 #endif
 #ifdef INET6
 	struct ip6_moptions *im6o;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 	KASSERT((inp->inp_flags & INP_FREED) == 0,
 	    ("%s: called twice for pcb %p", __func__, inp));
 
 	/* Mark the pcb freed, then unlink it from the global pcb list. */
 	inp->inp_flags |= INP_FREED;
 	INP_INFO_WLOCK(pcbinfo);
 	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 	pcbinfo->ipi_count--;
 	CK_LIST_REMOVE(inp, inp_list);
 	INP_INFO_WUNLOCK(pcbinfo);
 
 	if (inp->inp_flags & INP_INHASHLIST)
 		in_pcbremhash(inp);
 
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 #ifdef MAC
 	mac_inpcb_destroy(inp);
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (inp->inp_sp != NULL)
 		ipsec_delete_pcbpolicy(inp);
 #endif
 #ifdef INET
 	if (inp->inp_options)
 		(void)m_free(inp->inp_options);
 	/* Saved now: inp may be gone once the reference is dropped below. */
 	imo = inp->inp_moptions;
 #endif
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6PROTO) {
 		ip6_freepcbopts(inp->in6p_outputopts);
 		im6o = inp->in6p_moptions;
 	} else
 		im6o = NULL;
 #endif
 
 	/*
 	 * Drop our reference.  A true return means the pcb was unlocked and
 	 * released there; otherwise someone else still holds a reference
 	 * and we only unlock.
 	 */
 	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
 		INP_WUNLOCK(inp);
 	}
 	/* The multicast options are freed after the inpcb lock is gone. */
 #ifdef INET6
 	ip6_freemoptions(im6o);
 #endif
 #ifdef INET
 	inp_freemoptions(imo);
 #endif
 	/* Destruction is finalized in inpcb_dtor(). */
 }
 
 /*
  * Destructor callback for inpcbs: releases the credential reference held
  * in inp_cred.  The size and arg parameters are not used.
  */
 static void
 inpcb_dtor(void *mem, int size, void *arg)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	crfree(inp->inp_cred);
 #ifdef INVARIANTS
 	/* Clear the now stale pointer in INVARIANTS kernels. */
 	inp->inp_cred = NULL;
 #endif
 }
 
 /*
  * Protocols set up their inpcbs differently (notably, they name the lock
  * differently), but tearing one down is the same for all of them:
  * destroy the inpcb lock.  The size parameter is not used.
  */
 static void
 inpcb_fini(void *mem, int size)
 {
 	struct inpcb *inp;
 
 	inp = mem;
 	INP_LOCK_DESTROY(inp);
 }
 
 /*
  * in_pcbdrop() marks an inpcb INP_DROPPED and takes it off the hash lists,
  * which releases its address and port reservation and keeps inpcb lookups
  * from returning it.
  *
  * TCP uses this to flag an inpcb as unused and to stop further packet
  * delivery or event notification while the socket is still open but TCP
  * has closed, e.g. after a shutdown()-initiated close or a RST on the
  * wire.  The port binding becomes reusable, yet so_pcb keeps pointing to a
  * valid inpcb until in_pcbdetach().
  *
  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
  * in_pcbnotifyall() and in_pcbpurgeif0()?
  */
 void
 in_pcbdrop(struct inpcb *inp)
 {
 	INP_WLOCK_ASSERT(inp);
 #ifdef INVARIANTS
 	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 		MPASS(inp->inp_refcount > 1);
 #endif
 
 	inp->inp_flags |= INP_DROPPED;
 	if ((inp->inp_flags & INP_INHASHLIST) != 0)
 		in_pcbremhash(inp);
 }
 
 #ifdef INET
 /*
  * Common routines to return the socket addresses associated with inpcbs.
  *
  * in_sockaddr() allocates (M_SONAME, M_WAITOK | M_ZERO) a sockaddr_in
  * and fills it with the given port and address.  The port is stored as
  * passed.  The caller owns the returned memory.
  */
 struct sockaddr *
 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK | M_ZERO);
 	sin->sin_len = sizeof(*sin);
 	sin->sin_family = AF_INET;
 	sin->sin_port = port;
 	sin->sin_addr = *addr_p;
 
 	return ((struct sockaddr *)sin);
 }
 
 int
 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_lport;
 	addr = inp->inp_laddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 int
 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp;
 	struct in_addr addr;
 	in_port_t port;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 
 	INP_RLOCK(inp);
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 	INP_RUNLOCK(inp);
 
 	*nam = in_sockaddr(port, &addr);
 	return 0;
 }
 
 /*
  * Walk every inpcb on pcbinfo's list, with the pcbinfo write lock held, and
  * call notify(inp, errno) on each pcb whose foreign IPv4 address equals
  * faddr and which still has a socket attached.  With INET6 compiled in,
  * pcbs without INP_IPV4 set are skipped.
  *
  * Each pcb is write-locked before it is examined.  Skipped pcbs are
  * unlocked here; a notified pcb is unlocked here only when the callback
  * returns non-NULL.
  * NOTE(review): presumably a NULL return means the callback has already
  * released the pcb lock itself -- confirm against the notify callbacks.
  */
 void
 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
     struct inpcb *(*notify)(struct inpcb *, int))
 {
 	struct inpcb *inp, *inp_temp;
 
 	INP_INFO_WLOCK(pcbinfo);
 	/* The _SAFE iterator caches the next element before the callback runs. */
 	CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
 		INP_WLOCK(inp);
 #ifdef INET6
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 #endif
 		if (inp->inp_faddr.s_addr != faddr.s_addr ||
 		    inp->inp_socket == NULL) {
 			INP_WUNLOCK(inp);
 			continue;
 		}
 		/* Unlock only if the callback handed the pcb back to us. */
 		if ((*notify)(inp, errno))
 			INP_WUNLOCK(inp);
 	}
 	INP_INFO_WUNLOCK(pcbinfo);
 }
 
 /*
  * Iterator match callback: accept only inpcbs that have INP_IPV4 set and
  * carry multicast options.  The second argument is unused.
  */
 static bool
 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
 {
 
 	return ((inp->inp_vflag & INP_IPV4) != 0 &&
 	    inp->inp_moptions != NULL);
 }
 
 /*
  * Remove references to interface ifp from the multicast options of every
  * matching inpcb in pcbinfo.  The iterator visits only pcbs accepted by
  * inp_v4_multi_match() and is set up with INPLOOKUP_WLOCKPCB.  The caller
  * must hold the IN_MULTI lock.
  */
 void
 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 {
 	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
 	    inp_v4_multi_match, NULL);
 	struct inpcb *inp;
 	struct in_multi *inm;
 	struct in_mfilter *imf;
 	struct ip_moptions *imo;
 
 	IN_MULTI_LOCK_ASSERT();
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_WLOCK_ASSERT(inp);
 
 		/* Non-NULL: guaranteed by the iterator's match function. */
 		imo = inp->inp_moptions;
 		/*
 		 * Unselect the outgoing interface if it is being
 		 * detached.
 		 */
 		if (imo->imo_multicast_ifp == ifp)
 			imo->imo_multicast_ifp = NULL;
 
 		/*
 		 * Drop multicast group membership if we joined
 		 * through the interface being detached.
 		 *
 		 * XXX This can all be deferred to an epoch_call
 		 */
 restart:
 		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 			if ((inm = imf->imf_inm) == NULL)
 				continue;
 			if (inm->inm_ifp != ifp)
 				continue;
 			ip_mfilter_remove(&imo->imo_head, imf);
 			in_leavegroup_locked(inm, NULL);
 			ip_mfilter_free(imf);
 			/*
 			 * The current element was removed and freed, so the
 			 * walk starts over from the head of the list.
 			 */
 			goto restart;
 		}
 	}
 }
 
 /*
  * Lookup a PCB based on the local address and port.  Caller must hold the
  * hash lock.  No inpcb locks or references are acquired.
  *
  * Without INPLOOKUP_WILDCARD, only an unconnected pcb whose local address
  * and port match exactly, and whose prison compares equal to cred's for
  * IPv4, is returned.  With INPLOOKUP_WILDCARD, every pcb on the port is
  * scored and the one with the lowest "wildcard" cost is returned; a cost of
  * zero ends the search early.  Returns NULL when nothing matches.
  */
 #define INP_LOOKUP_MAPPED_PCB_COST	3
 struct inpcb *
 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
     u_short lport, int lookupflags, struct ucred *cred)
 {
 	struct inpcb *inp;
 	/*
 	 * Initial best score: one more than the highest cost a candidate can
 	 * accumulate below, so the first acceptable candidate always wins.
 	 */
 #ifdef INET6
 	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 #else
 	int matchwild = 3;
 #endif
 	int wildcard;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 		struct inpcbhead *head;
 		/*
 		 * Look for an unconnected (wildcard foreign addr) PCB that
 		 * matches the local address and port we're looking for.
 		 */
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr == INADDR_ANY &&
 			    inp->inp_laddr.s_addr == laddr.s_addr &&
 			    inp->inp_lport == lport) {
 				/*
 				 * Found?
 				 */
 				if (prison_equal_ip4(cred->cr_prison,
 				    inp->inp_cred->cr_prison))
 					return (inp);
 			}
 		}
 		/*
 		 * Not found.
 		 */
 		return (NULL);
 	} else {
 		struct inpcbporthead *porthash;
 		struct inpcbport *phd;
 		struct inpcb *match = NULL;
 		/*
 		 * Best fit PCB lookup.
 		 *
 		 * First see if this local port is in use by looking on the
 		 * port hash list.
 		 */
 		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 		    pcbinfo->ipi_porthashmask)];
 		CK_LIST_FOREACH(phd, porthash, phd_hash) {
 			if (phd->phd_port == lport)
 				break;
 		}
 		if (phd != NULL) {
 			/*
 			 * Port is in use by one or more PCBs. Look for best
 			 * fit.
 			 */
 			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 				/* Cost of this candidate; lower is better. */
 				wildcard = 0;
 				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
 				    cred->cr_prison))
 					continue;
 #ifdef INET6
 				/* XXX inp locking */
 				if ((inp->inp_vflag & INP_IPV4) == 0)
 					continue;
 				/*
 				 * We never select the PCB that has
 				 * INP_IPV6 flag and is bound to :: if
 				 * we have another PCB which is bound
 				 * to 0.0.0.0.  If a PCB has the
 				 * INP_IPV6 flag, then we set its cost
 				 * higher than IPv4 only PCBs.
 				 *
 				 * Note that the case only happens
 				 * when a socket is bound to ::, under
 				 * the condition that the use of the
 				 * mapped address is allowed.
 				 */
 				if ((inp->inp_vflag & INP_IPV6) != 0)
 					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 #endif
 				/* A connected pcb costs one point. */
 				if (inp->inp_faddr.s_addr != INADDR_ANY)
 					wildcard++;
 				/*
 				 * One more point when exactly one of the two
 				 * local addresses is the wildcard; two
 				 * different specific addresses never match.
 				 */
 				if (inp->inp_laddr.s_addr != INADDR_ANY) {
 					if (laddr.s_addr == INADDR_ANY)
 						wildcard++;
 					else if (inp->inp_laddr.s_addr != laddr.s_addr)
 						continue;
 				} else {
 					if (laddr.s_addr != INADDR_ANY)
 						wildcard++;
 				}
 				if (wildcard < matchwild) {
 					match = inp;
 					matchwild = wildcard;
 					/* Cannot do better than a zero cost. */
 					if (matchwild == 0)
 						break;
 				}
 			}
 		}
 		return (match);
 	}
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
 static bool
 in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
 {
 	return (domain == M_NODOM || domain == grp->il_numa_domain);
 }
 
 /*
  * Select an inpcb from a load-balance group that matches the given local
  * address and port.  The hash lock assertion must hold.  Groups on the
  * hash chain for lport are classified as jailed/non-jailed and exact/wildcard
  * local-address matches; wildcard groups are considered only when
  * INPLOOKUP_WILDCARD is set.  Within the chosen group, the member is picked
  * by INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) modulo the member count.
  * Returns NULL when no group qualifies.
  */
 static struct inpcb *
 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
     uint16_t fport, int lookupflags, int domain)
 {
 	const struct inpcblbgrouphead *hdr;
 	struct inpcblbgroup *grp;
 	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
 
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	hdr = &pcbinfo->ipi_lbgrouphashbase[
 	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 
 	/*
 	 * Search for an LB group match based on the following criteria:
 	 * - prefer jailed groups to non-jailed groups
 	 * - prefer exact source address matches to wildcard matches
 	 * - prefer groups bound to the specified NUMA domain
 	 */
 	jail_exact = jail_wild = local_exact = local_wild = NULL;
 	CK_LIST_FOREACH(grp, hdr, il_list) {
 		bool injail;
 
 #ifdef INET6
 		if (!(grp->il_vflag & INP_IPV4))
 			continue;
 #endif
 		if (grp->il_lport != lport)
 			continue;
 
 		/* A jailed group must be allowed to use the local address. */
 		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
 		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
 		    laddr) != 0)
 			continue;
 
 		if (grp->il_laddr.s_addr == laddr->s_addr) {
 			if (injail) {
 				jail_exact = grp;
 				if (in_pcblookup_lb_numa_match(grp, domain))
 					/* This is a perfect match. */
 					goto out;
 			} else if (local_exact == NULL ||
 			    in_pcblookup_lb_numa_match(grp, domain)) {
 				local_exact = grp;
 			}
 		} else if (grp->il_laddr.s_addr == INADDR_ANY &&
 		    (lookupflags & INPLOOKUP_WILDCARD) != 0) {
 			if (injail) {
 				if (jail_wild == NULL ||
 				    in_pcblookup_lb_numa_match(grp, domain))
 					jail_wild = grp;
 			} else if (local_wild == NULL ||
 			    in_pcblookup_lb_numa_match(grp, domain)) {
 				local_wild = grp;
 			}
 		}
 	}
 
 	/* No perfect match: take the best remaining candidate class. */
 	if (jail_exact != NULL)
 		grp = jail_exact;
 	else if (jail_wild != NULL)
 		grp = jail_wild;
 	else if (local_exact != NULL)
 		grp = local_exact;
 	else
 		grp = local_wild;
 	if (grp == NULL)
 		return (NULL);
 out:
 	/* Spread flows across the group's members by packet hash. */
 	return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
 	    grp->il_inpcnt]);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
  * that the caller has either locked the hash list, which usually happens
  * for bind(2) operations, or is in SMR section, which happens when sorting
  * out incoming packets.
  *
  * An exact 4-tuple match is tried first.  If none is found and
  * INPLOOKUP_WILDCARD is set, load-balance groups are consulted and then the
  * wildcard hash chain for lport is scanned.  No inpcb lock is taken here.
  * The ifp argument is not referenced in this function body.  Returns NULL
  * when nothing matches.
  */
 static struct inpcb *
 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcbhead *head;
 	struct inpcb *inp, *tmpinp;
 	/* Ports are narrowed from u_int to u_short for the comparisons. */
 	u_short fport = fport_arg, lport = lport_arg;
 
 	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	INP_HASH_LOCK_ASSERT(pcbinfo);
 
 	/*
 	 * First look for an exact match.
 	 */
 	tmpinp = NULL;
 	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
 	    pcbinfo->ipi_hashmask)];
 	CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 		/* XXX inp locking */
 		if ((inp->inp_vflag & INP_IPV4) == 0)
 			continue;
 #endif
 		if (inp->inp_faddr.s_addr == faddr.s_addr &&
 		    inp->inp_laddr.s_addr == laddr.s_addr &&
 		    inp->inp_fport == fport &&
 		    inp->inp_lport == lport) {
 			/*
 			 * XXX We should be able to directly return
 			 * the inp here, without any checks.
 			 * Well unless both bound with SO_REUSEPORT?
 			 */
 			/*
 			 * A match with the PR_IP4 prison flag wins at once;
 			 * otherwise remember the first match on the chain.
 			 */
 			if (prison_flag(inp->inp_cred, PR_IP4))
 				return (inp);
 			if (tmpinp == NULL)
 				tmpinp = inp;
 		}
 	}
 	if (tmpinp != NULL)
 		return (tmpinp);
 
 	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 		struct inpcb *local_wild = NULL, *local_exact = NULL;
 #ifdef INET6
 		struct inpcb *local_wild_mapped = NULL;
 #endif
 		struct inpcb *jail_wild = NULL;
 		int injail;
 
 		/*
 		 * First see if an LB group matches the request before scanning
 		 * all sockets on this port.
 		 */
 		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 		    fport, lookupflags, numa_domain);
 		if (inp != NULL)
 			return (inp);
 
 		/*
 		 * Order of socket selection - we always prefer jails.
 		 *      1. jailed, non-wild.
 		 *      2. jailed, wild.
 		 *      3. non-jailed, non-wild.
 		 *      4. non-jailed, wild.
 		 */
 
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 		    pcbinfo->ipi_hashmask)];
 		CK_LIST_FOREACH(inp, head, inp_hash) {
 #ifdef INET6
 			/* XXX inp locking */
 			if ((inp->inp_vflag & INP_IPV4) == 0)
 				continue;
 #endif
 			if (inp->inp_faddr.s_addr != INADDR_ANY ||
 			    inp->inp_lport != lport)
 				continue;
 
 			injail = prison_flag(inp->inp_cred, PR_IP4);
 			if (injail) {
 				if (prison_check_ip4_locked(
 				    inp->inp_cred->cr_prison, &laddr) != 0)
 					continue;
 			} else {
 				/*
 				 * Once a non-jailed exact match is known,
 				 * further non-jailed pcbs cannot improve on
 				 * it; only jailed ones still matter.
 				 */
 				if (local_exact != NULL)
 					continue;
 			}
 
 			if (inp->inp_laddr.s_addr == laddr.s_addr) {
 				if (injail)
 					return (inp);
 				else
 					local_exact = inp;
 			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 #ifdef INET6
 				/* XXX inp locking, NULL check */
 				if (inp->inp_vflag & INP_IPV6PROTO)
 					local_wild_mapped = inp;
 				else
 #endif
 					if (injail)
 						jail_wild = inp;
 					else
 						local_wild = inp;
 			}
 		} /* LIST_FOREACH */
 		/* Jailed exact matches returned above; apply the rest in order. */
 		if (jail_wild != NULL)
 			return (jail_wild);
 		if (local_exact != NULL)
 			return (local_exact);
 		if (local_wild != NULL)
 			return (local_wild);
 #ifdef INET6
 		if (local_wild_mapped != NULL)
 			return (local_wild_mapped);
 #endif
 	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 
 	return (NULL);
 }
 
 /*
  * Lookup PCB in hash list, using pcbinfo tables.  This variation performs
  * the lookup inside an SMR section and, on success, returns the inpcb
  * locked as requested by the INPLOOKUP_LOCKMASK bits of lookupflags (i.e.,
  * requires INPLOOKUP_LOCKPCB).  Only the INPLOOKUP_WILDCARD bit is passed
  * down to in_pcblookup_hash_locked().
  *
  * Returns NULL when no pcb matches or when inp_smr_lock() fails.
  * NOTE(review): smr_exit() is called here only on the not-found path;
  * presumably inp_smr_lock() leaves the SMR section itself on both of its
  * outcomes -- confirm.
  */
 static struct inpcb *
 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, uint8_t numa_domain)
 {
 	struct inpcb *inp;
 
 	smr_enter(pcbinfo->ipi_smr);
 	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
 	if (inp != NULL) {
 		/* Failing to lock the pcb is reported as "not found". */
 		if (__predict_false(inp_smr_lock(inp,
 		    (lookupflags & INPLOOKUP_LOCKMASK)) == false))
 			inp = NULL;
 	} else
 		smr_exit(pcbinfo->ipi_smr);
 
 	return (inp);
 }
 
 /*
  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
  * from which a pre-calculated hash value may be extracted.
  *
  * in_pcblookup() asserts that only INPLOOKUP_MASK bits are set and that a
  * read or write pcb lock was requested, then performs the lookup with no
  * NUMA domain preference (M_NODOM).
  */
 struct inpcb *
 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, M_NODOM);
 	return (inp);
 }
 
 /*
  * Same as in_pcblookup(), except that the NUMA domain recorded in the
  * packet header of mbuf m is passed down to the lookup.
  */
 struct inpcb *
 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
     struct ifnet *ifp, struct mbuf *m)
 {
 	struct inpcb *inp;
 
 	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 	    ("%s: invalid lookup flags %d", __func__, lookupflags));
 	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 	    ("%s: LOCKPCB not set", __func__));
 
 	inp = in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 	    lookupflags, ifp, m->m_pkthdr.numa_domain);
 	return (inp);
 }
 #endif /* INET */
 
 /*
  * Insert PCB onto various hash lists.
  *
  * The pcb is linked onto the connection hash chain, onto the per-port list
  * for its local port and, when INP_REUSEPORT_LB is set, into a load-balance
  * group.  The caller must hold both the inpcb write lock and the hash write
  * lock, and the pcb must not already be hashed.
  *
  * Returns 0 on success, the error from in_pcbinslbgrouphash() if that
  * fails, or ENOMEM if a port head cannot be allocated.
  */
 int
 in_pcbinshash(struct inpcb *inp)
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 	    ("in_pcbinshash: INP_INHASHLIST"));
 
 	/* Pick the connection hash bucket for the pcb's address family. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 	else
 #endif
 		pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
 	/*
 	 * Add entry to load balance group.
 	 * Only do this if SO_REUSEPORT_LB is set.
 	 */
 	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) {
 		int error = in_pcbinslbgrouphash(inp, M_NODOM);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 		if (phd->phd_port == inp->inp_lport)
 			break;
 	}
 
 	/*
 	 * If none exists, malloc one and tack it on.
 	 */
 	if (phd == NULL) {
 		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
 		if (phd == NULL) {
 			/* Undo the load-balance group insertion done above. */
 			if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 				in_pcbremlbgrouphash(inp);
 			return (ENOMEM);
 		}
 		phd->phd_port = inp->inp_lport;
 		CK_LIST_INIT(&phd->phd_pcblist);
 		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 	}
 	inp->inp_phd = phd;
 	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	inp->inp_flags |= INP_INHASHLIST;
 
 	return (0);
 }
 
 /*
  * Remove a hashed PCB from the connection hash, from its per-port list and,
  * when INP_REUSEPORT_LB is set, from its load-balance group.  The caller
  * must hold the inpcb write lock; the hash write lock is taken and released
  * here.  INP_INHASHLIST is cleared on return.
  */
 static void
 in_pcbremhash(struct inpcb *inp)
 {
 	struct inpcbport *phd = inp->inp_phd;
 
 	INP_WLOCK_ASSERT(inp);
 	MPASS(inp->inp_flags & INP_INHASHLIST);
 
 	INP_HASH_WLOCK(inp->inp_pcbinfo);
 	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 		in_pcbremlbgrouphash(inp);
 	CK_LIST_REMOVE(inp, inp_hash);
 	CK_LIST_REMOVE(inp, inp_portlist);
 	/* Free the port head once its last pcb has been removed. */
 	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 		CK_LIST_REMOVE(phd, phd_hash);
 		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
 	}
 	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 	inp->inp_flags &= ~INP_INHASHLIST;
 }
 
 /*
  * Move PCB to the proper hash bucket when { faddr, fport } have been
  * changed. NOTE: This does not handle the case of the lport changing (the
  * hashed port list would have to be updated as well), so the lport must
  * not change after in_pcbinshash() has been called.
  *
  * The caller must hold both the inpcb write lock and the hash write lock,
  * and the pcb must currently be hashed.
  *
  * XXXGL: a race between this function and SMR-protected hash iterator
  * will lead to iterator traversing a possibly wrong hash list. However,
  * this race should have been here since change from rwlock to epoch.
  */
 void
 in_pcbrehash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *head;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
 	KASSERT(inp->inp_flags & INP_INHASHLIST,
 	    ("in_pcbrehash: !INP_INHASHLIST"));
 
 	/* Compute the bucket that corresponds to the pcb's current tuple. */
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
 		head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 	else
 #endif
 		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
 		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 
 	/* Unlink from the old chain and relink at the head of the new one. */
 	CK_LIST_REMOVE(inp, inp_hash);
 	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 }
 
 /*
  * Called when a higher layer reports service problems on this PCB.
  * The only action taken for now is to invalidate the PCB's cached route,
  * so that a route created dynamically (by a redirect) is dropped and a
  * default gateway can be tried again.
  */
 void
 in_losing(struct inpcb *inp)
 {
 
 	RO_INVALIDATE_CACHE(&inp->inp_route);
 }
 
 /*
  * A set label operation has occurred at the socket layer, propagate the
  * label change into the in_pcb for the socket.
  *
  * This is a no-op unless the kernel is built with MAC.  The inpcb write
  * lock is taken before the socket lock and released after it.
  */
 void
 in_pcbsosetlabel(struct socket *so)
 {
 #ifdef MAC
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 
 	INP_WLOCK(inp);
 	SOCK_LOCK(so);
 	mac_inpcb_sosetlabel(so, inp);
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 #endif
 }
 
 /* Function wrapper around the INP_WLOCK() macro. */
 void
 inp_wlock(struct inpcb *inp)
 {
 
 	INP_WLOCK(inp);
 }
 
 /* Function wrapper around the INP_WUNLOCK() macro. */
 void
 inp_wunlock(struct inpcb *inp)
 {
 
 	INP_WUNLOCK(inp);
 }
 
 /* Function wrapper around the INP_RLOCK() macro. */
 void
 inp_rlock(struct inpcb *inp)
 {
 
 	INP_RLOCK(inp);
 }
 
 /* Function wrapper around the INP_RUNLOCK() macro. */
 void
 inp_runlock(struct inpcb *inp)
 {
 
 	INP_RUNLOCK(inp);
 }
 
 #ifdef INVARIANT_SUPPORT
 /*
  * Function wrapper around INP_WLOCK_ASSERT(); only built when
  * INVARIANT_SUPPORT is defined.
  */
 void
 inp_lock_assert(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 }
 
 /*
  * Function wrapper around INP_UNLOCK_ASSERT(); only built when
  * INVARIANT_SUPPORT is defined.
  */
 void
 inp_unlock_assert(struct inpcb *inp)
 {
 
 	INP_UNLOCK_ASSERT(inp);
 }
 #endif
 
 /*
  * Invoke func(inp, arg) on every inpcb produced by an all-pcbs iterator
  * over pcbinfo.  The iterator is created with INPLOOKUP_WLOCKPCB.
  */
 void
 inp_apply_all(struct inpcbinfo *pcbinfo,
     void (*func)(struct inpcb *, void *), void *arg)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
 	    INPLOOKUP_WLOCKPCB);
 	struct inpcb *cur;
 
 	for (cur = inp_next(&inpi); cur != NULL; cur = inp_next(&inpi))
 		func(cur, arg);
 }
 
 /*
  * Return the socket pointer stored in the inpcb.  The caller must hold the
  * inpcb write lock.
  */
 struct socket *
 inp_inpcbtosocket(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return (inp->inp_socket);
 }
 
 /*
  * Return the inpcb's inp_ppcb pointer cast to a tcpcb pointer.  The caller
  * must hold the inpcb write lock.  No check is made here that the pcb
  * actually belongs to TCP.
  */
 struct tcpcb *
 inp_inpcbtotcpcb(struct inpcb *inp)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	return ((struct tcpcb *)inp->inp_ppcb);
 }
 
 /* Return the inpcb's inp_ip_tos value.  No lock is asserted here. */
 int
 inp_ip_tos_get(const struct inpcb *inp)
 {
 
 	return (inp->inp_ip_tos);
 }
 
 /* Store val in the inpcb's inp_ip_tos field.  No lock is asserted here. */
 void
 inp_ip_tos_set(struct inpcb *inp, int val)
 {
 
 	inp->inp_ip_tos = val;
 }
 
 /*
  * Copy the inpcb's IPv4 local/foreign addresses and ports into the caller's
  * variables.  The values are copied as stored, with no byte-order
  * conversion.  The caller must hold the inpcb lock.
  */
 void
 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
     uint32_t *faddr, uint16_t *fp)
 {
 
 	INP_LOCK_ASSERT(inp);
 	*laddr = inp->inp_laddr.s_addr;
 	*faddr = inp->inp_faddr.s_addr;
 	*lp = inp->inp_lport;
 	*fp = inp->inp_fport;
 }
 
 /* Function wrapper around sotoinpcb(). */
 struct inpcb *
 so_sotoinpcb(struct socket *so)
 {
 
 	return (sotoinpcb(so));
 }
 
 /*
  * Create an external-format (``xinpcb'') structure using the information in
  * the kernel-format in_pcb structure pointed to by inp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 {
 
 	/* Start from all zeroes so fields not copied below stay cleared. */
 	bzero(xi, sizeof(*xi));
 	xi->xi_len = sizeof(struct xinpcb);
 	/* Socket details are exported only while a socket is attached. */
 	if (inp->inp_socket)
 		sotoxsocket(inp->inp_socket, &xi->xi_socket);
 	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 	xi->inp_gencnt = inp->inp_gencnt;
 	/* The protocol pcb pointer is exported as an integer value. */
 	xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 	xi->inp_flow = inp->inp_flow;
 	xi->inp_flowid = inp->inp_flowid;
 	xi->inp_flowtype = inp->inp_flowtype;
 	xi->inp_flags = inp->inp_flags;
 	xi->inp_flags2 = inp->inp_flags2;
 	xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 	xi->in6p_cksum = inp->in6p_cksum;
 	xi->in6p_hops = inp->in6p_hops;
 	xi->inp_ip_tos = inp->inp_ip_tos;
 	xi->inp_vflag = inp->inp_vflag;
 	xi->inp_ip_ttl = inp->inp_ip_ttl;
 	xi->inp_ip_p = inp->inp_ip_p;
 	xi->inp_ip_minttl = inp->inp_ip_minttl;
 }
 
 /*
  * Sysctl handler helper that applies a "set" socket option to one inpcb of
  * pcbinfo.  The new sysctl value is a struct sockopt_parameters header; the
  * option value length is taken to be the number of bytes following the
  * header.  The target is the pcb whose inp_gencnt equals sop_id.
  *
  * When a local port is supplied, the iterator is restricted to the single
  * hash chain computed from the supplied connection info; otherwise all pcbs
  * are scanned.
  *
  * Returns EINVAL if an old value was requested or the input is shorter than
  * the header, EPERM if no new value was supplied, ENOMEM if the input does
  * not fit the on-stack buffer, the SYSCTL_IN() error if the copy fails,
  * ECONNRESET if the matching pcb has been dropped, ESRCH if no pcb matches,
  * and otherwise the result of ctloutput_set().
  * NOTE(review): after a successful match the pcb is not unlocked here;
  * presumably ctloutput_set() releases the inpcb write lock -- confirm.
  */
 int
 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
     int (*ctloutput_set)(struct inpcb *, struct sockopt *))
 {
 	struct sockopt sopt;
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
 	    INPLOOKUP_WLOCKPCB);
 	struct inpcb *inp;
 	struct sockopt_parameters *params;
 	struct socket *so;
 	int error;
 	char buf[1024];
 
 	/* This node is write-only. */
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen > sizeof(buf))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, buf, req->newlen);
 	if (error != 0)
 		return (error);
 	if (req->newlen < sizeof(struct sockopt_parameters))
 		return (EINVAL);
 	params = (struct sockopt_parameters *)buf;
 	sopt.sopt_level = params->sop_level;
 	sopt.sopt_name = params->sop_optname;
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_val = params->sop_optval;
 	sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
 	sopt.sopt_td = NULL;
 #ifdef INET6
 	/* Embed the zone id into link-local IPv6 addresses. */
 	if (params->sop_inc.inc_flags & INC_ISIPV6) {
 		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
 			params->sop_inc.inc6_laddr.s6_addr16[1] =
 			    htons(params->sop_inc.inc6_zoneid & 0xffff);
 		if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
 			params->sop_inc.inc6_faddr.s6_addr16[1] =
 			    htons(params->sop_inc.inc6_zoneid & 0xffff);
 	}
 #endif
 	/* With a local port known, narrow the search to one hash chain. */
 	if (params->sop_inc.inc_lport != htons(0)) {
 		if (params->sop_inc.inc_fport == htons(0))
 			inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport,
 			    pcbinfo->ipi_hashmask);
 		else
 #ifdef INET6
 			if (params->sop_inc.inc_flags & INC_ISIPV6)
 				inpi.hash = INP6_PCBHASH(
 				    &params->sop_inc.inc6_faddr,
 				    params->sop_inc.inc_lport,
 				    params->sop_inc.inc_fport,
 				    pcbinfo->ipi_hashmask);
 			else
 #endif
 				inpi.hash = INP_PCBHASH(
 				    &params->sop_inc.inc_faddr,
 				    params->sop_inc.inc_lport,
 				    params->sop_inc.inc_fport,
 				    pcbinfo->ipi_hashmask);
 	}
 	while ((inp = inp_next(&inpi)) != NULL)
 		if (inp->inp_gencnt == params->sop_id) {
 			if (inp->inp_flags & INP_DROPPED) {
 				INP_WUNLOCK(inp);
 				return (ECONNRESET);
 			}
 			so = inp->inp_socket;
 			KASSERT(so != NULL, ("inp_socket == NULL"));
 			/* Hold a socket reference across the callback. */
 			soref(so);
 			error = (*ctloutput_set)(inp, &sopt);
 			sorele(so);
 			break;
 		}
 	/* The iterator ran to completion without finding sop_id. */
 	if (inp == NULL)
 		error = ESRCH;
 	return (error);
 }
 
 #ifdef DDB
 /* Emit "indent" space characters on the debugger console. */
 static void
 db_print_indent(int indent)
 {
 
 	for (; indent > 0; indent--)
 		db_printf(" ");
 }
 
 /*
  * Print an in_conninfo for the debugger: a heading line with "name" and the
  * structure's address, followed by the local and foreign address/port
  * pairs, indented two columns further than the heading.  Ports are
  * converted with ntohs() for display.
  */
 static void
 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 {
 	char faddr_str[48], laddr_str[48];
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inc);
 
 	indent += 2;
 
 	/* Format the addresses according to the connection's family. */
 #ifdef INET6
 	if (inc->inc_flags & INC_ISIPV6) {
 		/* IPv6. */
 		ip6_sprintf(laddr_str, &inc->inc6_laddr);
 		ip6_sprintf(faddr_str, &inc->inc6_faddr);
 	} else
 #endif
 	{
 		/* IPv4. */
 		inet_ntoa_r(inc->inc_laddr, laddr_str);
 		inet_ntoa_r(inc->inc_faddr, faddr_str);
 	}
 	db_print_indent(indent);
 	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 	    ntohs(inc->inc_lport));
 	db_print_indent(indent);
 	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 	    ntohs(inc->inc_fport));
 }
 
 /*
  * Print the symbolic names of the bits set in inp_flags as a
  * comma-separated list on the debugger console.  Nothing is printed when no
  * known bit is set.  The separator is emitted before every name except the
  * first one.
  */
 static void
 db_print_inpflags(int inp_flags)
 {
 	int comma;
 
 	/* Becomes non-zero once the first name has been printed. */
 	comma = 0;
 	if (inp_flags & INP_RECVOPTS) {
 		db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVRETOPTS) {
 		db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVDSTADDR) {
 		db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ORIGDSTADDR) {
 		db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HDRINCL) {
 		db_printf("%sINP_HDRINCL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_HIGHPORT) {
 		db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_LOWPORT) {
 		db_printf("%sINP_LOWPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ANONPORT) {
 		db_printf("%sINP_ANONPORT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVIF) {
 		db_printf("%sINP_RECVIF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_MTUDISC) {
 		db_printf("%sINP_MTUDISC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTTL) {
 		db_printf("%sINP_RECVTTL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DONTFRAG) {
 		db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_RECVTOS) {
 		db_printf("%sINP_RECVTOS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_IPV6_V6ONLY) {
 		db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_PKTINFO) {
 		db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPLIMIT) {
 		db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_HOPOPTS) {
 		db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_DSTOPTS) {
 		db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDR) {
 		db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RTHDRDSTOPTS) {
 		db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_TCLASS) {
 		db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_AUTOFLOWLABEL) {
 		db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_ONESBCAST) {
 		db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_DROPPED) {
 		db_printf("%sINP_DROPPED", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & INP_SOCKREF) {
 		db_printf("%sINP_SOCKREF", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_RFC2292) {
 		db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 		comma = 1;
 	}
 	if (inp_flags & IN6P_MTU) {
 		/* The separator goes before the name, as for all other flags. */
 		db_printf("%sIN6P_MTU", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 /*
  * Print the symbolic names of the bits set in inp_vflag as a
  * comma-separated list on the debugger console.
  */
 static void
 db_print_inpvflag(u_char inp_vflag)
 {
 	/* Empty before the first name, ", " before every later one. */
 	const char *sep = "";
 
 	if (inp_vflag & INP_IPV4) {
 		db_printf("%sINP_IPV4", sep);
 		sep = ", ";
 	}
 	if (inp_vflag & INP_IPV6) {
 		db_printf("%sINP_IPV6", sep);
 		sep = ", ";
 	}
 	if (inp_vflag & INP_IPV6PROTO) {
 		db_printf("%sINP_IPV6PROTO", sep);
 		sep = ", ";
 	}
 }
 
 /*
  * Print the contents of an inpcb for the debugger: a heading line with
  * "name" and the pcb's address, followed by its fields indented two columns
  * further than the heading.  Flag words are printed both in hex and as
  * symbolic names.  Every output line is preceded by the indentation.
  */
 static void
 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, inp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 
 	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 
 	db_print_indent(indent);
 	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 
 	db_print_indent(indent);
 	db_printf("inp_label: %p   inp_flags: 0x%x (",
 	   inp->inp_label, inp->inp_flags);
 	db_print_inpflags(inp->inp_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 	    inp->inp_vflag);
 	db_print_inpvflag(inp->inp_vflag);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 
 	db_print_indent(indent);
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		db_printf("in6p_options: %p   in6p_outputopts: %p   "
 		    "in6p_moptions: %p\n", inp->in6p_options,
 		    inp->in6p_outputopts, inp->in6p_moptions);
 		/* The IPv6 branch prints two lines; indent the second too. */
 		db_print_indent(indent);
 		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 		    inp->in6p_hops);
 	} else
 #endif
 	{
 		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 		    inp->inp_options, inp->inp_moptions);
 	}
 
 	db_print_indent(indent);
 	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 	    (uintmax_t)inp->inp_gencnt);
 }
 
 /*
  * Debugger command "show inpcb <addr>": interpret the given address as a
  * struct inpcb pointer and print it.  A usage message is printed when no
  * address is supplied.
  * NOTE(review): have_addr and addr are not declared in this body; they are
  * presumably parameters supplied by the DB_SHOW_COMMAND() macro -- confirm.
  */
 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 {
 	struct inpcb *inp;
 
 	if (!have_addr) {
 		db_printf("usage: show inpcb <addr>\n");
 		return;
 	}
 	/* The address is used as given; it is not validated here. */
 	inp = (struct inpcb *)addr;
 
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
 
 #ifdef RATELIMIT
 /*
  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
  * if any.
  *
  * Returns EINVAL when no send tag is attached, EOPNOTSUPP when the tag
  * provides no snd_tag_modify method, and otherwise the result of that
  * method.
  */
 int
 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *tag = inp->inp_snd_tag;
 
 	if (tag == NULL)
 		return (EINVAL);
 	if (tag->sw->snd_tag_modify == NULL)
 		return (EOPNOTSUPP);
 
 	return (tag->sw->snd_tag_modify(tag, &params));
 }
 
 /*
  * Query existing TX rate limit based on the existing
  * "inp->inp_snd_tag", if any.
  *
  * On success the maximum rate is stored through p_max_pacing_rate when that
  * pointer is non-NULL.  Returns EINVAL when no send tag is attached,
  * EOPNOTSUPP when the tag provides no snd_tag_query method, and otherwise
  * the result of that method.
  */
 int
 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *tag = inp->inp_snd_tag;
 	int error;
 
 	if (tag == NULL)
 		return (EINVAL);
 	if (tag->sw->snd_tag_query == NULL)
 		return (EOPNOTSUPP);
 
 	error = tag->sw->snd_tag_query(tag, &params);
 	if (error != 0 || p_max_pacing_rate == NULL)
 		return (error);
 
 	*p_max_pacing_rate = params.rate_limit.max_rate;
 	return (error);
 }
 
 /*
  * Query existing TX queue level based on the existing
  * "inp->inp_snd_tag", if any.
  *
  * On success the queue level is stored through p_txqueue_level when that
  * pointer is non-NULL.  Returns EINVAL when no send tag is attached,
  * EOPNOTSUPP when the tag provides no snd_tag_query method, and otherwise
  * the result of that method.
  */
 int
 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 {
 	union if_snd_tag_query_params params = { };
 	struct m_snd_tag *tag = inp->inp_snd_tag;
 	int error;
 
 	if (tag == NULL)
 		return (EINVAL);
 
 	if (tag->sw->snd_tag_query != NULL) {
 		error = tag->sw->snd_tag_query(tag, &params);
 		if (error == 0 && p_txqueue_level != NULL)
 			*p_txqueue_level = params.rate_limit.queue_level;
 	} else {
 		error = EOPNOTSUPP;
 	}
 	return (error);
 }
 
 /*
  * Allocate a new TX rate limit send tag from the network interface
  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
  *
  * The tag is actually stored through the "st" argument.  A max_pacing_rate
  * of -1U requests an IF_SND_TAG_TYPE_UNLIMITED tag; any other value
  * requests IF_SND_TAG_TYPE_RATE_LIMIT.  The caller must hold the inpcb
  * write lock.  Returns EINVAL if *st is already set or the pcb has been
  * dropped, and otherwise the result of m_snd_tag_alloc().
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
 
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If there is already a send tag, or the INP is being torn
 	 * down, allocating a new send tag is not allowed. Else send
 	 * tags may leak.
 	 */
 	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
 		return (EINVAL);
 
 	error = m_snd_tag_alloc(ifp, &params, st);
 #ifdef INET
 	/* Statistics: EOPNOTSUPP is not counted as an allocation failure. */
 	if (error == 0) {
 		counter_u64_add(rate_limit_set_ok, 1);
 		counter_u64_add(rate_limit_active, 1);
 	} else if (error != EOPNOTSUPP)
 		  counter_u64_add(rate_limit_alloc_fail, 1);
 #endif
 	return (error);
 }
 
 /*
  * Release a reference on the given send tag and, when INET is compiled in,
  * decrement the rate_limit_active counter.
  */
 void
 in_pcbdetach_tag(struct m_snd_tag *mst)
 {
 
 	m_snd_tag_rele(mst);
 #ifdef INET
 	counter_u64_add(rate_limit_active, -1);
 #endif
 }
 
 /*
  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
  * if any:
  *
  * The pcb's tag pointer is cleared before the tag reference is released.
  * The caller must hold the inpcb write lock.
  */
 void
 in_pcbdetach_txrtlmt(struct inpcb *inp)
 {
 	struct m_snd_tag *tag;
 
 	INP_WLOCK_ASSERT(inp);
 
 	tag = inp->inp_snd_tag;
 	if (tag == NULL)
 		return;
 
 	inp->inp_snd_tag = NULL;
 	m_snd_tag_rele(tag);
 #ifdef INET
 	counter_u64_add(rate_limit_active, -1);
 #endif
 }
 
 /*
  * Bring the pcb's TX rate limit send tag in line with max_pacing_rate for
  * transmission on ifp: detach a tag that belongs to another interface,
  * then attach, modify or detach a tag as needed.  INP_RATE_LIMIT_CHANGED is
  * cleared when the result is 0 or EOPNOTSUPP.  Returns EAGAIN when a tag is
  * needed but the mbuf carries no hash type yet.
  * NOTE(review): no lock is asserted here, but in_pcbdetach_txrtlmt() and
  * in_pcbattach_txrtlmt() assert the inpcb write lock, so callers presumably
  * must hold it -- confirm.
  */
 int
 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
 {
 	int error;
 
 	/*
 	 * If the existing send tag is for the wrong interface due to
 	 * a route change, first drop the existing tag.  Set the
 	 * CHANGED flag so that we will keep trying to allocate a new
 	 * tag if we fail to allocate one this time.
 	 */
 	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
 		in_pcbdetach_txrtlmt(inp);
 		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 	}
 
 	/*
 	 * NOTE: When attaching to a network interface a reference is
 	 * made to ensure the network interface doesn't go away until
 	 * all ratelimit connections are gone. The network interface
 	 * pointers compared below represent valid network interfaces,
 	 * except when comparing towards NULL.
 	 */
 	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 		/* No rate requested and no tag attached: nothing to do. */
 		error = 0;
 	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
 		/* Interface has TX rate limiting disabled: drop any tag. */
 		if (inp->inp_snd_tag != NULL)
 			in_pcbdetach_txrtlmt(inp);
 		error = 0;
 	} else if (inp->inp_snd_tag == NULL) {
 		/*
 		 * In order to utilize packet pacing with RSS, we need
 		 * to wait until there is a valid RSS hash before we
 		 * can proceed:
 		 */
 		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 			error = EAGAIN;
 		} else {
 			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
 		}
 	} else {
 		/* A tag for this interface already exists: update its rate. */
 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 	}
 	if (error == 0 || error == EOPNOTSUPP)
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 
 	return (error);
 }
 
 /*
  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
  * is set in the fast path and will attach/detach/modify the TX rate
  * limit send tag based on the socket's so_max_pacing_rate value.
  *
  * Nothing is done when inp is NULL or has no socket.  The result of the
  * locked helper is not reported to the caller.
  */
 void
 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 {
 	struct socket *so;
 	bool did_upgrade;
 
 	if (inp == NULL || (so = inp->inp_socket) == NULL)
 		return;
 
 	/*
 	 * The work below needs the write lock.  If it is not already held,
 	 * try to upgrade; when that fails, bail out and let the transmit
 	 * use the non-ratelimited ring until there is a new chance to get
 	 * the write lock.
 	 */
 	did_upgrade = !INP_WLOCKED(inp);
 	if (did_upgrade && !INP_TRY_UPGRADE(inp))
 		return;
 
 	/*
 	 * so_max_pacing_rate is read without the socket lock: it is
 	 * re-checked for every mbuf sent, and the read itself is assumed to
 	 * be atomic.
 	 */
 	in_pcboutput_txrtlmt_locked(inp, ifp, mb, so->so_max_pacing_rate);
 
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 
 /*
  * Track route changes for TX rate limiting.
  *
  * Drops the pcb's send tag, if it has one, and sets INP_RATE_LIMIT_CHANGED
  * so that a new send tag allocation is attempted later.
  */
 void
 in_pcboutput_eagain(struct inpcb *inp)
 {
 	bool did_upgrade;
 
 	if (inp == NULL || inp->inp_snd_tag == NULL)
 		return;
 
 	/*
 	 * The work below needs the write lock.  If it is not already held,
 	 * try to upgrade; when that fails, bail out and let the transmit
 	 * use the non-ratelimited ring until there is a new chance to get
 	 * the write lock.
 	 */
 	did_upgrade = !INP_WLOCKED(inp);
 	if (did_upgrade && !INP_TRY_UPGRADE(inp))
 		return;
 
 	/* Detach rate limiting and request a fresh tag allocation. */
 	in_pcbdetach_txrtlmt(inp);
 	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
 
 #ifdef INET
 /*
  * Allocate the rate limit statistics counters.  Registered below as a
  * SYSINIT; the argument is unused.
  */
 static void
 rl_init(void *st)
 {
 	rate_limit_new = counter_u64_alloc(M_WAITOK);
 	rate_limit_chg = counter_u64_alloc(M_WAITOK);
 	rate_limit_active = counter_u64_alloc(M_WAITOK);
 	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
 	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
 }
 
 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 #endif
 #endif /* RATELIMIT */
diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c
index 623e788eec91..8c3974e15ba3 100644
--- a/sys/netinet/in_rmx.c
+++ b/sys/netinet/in_rmx.c
@@ -1,184 +1,185 @@
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 #include <sys/mbuf.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 
 /*
  * Set the prefix-type flag and the broadcast attribute of nexthop 'nh'
  * for the IPv4 prefix addr/mask.
  *
  *  - no mask:        host route (NHF_HOST)
  *  - all-zero mask:  default route (NHF_DEFAULT)
  *  - anything else:  no prefix-type flag
  *
  * 'fibnum' is not used.  Always returns 0.
  */
 static int
 rib4_set_nh_pfxflags(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
     struct nhop_object *nh)
 {
 	const struct sockaddr_in *dst = (const struct sockaddr_in *)addr;
 	const struct sockaddr_in *netmask = (const struct sockaddr_in *)mask;
 	bool bcast = false;
 
 	if (mask != NULL) {
 		nhop_set_pxtype_flag(nh,
 		    netmask->sin_addr.s_addr == 0 ? NHF_DEFAULT : 0);
 	} else {
 		nhop_set_pxtype_flag(nh, NHF_HOST);
 		/*
 		 * Backward compatibility: a host route whose destination
 		 * is a broadcast address is marked as a broadcast route.
 		 * This was useful when route cloning was in place and
 		 * every broadcast address had an explicit cloned route.
 		 * Currently (2020-04) there is no kernel machinery to do
 		 * route cloning, though someone might explicitly add
 		 * these routes to support some cases with active-active
 		 * load balancing.  Given that, retain this support.
 		 */
 		if (in_broadcast(dst->sin_addr, nh->nh_ifp))
 			bcast = true;
 	}
 
 	nhop_set_broadcast(nh, bcast);
 
 	return (0);
 }
 
 /*
  * Fill in the defaults of an IPv4 nexthop: clamp its MTU to the
  * interface MTU and, when no nexthop type has been set yet, choose the
  * basic per-AF type.  'fibnum' is not used.  Always returns 0.
  */
 static int
 rib4_augment_nh(u_int fibnum, struct nhop_object *nh)
 {
 	/*
 	 * Route MTU: inherit the interface MTU when it is unset, and
 	 * cut it down to the interface MTU when it is too large.
 	 */
 	if (nh->nh_mtu == 0 || nh->nh_mtu > nh->nh_ifp->if_mtu)
 		nh->nh_mtu = nh->nh_ifp->if_mtu;
 
 	/* A type that is already set is left alone. */
 	if (nhop_get_type(nh) != 0)
 		return (0);
 
 	/* Set nhop type to basic per-AF nhop. */
 	nhop_set_type(nh, (nh->nh_flags & NHF_GATEWAY) ?
 	    NH_TYPE_IPV4_ETHER_NHOP : NH_TYPE_IPV4_ETHER_RSLV);
 
 	return (0);
 }
 
 /*
  * Initialize our routing tree.
  */
 struct rib_head *
 in_inithead(uint32_t fibnum)
 {
 	struct rib_head *head = rt_table_init(32, AF_INET, fibnum);
 
 	/* Hook up the IPv4-specific nexthop callbacks on success only. */
 	if (head != NULL) {
 		head->rnh_set_nh_pfxflags = rib4_set_nh_pfxflags;
 		head->rnh_augment_nh = rib4_augment_nh;
 	}
 
 	/* NULL here means rt_table_init() failed. */
 	return (head);
 }
 
 #ifdef VIMAGE
 /*
  * Destroy the routing table head 'rh' by handing it to
  * rt_table_destroy().  Only compiled for VIMAGE kernels.
  */
 void
 in_detachhead(struct rib_head *rh)
 {
 
 	rt_table_destroy(rh);
 }
 #endif
 
 /*
  * This zaps old routes when the interface goes down or interface
  * address is deleted.  In the latter case, it deletes static routes
  * that point to this address.  If we don't do this, we may end up
  * using the old address in the future.  The ones we always want to
  * get rid of are things like ARP entries, since the user might down
  * the interface, walk over to a completely different network, and
  * plug back in.
  */
 /* Argument block passed from in_ifadown() to in_ifadownkill(). */
 struct in_ifadown_arg {
 	struct ifaddr *ifa;	/* only nexthops using this ifaddr match */
 	int del;		/* when 0, RTF_STATIC routes are kept */
 };
 
 static int
 in_ifadownkill(const struct rtentry *rt, const struct nhop_object *nh,
     void *xap)
 {
 	struct in_ifadown_arg *ap = xap;
 
 	if (nh->nh_ifa != ap->ifa)
 		return (0);
 
 	if ((nhop_get_rtflags(nh) & RTF_STATIC) != 0 && ap->del == 0)
 		return (0);
 
 	return (1);
 }
 
 void
 in_ifadown(struct ifaddr *ifa, int delete)
 {
 	struct in_ifadown_arg arg;
 
 	KASSERT(ifa->ifa_addr->sa_family == AF_INET,
 	    ("%s: wrong family", __func__));
 
 	arg.ifa = ifa;
 	arg.del = delete;
 
 	rib_foreach_table_walk_del(AF_INET, in_ifadownkill, &arg);
 	ifa->ifa_flags &= ~IFA_ROUTE;		/* XXXlocking? */
 }
diff --git a/sys/netinet/ip_carp.c b/sys/netinet/ip_carp.c
index 3846576b4482..22b256238774 100644
--- a/sys/netinet/ip_carp.c
+++ b/sys/netinet/ip_carp.c
@@ -1,2278 +1,2279 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Michael Shalayeff.
  * Copyright (c) 2003 Ryan McBride.
  * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devctl.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/taskqueue.h>
 #include <sys/counter.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip.h>
 #include <machine/in_cksum.h>
 #endif
 #ifdef INET
 #include <netinet/ip_var.h>
 #include <netinet/if_ether.h>
 #endif
 
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <crypto/sha1.h>
 
 static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
 
 /*
  * Per-vhid CARP state.  One softc exists for each virtual host id
  * configured on a parent interface; see the design notes further down.
  */
 struct carp_softc {
 	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
 	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
 	struct sockaddr_dl	sc_addr;	/* Our link level address. */
 	struct callout		sc_ad_tmo;	/* Advertising timeout. */
 #ifdef INET
 	struct callout		sc_md_tmo;	/* Master down timeout. */
 #endif
 #ifdef INET6
 	struct callout 		sc_md6_tmo;	/* XXX: Master down timeout. */
 #endif
 	struct mtx		sc_mtx;		/* Softc lock, see CARP_LOCK(). */
 
 	int			sc_vhid;	/* Virtual host id. */
 	int			sc_advskew;	/* Adv. skew, in 1/256 s units. */
 	int			sc_advbase;	/* Adv. base interval, seconds. */
 
 	/*
 	 * sc_naddrs + sc_naddrs6 bounds the walk over sc_ifas in
 	 * CARP_FOREACH_IFA().
 	 * NOTE(review): sc_ifasiz is presumably the allocated size of
 	 * sc_ifas - confirm against the code that grows the array.
 	 */
 	int			sc_naddrs;
 	int			sc_naddrs6;
 	int			sc_ifasiz;
 	enum { INIT = 0, BACKUP, MASTER }	sc_state;
 	int			sc_suppress;
 	/* Send error/success tracking, see carp_send_ad_error(). */
 	int			sc_sendad_errors;
 #define	CARP_SENDAD_MAX_ERRORS	3
 	int			sc_sendad_success;
 #define	CARP_SENDAD_MIN_SUCCESS 3
 
 	/* Non-zero: carp_prepare_ad() picks a fresh random sc_counter. */
 	int			sc_init_counter;
 	uint64_t		sc_counter;	/* Advertisement counter. */
 
 	/* authentication */
 #define	CARP_HMAC_PAD	64
 	unsigned char sc_key[CARP_KEY_LEN];
 	unsigned char sc_pad[CARP_HMAC_PAD];	/* HMAC pad, see carp_hmac_prepare(). */
 	SHA1_CTX sc_sha1;			/* Precomputed inner hash state. */
 
 	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
 	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
 };
 
 /*
  * Per-interface CARP state, reached through ifp->if_carp.  It anchors
  * the list of softcs (one per vhid) configured on that interface.
  */
 struct carp_if {
 #ifdef INET
 	int	cif_naddrs;
 #endif
 #ifdef INET6
 	int	cif_naddrs6;
 #endif
 	TAILQ_HEAD(, carp_softc) cif_vrs;	/* Softcs, linked by sc_list. */
 #ifdef INET
 	struct ip_moptions 	 cif_imo;	/* Passed to ip_output(). */
 #endif
 #ifdef INET6
 	struct ip6_moptions 	 cif_im6o;
 #endif
 	struct ifnet	*cif_ifp;
 	struct mtx	cif_mtx;		/* See CIF_LOCK(). */
 	uint32_t	cif_flags;
 #define	CIF_PROMISC	0x00000001
 };
 
 /*
  * Brief design of carp(4).
  *
  * Any carp-capable ifnet may have a list of carp softcs hanging off
  * its ifp->if_carp pointer. Each softc represents one unique virtual
  * host id, or vhid. The softc has a back pointer to the ifnet. All
  * softcs are joined in a global list, which has quite limited use.
  *
  * Any interface address that takes part in CARP negotiation has a
  * pointer to the softc of its vhid, ifa->ifa_carp. That could be either
  * AF_INET or AF_INET6 address.
  *
  * Although, one can get the softc's backpointer to ifnet and traverse
  * through its ifp->if_addrhead queue to find all interface addresses
  * involved in CARP, we keep a growable array of ifaddr pointers. This
  * allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals that
  * do calls into the network stack, thus avoiding LORs.
  *
  * Locking:
  *
  * Each softc has a lock sc_mtx. It is used to synchronise carp_input_c(),
  * callout-driven events and ioctl()s.
  *
  * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx.
  * To traverse the global list we use the mutex carp_mtx.
  *
  * Known issues with locking:
  *
  * - Sending ad, we put the pointer to the softc in an mtag, and no reference
  *   counting is done on the softc.
  * - On module unload we may race (?) with packet processing thread
  *   dereferencing our function pointers.
  */
 
 /* Accept incoming CARP packets. */
 VNET_DEFINE_STATIC(int, carp_allow) = 1;
 #define	V_carp_allow	VNET(carp_allow)
 
 /* Set DSCP in outgoing CARP packets. */
 VNET_DEFINE_STATIC(int, carp_dscp) = 56;
 #define	V_carp_dscp	VNET(carp_dscp)
 
 /* Preempt slower nodes. */
 VNET_DEFINE_STATIC(int, carp_preempt) = 0;
 #define	V_carp_preempt	VNET(carp_preempt)
 
 /* Log level. */
 VNET_DEFINE_STATIC(int, carp_log) = 1;
 #define	V_carp_log	VNET(carp_log)
 
 /* Global advskew demotion. */
 VNET_DEFINE_STATIC(int, carp_demotion) = 0;
 #define	V_carp_demotion	VNET(carp_demotion)
 
 /* Send error demotion factor. */
 VNET_DEFINE_STATIC(int, carp_senderr_adj) = CARP_MAXSKEW;
 #define	V_carp_senderr_adj	VNET(carp_senderr_adj)
 
 /* Iface down demotion factor. */
 VNET_DEFINE_STATIC(int, carp_ifdown_adj) = CARP_MAXSKEW;
 #define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)
 
 static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS);
 static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS);
 static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_net_inet, IPPROTO_CARP, carp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "CARP");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
     &VNET_NAME(carp_allow), 0, carp_allow_sysctl, "I",
     "Accept incoming CARP packets");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, carp_dscp_sysctl, "I",
     "DSCP value for carp packets");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_log), 0, "CARP log level");
 SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, carp_demote_adj_sysctl, "I",
     "Adjust demotion factor (skew of advskew)");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
 SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
     CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(carp_ifdown_adj), 0,
     "Interface down demotion factor adjustment");
 
 VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
 VNET_PCPUSTAT_SYSINIT(carpstats);
 VNET_PCPUSTAT_SYSUNINIT(carpstats);
 
 #define	CARPSTATS_ADD(name, val)	\
     counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
 	sizeof(uint64_t)], (val))
 #define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)
 
 SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
     carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
 
 #define	CARP_LOCK_INIT(sc)	mtx_init(&(sc)->sc_mtx, "carp_softc",   \
 	NULL, MTX_DEF)
 #define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
 #define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 #define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)
 #define	CIF_LOCK_INIT(cif)	mtx_init(&(cif)->cif_mtx, "carp_if",   \
 	NULL, MTX_DEF)
 #define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
 #define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
 #define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
 #define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)
 #define	CIF_FREE(cif)	do {				\
 		CIF_LOCK(cif);				\
 		if (TAILQ_EMPTY(&(cif)->cif_vrs))	\
 			carp_free_if(cif);		\
 		else					\
 			CIF_UNLOCK(cif);		\
 } while (0)
 
 #define	CARP_LOG(...)	do {				\
 	if (V_carp_log > 0)				\
 		log(LOG_INFO, "carp: " __VA_ARGS__);	\
 } while (0)
 
 #define	CARP_DEBUG(...)	do {				\
 	if (V_carp_log > 1)				\
 		log(LOG_DEBUG, __VA_ARGS__);		\
 } while (0)
 
 /*
  * Iteration helpers.  Each one is a loop header and must be followed by
  * the loop body.  CARP_FOREACH_IFA and IFNET_FOREACH_CARP expand to an
  * assertion statement followed by the loop, so they cannot be used as
  * the unbraced body of an if/else.
  *
  * Every macro argument is parenthesised in the expansion so that an
  * argument which is an expression rather than a plain identifier still
  * expands correctly.
  */
 
 /* Visit the addresses of 'ifp' that have a CARP softc attached. */
 #define	IFNET_FOREACH_IFA(ifp, ifa)					\
 	CK_STAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)	\
 		if ((ifa)->ifa_carp != NULL)
 
 /*
  * Visit the addresses stored in the softc's sc_ifas array; stops at the
  * first NULL slot.  The softc lock must be held.
  */
 #define	CARP_FOREACH_IFA(sc, ifa)					\
 	CARP_LOCK_ASSERT(sc);						\
 	for (int _i = 0;						\
 		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
 		((ifa) = (sc)->sc_ifas[_i]) != NULL;			\
 		++_i)
 
 /*
  * Visit the softcs configured on 'ifp'.  Either the cif mutex or the
  * carp_sx lock (exclusively) must be held.
  */
 #define	IFNET_FOREACH_CARP(ifp, sc)					\
 	KASSERT(mtx_owned(&(ifp)->if_carp->cif_mtx) ||			\
 	    sx_xlocked(&carp_sx), ("cif_vrs not locked"));		\
 	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)
 
 /*
  * Effective advertisement skew: the softc's configured skew plus the
  * global demotion counter, clamped to the range [0, CARP_MAXSKEW].
  */
 #define	DEMOTE_ADVSKEW(sc)					\
     (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
     CARP_MAXSKEW :						\
         (((sc)->sc_advskew + V_carp_demotion < 0) ?		\
         0 : ((sc)->sc_advskew + V_carp_demotion)))
 
 static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
 static struct carp_softc
 		*carp_alloc(struct ifnet *);
 static void	carp_destroy(struct carp_softc *);
 static struct carp_if
 		*carp_alloc_if(struct ifnet *);
 static void	carp_free_if(struct carp_if *);
 static void	carp_set_state(struct carp_softc *, int, const char* reason);
 static void	carp_sc_state(struct carp_softc *);
 static void	carp_setrun(struct carp_softc *, sa_family_t);
 static void	carp_master_down(void *);
 static void	carp_master_down_locked(struct carp_softc *,
     		    const char* reason);
 static void	carp_send_ad(void *);
 static void	carp_send_ad_locked(struct carp_softc *);
 static void	carp_addroute(struct carp_softc *);
 static void	carp_ifa_addroute(struct ifaddr *);
 static void	carp_delroute(struct carp_softc *);
 static void	carp_ifa_delroute(struct ifaddr *);
 static void	carp_send_ad_all(void *, int);
 static void	carp_demote_adj(int, char *);
 
 /* Global list of all softcs (linked by sc_next), traversed under carp_mtx. */
 static LIST_HEAD(, carp_softc) carp_list;
 static struct mtx carp_mtx;
 /* Alternative to CIF_LOCK() for walking an interface's softc list. */
 static struct sx carp_sx;
 /* Deferred "advertise on every master" job, see carp_send_ad_all(). */
 static struct task carp_sendall_task =
     TASK_INITIALIZER(0, carp_send_ad_all, NULL);
 
 /*
  * Precompute the parts of the advertisement HMAC-SHA1 that do not
  * change from packet to packet.
  *
  * On return sc->sc_sha1 holds the inner hash state after absorbing the
  * ipad, version, type, vhid and the softc's addresses, and sc->sc_pad
  * holds the opad.  carp_hmac_generate() finishes the digest from there.
  * The softc lock must be held.
  */
 static void
 carp_hmac_prepare(struct carp_softc *sc)
 {
 	uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
 	uint8_t vhid = sc->sc_vhid & 0xff;
 	struct ifaddr *ifa;
 	int i, found;
 #ifdef INET
 	struct in_addr last, cur, in;
 #endif
 #ifdef INET6
 	struct in6_addr last6, cur6, in6;
 #endif
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Compute ipad from key. */
 	bzero(sc->sc_pad, sizeof(sc->sc_pad));
 	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36;
 
 	/* Precompute first part of inner hash. */
 	SHA1Init(&sc->sc_sha1);
 	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
 	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
 	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
 #ifdef INET
 	/*
 	 * Hash the IPv4 addresses in ascending (host byte order) order.
 	 * Each pass of the outer loop picks the smallest address that is
 	 * strictly greater than the one hashed on the previous pass, so
 	 * the result does not depend on the order of sc_ifas.  Because
 	 * both comparisons are strict, 0.0.0.0 and 255.255.255.255 can
 	 * never be selected.
 	 */
 	cur.s_addr = 0;
 	do {
 		found = 0;
 		last = cur;
 		cur.s_addr = 0xffffffff;
 		CARP_FOREACH_IFA(sc, ifa) {
 			/*
 			 * NOTE(review): the address is read through
 			 * ifatoia() before the family check below, so
 			 * this also happens for non-AF_INET entries;
 			 * the value is only used when the family
 			 * matches.
 			 */
 			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			if (ifa->ifa_addr->sa_family == AF_INET &&
 			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
 			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
 				cur.s_addr = in.s_addr;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
 	} while (found);
 #endif /* INET */
 #ifdef INET6
 	/*
 	 * Same selection scheme for the IPv6 addresses, ordered by
 	 * memcmp().  For scope-embedded addresses the second 16-bit
 	 * word is zeroed before comparing and hashing.
 	 */
 	memset(&cur6, 0, sizeof(cur6));
 	do {
 		found = 0;
 		last6 = cur6;
 		memset(&cur6, 0xff, sizeof(cur6));
 		CARP_FOREACH_IFA(sc, ifa) {
 			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
 			if (IN6_IS_SCOPE_EMBED(&in6))
 				in6.s6_addr16[1] = 0;
 			if (ifa->ifa_addr->sa_family == AF_INET6 &&
 			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
 			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
 				cur6 = in6;
 				found++;
 			}
 		}
 		if (found)
 			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
 	} while (found);
 #endif /* INET6 */
 
 	/* convert ipad to opad */
 	for (i = 0; i < sizeof(sc->sc_pad); i++)
 		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
 }
 
 /*
  * Compute the HMAC-SHA1 digest of an advertisement into 'md' (20 bytes).
  *
  * Starts from the inner hash state saved in sc->sc_sha1, absorbs the
  * 64-bit 'counter', then runs the outer hash over sc->sc_pad and the
  * inner digest.  The softc lock must be held.
  */
 static void
 carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	SHA1_CTX ctx;
 
 	CARP_LOCK_ASSERT(sc);
 
 	/* Inner hash: work on a copy so the saved state stays reusable. */
 	ctx = sc->sc_sha1;
 	SHA1Update(&ctx, (void *)counter, sizeof(sc->sc_counter));
 	SHA1Final(md, &ctx);
 
 	/* Outer hash: pad followed by the inner digest. */
 	SHA1Init(&ctx);
 	SHA1Update(&ctx, sc->sc_pad, sizeof(sc->sc_pad));
 	SHA1Update(&ctx, md, 20);
 	SHA1Final(md, &ctx);
 }
 
 /*
  * Check the HMAC-SHA1 digest carried in a received advertisement.
  *
  * Recomputes the digest for 'counter' with carp_hmac_generate() and
  * compares it with the 20-byte 'md'.  Returns 0 when the digests match
  * and non-zero otherwise.  The softc lock must be held.
  *
  * The comparison is done with timingsafe_bcmp() rather than bcmp() so
  * that the time it takes does not depend on the position of the first
  * differing byte of the digest.
  */
 static int
 carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
     unsigned char md[20])
 {
 	unsigned char md2[20];
 
 	CARP_LOCK_ASSERT(sc);
 
 	carp_hmac_generate(sc, counter, md2);
 
 	return (timingsafe_bcmp(md, md2, sizeof(md2)));
 }
 
 /*
  * process input packet.
  * we have rearranged checks order compared to the rfc,
  * but it seems more efficient this way or not possible otherwise.
  */
 #ifdef INET
 /*
  * IPv4 protocol input routine for CARP.
  *
  * Takes ownership of the mbuf in *mp (and clears *mp), validates the
  * TTL, the length and the CARP checksum, and passes the packet on to
  * carp_input_c().  Every failed check frees the mbuf and bumps the
  * matching carpstats counter.  Always returns IPPROTO_DONE.
  */
 static int
 carp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct carp_header *ch;
 	int iplen, len;
 
 	iplen = *offp;
 	*mp = NULL;
 
 	CARPSTATS_INC(carps_ipackets);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255.  */
 	if (ip->ip_ttl != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip->ip_ttl,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* From here on the header length comes from the IP header itself. */
 	iplen = ip->ip_hl << 2;
 
 	if (m->m_pkthdr.len < iplen + sizeof(*ch)) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
 		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Make the IP and CARP headers contiguous when the first mbuf
 	 * is too short to hold both.  (The test used to be inverted and
 	 * only pulled up when the data already was contiguous.)
 	 */
 	if (m->m_len < iplen + sizeof(*ch)) {
 		if ((m = m_pullup(m, iplen + sizeof(*ch))) == NULL) {
 			CARPSTATS_INC(carps_hdrops);
 			CARP_DEBUG("%s: pullup failed\n", __func__);
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/*
 	 * verify that the received packet length is
 	 * equal to the CARP header
 	 */
 	len = iplen + sizeof(*ch);
 	if (len > m->m_pkthdr.len) {
 		CARPSTATS_INC(carps_badlen);
 		CARP_DEBUG("%s: packet too short %d on %s\n", __func__,
 		    m->m_pkthdr.len,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* m_pullup() may hand back a different mbuf: reload the pointers. */
 	if ((m = m_pullup(m, len)) == NULL) {
 		CARPSTATS_INC(carps_hdrops);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	ch = (struct carp_header *)((char *)ip + iplen);
 
 	/*
 	 * verify the CARP checksum; m_data is temporarily advanced past
 	 * the IP header so that in_cksum() covers the CARP header only.
 	 */
 	m->m_data += iplen;
 	if (in_cksum(m, len - iplen)) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= iplen;
 
 	carp_input_c(m, ch, AF_INET);
 	return (IPPROTO_DONE);
 }
 #endif
 
 #ifdef INET6
 /*
  * IPv6 protocol input routine for CARP.
  *
  * Validates the receiving interface, the hop limit, the length and the
  * CARP checksum of the packet in *mp, then passes it on to
  * carp_input_c().  '*offp' is the offset of the CARP header within the
  * mbuf data.  Every failed check frees the mbuf (m_pullup() does so
  * itself on failure) and bumps a carpstats counter.  Always returns
  * IPPROTO_DONE.
  */
 static int
 carp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct carp_header *ch;
 	u_int len;
 
 	CARPSTATS_INC(carps_ipackets6);
 
 	if (!V_carp_allow) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* check if received on a valid carp interface */
 	if (m->m_pkthdr.rcvif->if_carp == NULL) {
 		CARPSTATS_INC(carps_badif);
 		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
 		    __func__, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that the IP TTL is 255 */
 	if (ip6->ip6_hlim != CARP_DFLTTL) {
 		CARPSTATS_INC(carps_badttl);
 		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
 		    ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/* verify that we have a complete carp packet */
 	if (m->m_len < *offp + sizeof(*ch)) {
 		/* Remember the length for the log: m is gone on failure. */
 		len = m->m_len;
 		m = m_pullup(m, *offp + sizeof(*ch));
 		if (m == NULL) {
 			CARPSTATS_INC(carps_badlen);
 			CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
 			return (IPPROTO_DONE);
 		}
 	}
 	ch = (struct carp_header *)(mtod(m, char *) + *offp);
 
 	/*
 	 * verify the CARP checksum; m_data is temporarily advanced to
 	 * the CARP header so that in_cksum() covers only that header.
 	 */
 	m->m_data += *offp;
 	if (in_cksum(m, sizeof(*ch))) {
 		CARPSTATS_INC(carps_badsum);
 		CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
 		    m->m_pkthdr.rcvif->if_xname);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	m->m_data -= *offp;
 
 	carp_input_c(m, ch, AF_INET6);
 	return (IPPROTO_DONE);
 }
 #endif /* INET6 */
 
 /*
  * This routine should not be necessary at all, but some switches
  * (VMWare ESX vswitches) can echo our own packets back at us,
  * and we must ignore them or they will cause us to drop out of
  * MASTER mode.
  *
  * We cannot catch all cases of network loops.  Instead, what we
  * do here is catch any packet that arrives with a carp header
  * with a VHID of 0, that comes from an address that is our own.
  * These packets are by definition "from us" (even if they are from
  * a misconfigured host that is pretending to be us).
  *
  * The VHID test is outside this mini-function.
  */
 /*
  * Return non-zero when the source address of the packet in 'm' equals
  * the address of 'ifa', for address family 'af'.  The IP or IPv6
  * header is expected at the start of the mbuf data.  Families that are
  * not compiled in, or not known, yield 0.
  */
 static int
 carp_source_is_self(struct mbuf *m, struct ifaddr *ifa, sa_family_t af)
 {
 
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		const struct ip *iph = mtod(m, struct ip *);
 
 		return (ifatoia(ifa)->ia_addr.sin_addr.s_addr ==
 		    iph->ip_src.s_addr);
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		const struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
 		struct in6_addr own = ifatoia6(ifa)->ia_addr.sin6_addr;
 
 		return (memcmp(&own, &ip6h->ip6_src, sizeof(own)) == 0);
 	}
 #endif
 	default:
 		break;
 	}
 	return (0);
 }
 
 /*
  * Family-independent part of CARP input processing.
  *
  * 'm' is the received packet, 'ch' points at the CARP header inside it
  * and 'af' is the address family it arrived on.  The function finds
  * the softc for the advertised vhid on the receiving interface,
  * authenticates the advertisement and then drives the MASTER/BACKUP
  * state machine from the advertised timing.  The mbuf is always freed
  * before returning.  Must be called inside the network epoch.
  */
 static void
 carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
 {
 	struct ifnet *ifp = m->m_pkthdr.rcvif;
 	struct ifaddr *ifa, *match;
 	struct carp_softc *sc;
 	uint64_t tmp_counter;
 	struct timeval sc_tv, ch_tv;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * Verify that the VHID is valid on the receiving interface.
 	 *
 	 * There should be just one match.  If there are none
 	 * the VHID is not valid and we drop the packet.  If
 	 * there are multiple VHID matches, take just the first
 	 * one, for compatibility with previous code.  While we're
 	 * scanning, check for obvious loops in the network topology
 	 * (these should never happen, and as noted above, we may
 	 * miss real loops; this is just a double-check).
 	 */
 	error = 0;
 	match = NULL;
 	IFNET_FOREACH_IFA(ifp, ifa) {
 		if (match == NULL && ifa->ifa_carp != NULL &&
 		    ifa->ifa_addr->sa_family == af &&
 		    ifa->ifa_carp->sc_vhid == ch->carp_vhid)
 			match = ifa;
 		if (ch->carp_vhid == 0 && carp_source_is_self(m, ifa, af))
 			error = ELOOP;
 	}
 	/* A detected loop overrides any match. */
 	ifa = error ? NULL : match;
 	if (ifa != NULL)
 		ifa_ref(ifa);
 
 	if (ifa == NULL) {
 		if (error == ELOOP) {
 			CARP_DEBUG("dropping looped packet on interface %s\n",
 			    ifp->if_xname);
 			CARPSTATS_INC(carps_badif);	/* ??? */
 		} else {
 			CARPSTATS_INC(carps_badvhid);
 		}
 		m_freem(m);
 		return;
 	}
 
 	/* verify the CARP version. */
 	if (ch->carp_version != CARP_VERSION) {
 		CARPSTATS_INC(carps_badver);
 		CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
 		    ch->carp_version);
 		ifa_free(ifa);
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * The ifaddr reference is dropped as soon as the softc lock is
 	 * held; from here on only the softc is used.
 	 */
 	sc = ifa->ifa_carp;
 	CARP_LOCK(sc);
 	ifa_free(ifa);
 
 	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
 		CARPSTATS_INC(carps_badauth);
 		CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
 		    sc->sc_vhid, ifp->if_xname);
 		goto out;
 	}
 
 	/* Reassemble the 64-bit counter from its two network-order halves. */
 	tmp_counter = ntohl(ch->carp_counter[0]);
 	tmp_counter = tmp_counter<<32;
 	tmp_counter += ntohl(ch->carp_counter[1]);
 
 	/* XXX Replay protection goes here */
 
 	sc->sc_init_counter = 0;
 	sc->sc_counter = tmp_counter;
 
 	/*
 	 * Express our own and the sender's advertisement timing as
 	 * timevals: advbase is whole seconds, advskew is in 1/256 s.
 	 */
 	sc_tv.tv_sec = sc->sc_advbase;
 	sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
 	ch_tv.tv_sec = ch->carp_advbase;
 	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
 
 	switch (sc->sc_state) {
 	case INIT:
 		break;
 	case MASTER:
 		/*
 		 * If we receive an advertisement from a master who's going to
 		 * be more frequent than us, go into BACKUP state.
 		 * An equal interval also makes us step down.
 		 */
 		if (timevalcmp(&sc_tv, &ch_tv, >) ||
 		    timevalcmp(&sc_tv, &ch_tv, ==)) {
 			callout_stop(&sc->sc_ad_tmo);
 			carp_set_state(sc, BACKUP,
 			    "more frequent advertisement received");
 			carp_setrun(sc, 0);
 			carp_delroute(sc);
 		}
 		break;
 	case BACKUP:
 		/*
 		 * If we're pre-empting masters who advertise slower than us,
 		 * and this one claims to be slower, treat him as down.
 		 */
 		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
 			carp_master_down_locked(sc,
 			    "preempting a slower master");
 			break;
 		}
 
 		/*
 		 * If the master is going to advertise at such a low
 		 * frequency that he's guaranteed to time out, we might
 		 * as well just treat him as timed out now.
 		 */
 		sc_tv.tv_sec = sc->sc_advbase * 3;
 		if (timevalcmp(&sc_tv, &ch_tv, <)) {
 			carp_master_down_locked(sc, "master will time out");
 			break;
 		}
 
 		/*
 		 * Otherwise, we reset the counter and wait for the next
 		 * advertisement.
 		 */
 		carp_setrun(sc, af);
 		break;
 	}
 
 out:
 	CARP_UNLOCK(sc);
 	m_freem(m);
 }
 
 static int
 carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
 {
 	struct m_tag *mtag;
 
 	if (sc->sc_init_counter) {
 		/* this could also be seconds since unix epoch */
 		sc->sc_counter = arc4random();
 		sc->sc_counter = sc->sc_counter << 32;
 		sc->sc_counter += arc4random();
 	} else
 		sc->sc_counter++;
 
 	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
 	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
 
 	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
 
 	/* Tag packet for carp_output */
 	if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
 	    M_NOWAIT)) == NULL) {
 		m_freem(m);
 		CARPSTATS_INC(carps_onomem);
 		return (ENOMEM);
 	}
 	bcopy(&sc, mtag + 1, sizeof(sc));
 	m_tag_prepend(m, mtag);
 
 	return (0);
 }
 
 /*
  * To avoid LORs and possible recursions this function shouldn't
  * be called directly, but scheduled via taskqueue.
  */
 static void
 carp_send_ad_all(void *ctx __unused, int pending __unused)
 {
 	struct epoch_tracker et;
 	struct carp_softc *vr;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&carp_mtx);
 	LIST_FOREACH(vr, &carp_list, sc_next) {
 		/* Only softcs in MASTER state send advertisements. */
 		if (vr->sc_state != MASTER)
 			continue;
 
 		CARP_LOCK(vr);
 		CURVNET_SET(vr->sc_carpdev->if_vnet);
 		carp_send_ad_locked(vr);
 		CURVNET_RESTORE();
 		CARP_UNLOCK(vr);
 	}
 	mtx_unlock(&carp_mtx);
 	NET_EPOCH_EXIT(et);
 }
 
 /* Send a periodic advertisement, executed in callout context. */
 /*
  * Callout handler: 'v' is the softc.
  *
  * The softc lock is expected to be held on entry (it is only asserted,
  * never taken here) and is released before returning.  The send itself
  * runs inside the network epoch with the vnet of the parent interface
  * selected.
  */
 static void
 carp_send_ad(void *v)
 {
 	struct carp_softc *sc = v;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	CARP_LOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	carp_send_ad_locked(sc);
 	CURVNET_RESTORE();
 	CARP_UNLOCK(sc);
 	NET_EPOCH_EXIT(et);
 }
 
 static void
 carp_send_ad_error(struct carp_softc *sc, int error)
 {
 
 	/*
 	 * We track errors and successfull sends with this logic:
 	 * - Any error resets success counter to 0.
 	 * - MAX_ERRORS triggers demotion.
 	 * - MIN_SUCCESS successes resets error counter to 0.
 	 * - MIN_SUCCESS reverts demotion, if it was triggered before.
 	 */
 	if (error) {
 		if (sc->sc_sendad_errors < INT_MAX)
 			sc->sc_sendad_errors++;
 		if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
 			static const char fmt[] = "send error %d on %s";
 			char msg[sizeof(fmt) + IFNAMSIZ];
 
 			sprintf(msg, fmt, error, sc->sc_carpdev->if_xname);
 			carp_demote_adj(V_carp_senderr_adj, msg);
 		}
 		sc->sc_sendad_success = 0;
 	} else if (sc->sc_sendad_errors > 0) {
 		if (++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
 			if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS) {
 				static const char fmt[] = "send ok on %s";
 				char msg[sizeof(fmt) + IFNAMSIZ];
 
 				sprintf(msg, fmt, sc->sc_carpdev->if_xname);
 				carp_demote_adj(-V_carp_senderr_adj, msg);
 			}
 			sc->sc_sendad_errors = 0;
 		}
 	}
 }
 
 /*
  * Pick the best ifaddr on the given ifp for sending CARP
  * advertisements.
  *
  * "Best" here is defined by ifa_preferred().  This function is very
  * much like ifaof_ifpforaddr() except that we just use ifa_preferred().
  *
  * (This could be simplified to return the actual address, except that
  * it has a different format in AF_INET and AF_INET6.)
  */
 static struct ifaddr *
 carp_best_ifa(int af, struct ifnet *ifp)
 {
 	struct ifaddr *cand, *chosen = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	if (af >= AF_MAX)
 		return (NULL);
 
 	CK_STAILQ_FOREACH(cand, &ifp->if_addrhead, ifa_link) {
 		/* Only addresses of the requested family compete. */
 		if (cand->ifa_addr->sa_family != af)
 			continue;
 		if (chosen == NULL || ifa_preferred(chosen, cand))
 			chosen = cand;
 	}
 
 	/* The caller gets a reference and must ifa_free() the result. */
 	if (chosen != NULL)
 		ifa_ref(chosen);
 	return (chosen);
 }
 
 /*
  * Build and transmit one CARP advertisement for sc on each address family
  * that has addresses configured (sc_naddrs for IPv4, sc_naddrs6 for IPv6),
  * then re-arm the advertisement callout.
  *
  * Must be called inside the network epoch with the softc lock held (both
  * are asserted).  Every exit path, including the failure paths, goes
  * through "resched" so the next advertisement is always scheduled.
  */
 static void
 carp_send_ad_locked(struct carp_softc *sc)
 {
 	struct carp_header ch;
 	struct timeval tv;
 	struct ifaddr *ifa;
 	struct carp_header *ch_ptr;
 	struct mbuf *m;
 	int len, advskew;
 
 	NET_EPOCH_ASSERT();
 	CARP_LOCK_ASSERT(sc);
 
 	/* Interval until the next advertisement: advbase + advskew/256 s. */
 	advskew = DEMOTE_ADVSKEW(sc);
 	tv.tv_sec = sc->sc_advbase;
 	tv.tv_usec = advskew * 1000000 / 256;
 
 	/* Header template shared by the IPv4 and IPv6 packets below. */
 	ch.carp_version = CARP_VERSION;
 	ch.carp_type = CARP_ADVERTISEMENT;
 	ch.carp_vhid = sc->sc_vhid;
 	ch.carp_advbase = sc->sc_advbase;
 	ch.carp_advskew = advskew;
 	ch.carp_authlen = 7;	/* XXX DEFINE */
 	ch.carp_pad1 = 0;	/* must be zero */
 	ch.carp_cksum = 0;
 
 	/* XXXGL: OpenBSD picks first ifaddr with needed family. */
 
 #ifdef INET
 	if (sc->sc_naddrs) {
 		struct ip *ip;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		M_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip = mtod(m, struct ip *);
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET;
 		ip->ip_len = htons(len);
 		ip->ip_off = htons(IP_DF);
 		ip->ip_ttl = CARP_DFLTTL;
 		ip->ip_p = IPPROTO_CARP;
 		ip->ip_sum = 0;
 		ip_fillid(ip);
 
 		/* Source is the preferred IPv4 address, or 0 if none. */
 		ifa = carp_best_ifa(AF_INET, sc->sc_carpdev);
 		if (ifa != NULL) {
 			ip->ip_src.s_addr =
 			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
 			ifa_free(ifa);
 		} else
 			ip->ip_src.s_addr = 0;
 		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
 
 		/* The CARP header sits directly after the IP header. */
 		ch_ptr = (struct carp_header *)(&ip[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		/*
 		 * NOTE(review): m is not freed here on failure; presumably
 		 * carp_prepare_ad() releases it -- confirm against its
 		 * definition.
 		 */
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		/*
 		 * Temporarily step m_data past the IP header so that the
 		 * checksum covers only the CARP header, then step back.
 		 */
 		m->m_data += sizeof(*ip);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
 		m->m_data -= sizeof(*ip);
 
 		CARPSTATS_INC(carps_opackets);
 
 		/* The output result feeds the send error/success counters. */
 		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
 		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
 	}
 #endif /* INET */
 #ifdef INET6
 	if (sc->sc_naddrs6) {
 		struct ip6_hdr *ip6;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			CARPSTATS_INC(carps_onomem);
 			goto resched;
 		}
 		len = sizeof(*ip6) + sizeof(ch);
 		m->m_pkthdr.len = len;
 		m->m_pkthdr.rcvif = NULL;
 		m->m_len = len;
 		M_ALIGN(m, m->m_len);
 		m->m_flags |= M_MCAST;
 		ip6 = mtod(m, struct ip6_hdr *);
 		bzero(ip6, sizeof(*ip6));
 		ip6->ip6_vfc |= IPV6_VERSION;
 		/* Traffic class isn't defined in ip6 struct instead
 		 * it gets offset into flowid field */
 		ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN +
 		    IPTOS_DSCP_OFFSET));
 		ip6->ip6_hlim = CARP_DFLTTL;
 		ip6->ip6_nxt = IPPROTO_CARP;
 
 		/* set the source address */
 		ifa = carp_best_ifa(AF_INET6, sc->sc_carpdev);
 		if (ifa != NULL) {
 			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
 			    sizeof(struct in6_addr));
 			ifa_free(ifa);
 		} else
 			/* This should never happen with IPv6. */
 			bzero(&ip6->ip6_src, sizeof(struct in6_addr));
 
 		/* Set the multicast destination. */
 		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
 		ip6->ip6_dst.s6_addr8[15] = 0x12;
 		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
 			m_freem(m);
 			CARP_DEBUG("%s: in6_setscope failed\n", __func__);
 			goto resched;
 		}
 
 		/* The CARP header sits directly after the IPv6 header. */
 		ch_ptr = (struct carp_header *)(&ip6[1]);
 		bcopy(&ch, ch_ptr, sizeof(ch));
 		/*
 		 * NOTE(review): as in the IPv4 path, m is presumably
 		 * released by carp_prepare_ad() on failure -- confirm.
 		 */
 		if (carp_prepare_ad(m, sc, ch_ptr))
 			goto resched;
 
 		/* Checksum only the CARP header, as in the IPv4 path. */
 		m->m_data += sizeof(*ip6);
 		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
 		m->m_data -= sizeof(*ip6);
 
 		CARPSTATS_INC(carps_opackets6);
 
 		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
 		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
 	}
 #endif /* INET6 */
 
 resched:
 	/* Schedule the next advertisement after the interval computed above. */
 	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
 }
 
 static void
 carp_addroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_addroute(ifa);
 }
 
 /*
  * Add the routing state for a single address: for IPv4 the prefix and
  * the loopback route, for IPv6 the loopback route and the nd6 entry.
  * Addresses of any other family are ignored.
  */
 static void
 carp_ifa_addroute(struct ifaddr *ifa)
 {
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET: {
 		struct in_ifaddr *ia4 = ifatoia(ifa);
 
 		in_addprefix(ia4);
 		ifa_add_loopback_route(ifa, (struct sockaddr *)&ia4->ia_addr);
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct in6_ifaddr *ia6 = ifatoia6(ifa);
 
 		ifa_add_loopback_route(ifa, (struct sockaddr *)&ia6->ia_addr);
 		nd6_add_ifa_lle(ia6);
 		break;
 	}
 #endif
 	}
 }
 
 static void
 carp_delroute(struct carp_softc *sc)
 {
 	struct ifaddr *ifa;
 
 	CARP_FOREACH_IFA(sc, ifa)
 		carp_ifa_delroute(ifa);
 }
 
 /*
  * Remove the routing state for a single address: the loopback route plus
  * the IPv4 prefix or the IPv6 nd6 entry.  Addresses of any other family
  * are ignored.
  */
 static void
 carp_ifa_delroute(struct ifaddr *ifa)
 {
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET: {
 		struct in_ifaddr *ia4 = ifatoia(ifa);
 
 		ifa_del_loopback_route(ifa, (struct sockaddr *)&ia4->ia_addr);
 		in_scrubprefix(ia4, LLE_STATIC);
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct in6_ifaddr *ia6 = ifatoia6(ifa);
 
 		ifa_del_loopback_route(ifa, (struct sockaddr *)&ia6->ia_addr);
 		nd6_rem_ifa_lle(ia6, 1);
 		break;
 	}
 #endif
 	}
 }
 
 /*
  * Return non-zero when the vhid that ifa is attached to is in MASTER
  * state.  ifa->ifa_carp is dereferenced without a NULL check.
  */
 int
 carp_master(struct ifaddr *ifa)
 {
 
 	return (ifa->ifa_carp->sc_state == MASTER);
 }
 
 #ifdef INET
 /*
  * For every IPv4 address associated with the virtual router, broadcast
  * a gratuitous ARP request that carries the virtual router MAC address.
  */
 static void
 carp_send_arp(struct carp_softc *sc)
 {
 	struct ifaddr *cur;
 
 	NET_EPOCH_ASSERT();
 
 	CARP_FOREACH_IFA(sc, cur) {
 		struct in_addr vaddr;
 
 		if (cur->ifa_addr->sa_family == AF_INET) {
 			vaddr = ((struct sockaddr_in *)cur->ifa_addr)->sin_addr;
 			arp_announce_ifaddr(sc->sc_carpdev, vaddr,
 			    LLADDR(&sc->sc_addr));
 		}
 	}
 }
 
 /*
  * If the vhid owning ifa is MASTER, store its link-level address in
  * *enaddr and return 1.  Otherwise return 0 and leave *enaddr alone.
  */
 int
 carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
 {
 	struct carp_softc *sc = ifa->ifa_carp;
 
 	if (sc->sc_state != MASTER)
 		return (0);
 
 	*enaddr = LLADDR(&sc->sc_addr);
 	return (1);
 }
 #endif
 
 #ifdef INET6
 /*
  * Send a neighbor advertisement with the override flag to the link-local
  * all-nodes group for every IPv6 address attached to sc, pausing briefly
  * after each one.
  */
 static void
 carp_send_na(struct carp_softc *sc)
 {
 	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 	struct ifaddr *cur;
 
 	CARP_FOREACH_IFA(sc, cur) {
 		if (cur->ifa_addr->sa_family == AF_INET6) {
 			nd6_na_output(sc->sc_carpdev, &mcast, IFA_IN6(cur),
 			    ND_NA_FLAG_OVERRIDE, 1, NULL);
 			DELAY(1000);	/* XXX */
 		}
 	}
 }
 
 /*
  * Look up taddr among the IPv6 addresses of ifp.
  *
  * The first matching address decides the result: it is returned, with a
  * reference taken, when it is not a carp address or when its vhid is
  * MASTER; a matching carp address in any other state yields NULL.  NULL
  * is also returned when no address matches.
  */
 struct ifaddr *
 carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
 {
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6 ||
 		    !IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
 			continue;
 
 		/* Only the first match is ever considered. */
 		if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
 			return (NULL);
 
 		ifa_ref(ifa);
 		return (ifa);
 	}
 
 	return (NULL);
 }
 
 char *
 carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
 {
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	IFNET_FOREACH_IFA(ifp, ifa)
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
 			struct carp_softc *sc = ifa->ifa_carp;
 			struct m_tag *mtag;
 
 			mtag = m_tag_get(PACKET_TAG_CARP,
 			    sizeof(struct carp_softc *), M_NOWAIT);
 			if (mtag == NULL)
 				/* Better a bit than nothing. */
 				return (LLADDR(&sc->sc_addr));
 
 			bcopy(&sc, mtag + 1, sizeof(sc));
 			m_tag_prepend(m, mtag);
 
 			return (LLADDR(&sc->sc_addr));
 		}
 
 	return (NULL);
 }
 #endif /* INET6 */
 
 /*
  * Return 1 when dhost is the link-level address of a vhid on ifp that is
  * currently MASTER, 0 otherwise.  Addresses that do not start with the
  * bytes 00:00:5e:00:01 are rejected before the list is locked.
  */
 int
 carp_forus(struct ifnet *ifp, u_char *dhost)
 {
 	struct carp_softc *sc;
 	uint8_t *ena = dhost;
 	int mine = 0;
 
 	if (ena[0] != 0 || ena[1] != 0 || ena[2] != 0x5e || ena[3] != 0 ||
 	    ena[4] != 1)
 		return (0);
 
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, sc) {
 		/*
 		 * CARP_LOCK() is not here, since would protect nothing, but
 		 * cause deadlock with if_bridge, calling this under its lock.
 		 */
 		if (sc->sc_state != MASTER)
 			continue;
 		if (bcmp(dhost, LLADDR(&sc->sc_addr), ETHER_ADDR_LEN) == 0) {
 			mine = 1;
 			break;
 		}
 	}
 	CIF_UNLOCK(ifp->if_carp);
 
 	return (mine);
 }
 
 /*
  * Master down timeout event, executed in callout context.
  *
  * v is the struct carp_softc the callout was armed for.  The softc lock
  * is asserted to be held on entry and is released here before returning.
  * The actual transition is attempted only if the softc is still in
  * BACKUP state.
  */
 static void
 carp_master_down(void *v)
 {
 	struct carp_softc *sc = v;
 	struct epoch_tracker et;
 
 	NET_EPOCH_ENTER(et);
 	CARP_LOCK_ASSERT(sc);
 
 	/* Run in the vnet of the interface this vhid lives on. */
 	CURVNET_SET(sc->sc_carpdev->if_vnet);
 	if (sc->sc_state == BACKUP) {
 		carp_master_down_locked(sc, "master timed out");
 	}
 	CURVNET_RESTORE();
 
 	CARP_UNLOCK(sc);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Promote sc from BACKUP to MASTER.
  *
  * sc     - virtual host to promote; its lock must be held.
  * reason - text passed on to carp_set_state() for the state change.
  *
  * Must be called inside the network epoch.  In BACKUP state the softc is
  * switched to MASTER, an advertisement is sent, the addresses are
  * announced (ARP / neighbor advertisement), the timers are re-armed via
  * carp_setrun() and the routes are added.  In INIT or MASTER state nothing
  * is done, except that kernels built with INVARIANTS panic.
  */
 static void
 carp_master_down_locked(struct carp_softc *sc, const char *reason)
 {
 
 	NET_EPOCH_ASSERT();
 	CARP_LOCK_ASSERT(sc);
 
 	switch (sc->sc_state) {
 	case BACKUP:
 		carp_set_state(sc, MASTER, reason);
 		carp_send_ad_locked(sc);
 #ifdef INET
 		carp_send_arp(sc);
 #endif
 #ifdef INET6
 		carp_send_na(sc);
 #endif
 		carp_setrun(sc, 0);
 		carp_addroute(sc);
 		break;
 	case INIT:
 	case MASTER:
 #ifdef INVARIANTS
 		/*
 		 * NOTE(review): the ternary below assumes INIT is the zero
 		 * state value -- confirm against the state definitions.
 		 */
 		panic("carp: VHID %u@%s: master_down event in %s state\n",
 		    sc->sc_vhid,
 		    sc->sc_carpdev->if_xname,
 		    sc->sc_state ? "MASTER" : "INIT");
 #endif
 		break;
 	}
 }
 
 /*
  * (Re)arm the timers that match the current state of sc.
  *
  * When in backup state, af indicates whether to reset the master down timer
  * for v4 or v6. If it's set to zero, reset the ones which are already pending.
  *
  * Nothing is armed unless the interface is up, its link is up, at least
  * one address is attached and V_carp_allow is set.  The softc lock must
  * be held.
  */
 static void
 carp_setrun(struct carp_softc *sc, sa_family_t af)
 {
 	struct timeval tv;
 
 	CARP_LOCK_ASSERT(sc);
 
 	if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
 	    sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) ||
 	    !V_carp_allow)
 		return;
 
 	switch (sc->sc_state) {
 	case INIT:
 		/* Move to BACKUP, then recurse once to arm the BACKUP timers. */
 		carp_set_state(sc, BACKUP, "initialization complete");
 		carp_setrun(sc, 0);
 		break;
 	case BACKUP:
 		callout_stop(&sc->sc_ad_tmo);
 		/* Master down timeout: 3 * advbase + advskew/256 seconds. */
 		tv.tv_sec = 3 * sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 			    carp_master_down, sc);
 			break;
 #endif
 		default:
 			/* No family given: arm one timer per family in use. */
 #ifdef INET
 			if (sc->sc_naddrs)
 				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 #ifdef INET6
 			if (sc->sc_naddrs6)
 				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
 				    carp_master_down, sc);
 #endif
 			break;
 		}
 		break;
 	case MASTER:
 		/* Advertisement interval: advbase + advskew/256 seconds. */
 		tv.tv_sec = sc->sc_advbase;
 		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
 		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
 		    carp_send_ad, sc);
 		break;
 	}
 }
 
 /*
  * Setup multicast structures.
  *
  * cif - per-interface carp state whose multicast options are filled in.
  * sa  - address family (AF_INET or AF_INET6) to set up; any other value
  *       is a no-op.
  *
  * Returns 0 on success, including when the filter list for that family
  * is already populated, or the error from in_joingroup(), in6_setscope()
  * or in6_joingroup().  On failure nothing stays joined and all filters
  * allocated here are freed.
  */
 static int
 carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 	int error = 0;
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET:
 	    {
 		struct ip_moptions *imo = &cif->cif_imo;
 		struct in_mfilter *imf;
 		struct in_addr addr;
 
 		/* Already set up by an earlier address of this family. */
 		if (ip_mfilter_first(&imo->imo_head) != NULL)
 			return (0);
 
 		imf = ip_mfilter_alloc(M_WAITOK, 0, 0);
 		ip_mfilter_init(&imo->imo_head);
 		imo->imo_multicast_vif = -1;
 
 		addr.s_addr = htonl(INADDR_CARP_GROUP);
 		if ((error = in_joingroup(ifp, &addr, NULL,
 		    &imf->imf_inm)) != 0) {
 			ip_mfilter_free(imf);
 			break;
 		}
 
 		ip_mfilter_insert(&imo->imo_head, imf);
 		imo->imo_multicast_ifp = ifp;
 		imo->imo_multicast_ttl = CARP_DFLTTL;
 		imo->imo_multicast_loop = 0;
 		break;
 	   }
 #endif
 #ifdef INET6
 	case AF_INET6:
 	    {
 		struct ip6_moptions *im6o = &cif->cif_im6o;
 		struct in6_mfilter *im6f[2];
 		struct in6_addr in6;
 
 		/* Already set up by an earlier address of this family. */
 		if (ip6_mfilter_first(&im6o->im6o_head))
 			return (0);
 
 		im6f[0] = ip6_mfilter_alloc(M_WAITOK, 0, 0);
 		im6f[1] = ip6_mfilter_alloc(M_WAITOK, 0, 0);
 
 		ip6_mfilter_init(&im6o->im6o_head);
 		im6o->im6o_multicast_hlim = CARP_DFLTTL;
 		im6o->im6o_multicast_ifp = ifp;
 
 		/* Join IPv6 CARP multicast group. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr8[15] = 0x12;
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
 			ip6_mfilter_free(im6f[0]);
 			ip6_mfilter_free(im6f[1]);
 			break;
 		}
 		if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[0]->im6f_in6m, 0)) != 0) {
 			ip6_mfilter_free(im6f[0]);
 			ip6_mfilter_free(im6f[1]);
 			break;
 		}
 
 		/* Join solicited multicast address. */
 		bzero(&in6, sizeof(in6));
 		in6.s6_addr16[0] = htons(0xff02);
 		in6.s6_addr32[1] = 0;
 		in6.s6_addr32[2] = htonl(1);
 		in6.s6_addr32[3] = 0;
 		in6.s6_addr8[12] = 0xff;
 
 		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
 			/*
 			 * The first group is joined by now; leave it so the
 			 * membership is not leaked with the freed filter.
 			 */
 			in6_leavegroup(im6f[0]->im6f_in6m, NULL);
 			ip6_mfilter_free(im6f[0]);
 			ip6_mfilter_free(im6f[1]);
 			break;
 		}
 
 		if ((error = in6_joingroup(ifp, &in6, NULL, &im6f[1]->im6f_in6m, 0)) != 0) {
 			in6_leavegroup(im6f[0]->im6f_in6m, NULL);
 			ip6_mfilter_free(im6f[0]);
 			ip6_mfilter_free(im6f[1]);
 			break;
 		}
 		ip6_mfilter_insert(&im6o->im6o_head, im6f[0]);
 		ip6_mfilter_insert(&im6o->im6o_head, im6f[1]);
 		break;
 	    }
 #endif
 	}
 
 	return (error);
 }
 
 /*
  * Free multicast structures.
  *
  * For the given family, once the interface has no carp addresses of that
  * family left, every filter on the list is removed, its group is left and
  * the filter is freed.  carp_sx must be exclusively locked.
  */
 static void
 carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
 {
 
 	sx_assert(&carp_sx, SA_XLOCKED);
 
 	switch (sa) {
 #ifdef INET
 	case AF_INET: {
 		struct ip_moptions *imo = &cif->cif_imo;
 		struct in_mfilter *imf;
 
 		if (cif->cif_naddrs != 0)
 			break;
 
 		for (;;) {
 			imf = ip_mfilter_first(&imo->imo_head);
 			if (imf == NULL)
 				break;
 			ip_mfilter_remove(&imo->imo_head, imf);
 			in_leavegroup(imf->imf_inm, NULL);
 			ip_mfilter_free(imf);
 		}
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct ip6_moptions *im6o = &cif->cif_im6o;
 		struct in6_mfilter *im6f;
 
 		if (cif->cif_naddrs6 != 0)
 			break;
 
 		for (;;) {
 			im6f = ip6_mfilter_first(&im6o->im6o_head);
 			if (im6f == NULL)
 				break;
 			ip6_mfilter_remove(&im6o->im6o_head, im6f);
 			in6_leavegroup(im6f->im6f_in6m, NULL);
 			ip6_mfilter_free(im6f);
 		}
 		break;
 	}
 #endif
 	}
 }
 
 /*
  * Rewrite the Ethernet source address of an outgoing packet that carries
  * a PACKET_TAG_CARP tag to 00:00:5e:00:01:<vhid>.
  *
  * Returns 0 when there is nothing to do (no destination, a family other
  * than AF_INET/AF_INET6, or no tag) and after a successful rewrite, and
  * EOPNOTSUPP when the interface type is not Ethernet, bridge or VLAN.
  */
 int
 carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
 {
 	struct carp_softc *sc;
 	struct ether_header *eh;
 	struct m_tag *tag;
 
 	if (sa == NULL)
 		return (0);
 
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 #endif
 #ifdef INET6
 	case AF_INET6:
 #endif
 		break;
 	default:
 		return (0);
 	}
 
 	tag = m_tag_find(m, PACKET_TAG_CARP, NULL);
 	if (tag == NULL)
 		return (0);
 
 	/* The tag payload is the softc pointer itself. */
 	bcopy(tag + 1, &sc, sizeof(sc));
 
 	if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_BRIDGE &&
 	    ifp->if_type != IFT_L2VLAN) {
 		printf("%s: carp is not supported for the %d interface type\n",
 		    ifp->if_xname, ifp->if_type);
 		return (EOPNOTSUPP);
 	}
 
 	/* Set the source MAC address to the Virtual Router MAC Address. */
 	eh = mtod(m, struct ether_header *);
 	eh->ether_shost[0] = 0;
 	eh->ether_shost[1] = 0;
 	eh->ether_shost[2] = 0x5e;
 	eh->ether_shost[3] = 0;
 	eh->ether_shost[4] = 1;
 	eh->ether_shost[5] = sc->sc_vhid;
 
 	return (0);
 }
 
 /*
  * Allocate and initialise a new softc on ifp and link it into both the
  * per-interface list and the global carp_list.
  *
  * The per-interface carp state is created first if ifp has none yet.
  * Allocations use M_WAITOK.  carp_sx must be exclusively locked.  The
  * softc is returned unlocked, in INIT state, with sc_vhid set to -1.
  */
 static struct carp_softc*
 carp_alloc(struct ifnet *ifp)
 {
 	struct carp_softc *sc;
 	struct carp_if *cif;
 
 	sx_assert(&carp_sx, SA_XLOCKED);
 
 	if ((cif = ifp->if_carp) == NULL)
 		cif = carp_alloc_if(ifp);
 
 	sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
 
 	sc->sc_advbase = CARP_DFLTINTV;
 	sc->sc_vhid = -1;	/* required setting */
 	sc->sc_init_counter = 1;
 	sc->sc_state = INIT;
 
 	/* Start with room for a single address; carp_grow_ifas() doubles it. */
 	sc->sc_ifasiz = sizeof(struct ifaddr *);
 	sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
 	sc->sc_carpdev = ifp;
 
 	CARP_LOCK_INIT(sc);
 	/* All callouts are tied to the softc mutex and return with it unlocked. */
 #ifdef INET
 	callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 #ifdef INET6
 	callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 #endif
 	callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
 
 	CIF_LOCK(cif);
 	TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
 	CIF_UNLOCK(cif);
 
 	mtx_lock(&carp_mtx);
 	LIST_INSERT_HEAD(&carp_list, sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	return (sc);
 }
 
 /*
  * Double the capacity of sc's address array.  The replacement is
  * allocated (zeroed, M_WAITOK) before the softc lock is taken; the old
  * contents are copied over and the old array freed under the lock.
  */
 static void
 carp_grow_ifas(struct carp_softc *sc)
 {
 	struct ifaddr **bigger;
 
 	bigger = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO);
 
 	CARP_LOCK(sc);
 	bcopy(sc->sc_ifas, bigger, sc->sc_ifasiz);
 	free(sc->sc_ifas, M_CARP);
 	sc->sc_ifasiz *= 2;
 	sc->sc_ifas = bigger;
 	CARP_UNLOCK(sc);
 }
 
 /*
  * Tear down and free a softc.
  *
  * carp_sx must be exclusively locked.  The softc lock is released here
  * (the function calls CARP_UNLOCK() without taking the lock itself).  If
  * the softc was suppressed, the "interface down" demotion is reverted
  * first.
  */
 static void
 carp_destroy(struct carp_softc *sc)
 {
 	struct ifnet *ifp = sc->sc_carpdev;
 	struct carp_if *cif = ifp->if_carp;
 
 	sx_assert(&carp_sx, SA_XLOCKED);
 
 	if (sc->sc_suppress)
 		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
 	CARP_UNLOCK(sc);
 
 	/* Unlink from the per-interface list and from the global list. */
 	CIF_LOCK(cif);
 	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
 	CIF_UNLOCK(cif);
 
 	mtx_lock(&carp_mtx);
 	LIST_REMOVE(sc, sc_next);
 	mtx_unlock(&carp_mtx);
 
 	/* Drain every callout before the mutex they use is destroyed. */
 	callout_drain(&sc->sc_ad_tmo);
 #ifdef INET
 	callout_drain(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 	callout_drain(&sc->sc_md6_tmo);
 #endif
 	CARP_LOCK_DESTROY(sc);
 
 	free(sc->sc_ifas, M_CARP);
 	free(sc, M_CARP);
 }
 
 static struct carp_if*
 carp_alloc_if(struct ifnet *ifp)
 {
 	struct carp_if *cif;
 	int error;
 
 	cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
 
 	if ((error = ifpromisc(ifp, 1)) != 0)
 		printf("%s: ifpromisc(%s) failed: %d\n",
 		    __func__, ifp->if_xname, error);
 	else
 		cif->cif_flags |= CIF_PROMISC;
 
 	CIF_LOCK_INIT(cif);
 	cif->cif_ifp = ifp;
 	TAILQ_INIT(&cif->cif_vrs);
 
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = cif;
 	if_ref(ifp);
 	IF_ADDR_WUNLOCK(ifp);
 
 	return (cif);
 }
 
 /*
  * Release the per-interface carp state.
  *
  * The CIF lock is asserted on entry and destroyed here.  The list of
  * softcs must already be empty.  ifp->if_carp is cleared, promiscuous
  * mode is switched off again if this state had enabled it, and the
  * interface reference taken in carp_alloc_if() is dropped.
  */
 static void
 carp_free_if(struct carp_if *cif)
 {
 	struct ifnet *ifp = cif->cif_ifp;
 
 	CIF_LOCK_ASSERT(cif);
 	KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
 	    __func__));
 
 	/* Unpublish before tearing anything down. */
 	IF_ADDR_WLOCK(ifp);
 	ifp->if_carp = NULL;
 	IF_ADDR_WUNLOCK(ifp);
 
 	CIF_LOCK_DESTROY(cif);
 
 	if (cif->cif_flags & CIF_PROMISC)
 		ifpromisc(ifp, 0);
 	if_rele(ifp);
 
 	free(cif, M_CARP);
 }
 
 /*
  * Copy the user-visible settings of sc into carpr under the softc lock.
  * The key is copied only when priv is non-zero; otherwise the key field
  * is zeroed.
  */
 static void
 carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
 {
 
 	CARP_LOCK(sc);
 	carpr->carpr_vhid = sc->sc_vhid;
 	carpr->carpr_state = sc->sc_state;
 	carpr->carpr_advskew = sc->sc_advskew;
 	carpr->carpr_advbase = sc->sc_advbase;
 	if (!priv)
 		bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
 	else
 		bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
 	CARP_UNLOCK(sc);
 }
 
 /*
  * Handle the carp configuration ioctls.
  *
  * ifr - request; ifr_name selects the interface and the data pointer
  *       refers to one (or, when listing, an array of) struct carpreq.
  * cmd - SIOCSVH to create/modify a vhid, SIOCGVH to read one vhid or,
  *       with carpr_vhid == 0, all vhids of the interface.
  * td  - calling thread, used for the privilege checks.
  *
  * Returns 0 or an errno value: ENXIO (no such interface), EOPNOTSUPP
  * (interface type is not Ethernet, VLAN or bridge), EADDRNOTAVAIL
  * (interface is not multicast capable), EINVAL (bad parameter or unknown
  * command), ENOENT (no such vhid), EMSGSIZE (caller's array too small),
  * or the error from copyin()/copyout()/priv_check().
  */
 int
 carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
 {
 	struct carpreq carpr;
 	struct ifnet *ifp;
 	struct carp_softc *sc = NULL;
 	int error = 0, locked = 0;
 
 	if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr)))
 		return (error);
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL)
 		return (ENXIO);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 		break;
 	default:
 		error = EOPNOTSUPP;
 		goto out;
 	}
 
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		error = EADDRNOTAVAIL;
 		goto out;
 	}
 
 	sx_xlock(&carp_sx);
 	switch (cmd) {
 	case SIOCSVH:
 		if ((error = priv_check(td, PRIV_NETINET_CARP)))
 			break;
 		if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
 		    carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
 			error = EINVAL;
 			break;
 		}
 
 		/* Look for an existing vhid; sc stays NULL if there is none. */
 		if (ifp->if_carp) {
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 		}
 		if (sc == NULL) {
 			sc = carp_alloc(ifp);
 			CARP_LOCK(sc);
 			sc->sc_vhid = carpr.carpr_vhid;
 			/* Virtual MAC address: 00:00:5e:00:01:<vhid>. */
 			LLADDR(&sc->sc_addr)[0] = 0;
 			LLADDR(&sc->sc_addr)[1] = 0;
 			LLADDR(&sc->sc_addr)[2] = 0x5e;
 			LLADDR(&sc->sc_addr)[3] = 0;
 			LLADDR(&sc->sc_addr)[4] = 1;
 			LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
 		} else
 			CARP_LOCK(sc);
 		/* From here on the softc lock is dropped at "out". */
 		locked = 1;
 		if (carpr.carpr_advbase > 0) {
 			if (carpr.carpr_advbase > 255 ||
 			    carpr.carpr_advbase < CARP_DFLTINTV) {
 				error = EINVAL;
 				break;
 			}
 			sc->sc_advbase = carpr.carpr_advbase;
 		}
 		if (carpr.carpr_advskew >= 255) {
 			error = EINVAL;
 			break;
 		}
 		sc->sc_advskew = carpr.carpr_advskew;
 		/* An empty key leaves the current key untouched. */
 		if (carpr.carpr_key[0] != '\0') {
 			bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
 			carp_hmac_prepare(sc);
 		}
 		/* A state change can be requested only once out of INIT. */
 		if (sc->sc_state != INIT &&
 		    carpr.carpr_state != sc->sc_state) {
 			switch (carpr.carpr_state) {
 			case BACKUP:
 				callout_stop(&sc->sc_ad_tmo);
 				carp_set_state(sc, BACKUP,
 				    "user requested via ifconfig");
 				carp_setrun(sc, 0);
 				carp_delroute(sc);
 				break;
 			case MASTER:
 				carp_master_down_locked(sc,
 				    "user requested via ifconfig");
 				break;
 			default:
 				break;
 			}
 		}
 		break;
 
 	case SIOCGVH:
 	    {
 		int privileged;
 
 		if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
 			error = EINVAL;
 			break;
 		}
 		if (carpr.carpr_count < 1) {
 			error = EMSGSIZE;
 			break;
 		}
 		if (ifp->if_carp == NULL) {
 			error = ENOENT;
 			break;
 		}
 
 		/* Unprivileged callers get everything except the key. */
 		privileged = (priv_check(td, PRIV_NETINET_CARP) == 0);
 		if (carpr.carpr_vhid != 0) {
 			IFNET_FOREACH_CARP(ifp, sc)
 				if (sc->sc_vhid == carpr.carpr_vhid)
 					break;
 			if (sc == NULL) {
 				error = ENOENT;
 				break;
 			}
 			carp_carprcp(&carpr, sc, privileged);
 			error = copyout(&carpr, ifr_data_get_ptr(ifr),
 			    sizeof(carpr));
 		} else  {
 			int i, count;
 
 			/*
 			 * The vhid list is walked under carp_sx, taken
 			 * above.  The per-interface CIF lock is never
 			 * acquired in this function, so the error paths
 			 * below must not release it.
 			 */
 			count = 0;
 			IFNET_FOREACH_CARP(ifp, sc)
 				count++;
 
 			if (count > carpr.carpr_count) {
 				error = EMSGSIZE;
 				break;
 			}
 
 			/* One carpreq per vhid, each carrying the total count. */
 			i = 0;
 			IFNET_FOREACH_CARP(ifp, sc) {
 				carp_carprcp(&carpr, sc, privileged);
 				carpr.carpr_count = count;
 				error = copyout(&carpr,
 				    (char *)ifr_data_get_ptr(ifr) +
 				    (i * sizeof(carpr)), sizeof(carpr));
 				if (error)
 					break;
 				i++;
 			}
 		}
 		break;
 	    }
 	default:
 		error = EINVAL;
 	}
 	sx_xunlock(&carp_sx);
 
 out:
 	if (locked)
 		CARP_UNLOCK(sc);
 	if_rele(ifp);
 
 	return (error);
 }
 
 /*
  * Return the vhid that ifa is attached to, or 0 when ifa is NULL or is
  * not attached to any vhid.
  */
 static int
 carp_get_vhid(struct ifaddr *ifa)
 {
 
 	if (ifa != NULL && ifa->ifa_carp != NULL)
 		return (ifa->ifa_carp->sc_vhid);
 
 	return (0);
 }
 
 /*
  * Attach address ifa to the vhid numbered vhid on ifa's interface.
  *
  * Returns 0 on success, EPROTOTYPE for an address family other than
  * AF_INET/AF_INET6, ENOPROTOOPT when the interface has no carp state,
  * ENOENT when no such vhid exists there, or the error returned by
  * carp_multicast_setup().
  *
  * On success the softc holds a reference on ifa and ifa->ifa_carp points
  * at the softc.
  */
 int
 carp_attach(struct ifaddr *ifa, int vhid)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc;
 	int index, error;
 
 	KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa));
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 #endif
 #ifdef INET6
 	case AF_INET6:
 #endif
 		break;
 	default:
 		return (EPROTOTYPE);
 	}
 
 	sx_xlock(&carp_sx);
 	if (ifp->if_carp == NULL) {
 		sx_xunlock(&carp_sx);
 		return (ENOPROTOOPT);
 	}
 
 	IFNET_FOREACH_CARP(ifp, sc)
 		if (sc->sc_vhid == vhid)
 			break;
 	if (sc == NULL) {
 		sx_xunlock(&carp_sx);
 		return (ENOENT);
 	}
 
 	error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
 	if (error) {
 		/*
 		 * NOTE(review): CIF_FREE() is defined outside this view;
 		 * presumably it frees cif only when no vhids remain --
 		 * confirm.
 		 */
 		CIF_FREE(cif);
 		sx_xunlock(&carp_sx);
 		return (error);
 	}
 
 	/* index is the 1-based slot the new address will occupy. */
 	index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
 	if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
 		carp_grow_ifas(sc);
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs++;
 		sc->sc_naddrs++;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6++;
 		sc->sc_naddrs6++;
 		break;
 #endif
 	}
 
 	/* Reference held by the softc; dropped again in carp_detach(). */
 	ifa_ref(ifa);
 
 	CARP_LOCK(sc);
 	sc->sc_ifas[index - 1] = ifa;
 	ifa->ifa_carp = sc;
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 	CARP_UNLOCK(sc);
 
 	sx_xunlock(&carp_sx);
 
 	return (0);
 }
 
 /*
  * Detach address ifa from the vhid it is attached to.
  *
  * ifa      - address to detach; ifa->ifa_carp must be non-NULL.
  * keep_cif - when true, neither the softc nor the per-interface state is
  *            released, even if this was the last address.
  *
  * The address is removed from the softc's array, its routes and (when it
  * was the last one of its family on the interface) the multicast
  * memberships are removed, and the reference taken at attach time is
  * dropped.
  */
 void
 carp_detach(struct ifaddr *ifa, bool keep_cif)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct carp_if *cif = ifp->if_carp;
 	struct carp_softc *sc = ifa->ifa_carp;
 	int i, index;
 
 	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));
 
 	sx_xlock(&carp_sx);
 
 	CARP_LOCK(sc);
 	/* Shift array. */
 	index = sc->sc_naddrs + sc->sc_naddrs6;
 	for (i = 0; i < index; i++)
 		if (sc->sc_ifas[i] == ifa)
 			break;
 	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
 	/* Close the gap left by ifa and clear the now unused last slot. */
 	for (; i < index - 1; i++)
 		sc->sc_ifas[i] = sc->sc_ifas[i+1];
 	sc->sc_ifas[index - 1] = NULL;
 
 	switch (ifa->ifa_addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		cif->cif_naddrs--;
 		sc->sc_naddrs--;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		cif->cif_naddrs6--;
 		sc->sc_naddrs6--;
 		break;
 #endif
 	}
 
 	carp_ifa_delroute(ifa);
 	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);
 
 	ifa->ifa_carp = NULL;
 	ifa_free(ifa);
 
 	carp_hmac_prepare(sc);
 	carp_sc_state(sc);
 
 	/* carp_destroy() releases the softc lock itself. */
 	if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)
 		carp_destroy(sc);
 	else
 		CARP_UNLOCK(sc);
 
 	/*
 	 * NOTE(review): CIF_FREE() is defined outside this view; presumably
 	 * it frees cif only when no vhids remain -- confirm.
 	 */
 	if (!keep_cif)
 		CIF_FREE(cif);
 
 	sx_xunlock(&carp_sx);
 }
 
 /*
  * Switch sc to the given state.  When the state actually changes, the
  * transition is logged together with reason and a devctl notification is
  * sent for the "<vhid>@<ifname>" subsystem.  A call that would not change
  * the state does nothing.  The softc lock must be held.
  */
 static void
 carp_set_state(struct carp_softc *sc, int state, const char *reason)
 {
 	const char *names[] = { CARP_STATES };
 	char subsys[IFNAMSIZ+5];
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_state == state)
 		return;
 
 	snprintf(subsys, sizeof(subsys), "%u@%s", sc->sc_vhid,
 	    sc->sc_carpdev->if_xname);
 
 	/* Log with the old state before it is overwritten. */
 	CARP_LOG("%s: %s -> %s (%s)\n", subsys,
 	    names[sc->sc_state], names[state], reason);
 
 	sc->sc_state = state;
 
 	devctl_notify("CARP", subsys, names[state], NULL);
 }
 
 /*
  * Re-evaluate every vhid on ifp via carp_sc_state(), with the
  * per-interface lock held across the walk and each softc locked in turn.
  */
 static void
 carp_linkstate(struct ifnet *ifp)
 {
 	struct carp_softc *vr;
 
 	CIF_LOCK(ifp->if_carp);
 	IFNET_FOREACH_CARP(ifp, vr) {
 		CARP_LOCK(vr);
 		carp_sc_state(vr);
 		CARP_UNLOCK(vr);
 	}
 	CIF_UNLOCK(ifp->if_carp);
 }
 
 /*
  * Bring sc in line with the current state of its interface and of
  * V_carp_allow.  The softc lock must be held.
  *
  * In both branches the softc is put back into INIT and carp_setrun() is
  * called.  When the link is down, the interface is down or carp is not
  * allowed, all callouts are stopped first and, unless already suppressed,
  * the "interface down" demotion is applied.  Otherwise a previously
  * applied demotion is reverted.
  */
 static void
 carp_sc_state(struct carp_softc *sc)
 {
 
 	CARP_LOCK_ASSERT(sc);
 
 	if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
 	    !(sc->sc_carpdev->if_flags & IFF_UP) ||
 	    !V_carp_allow) {
 		callout_stop(&sc->sc_ad_tmo);
 #ifdef INET
 		callout_stop(&sc->sc_md_tmo);
 #endif
 #ifdef INET6
 		callout_stop(&sc->sc_md6_tmo);
 #endif
 		carp_set_state(sc, INIT, "hardware interface down");
 		carp_setrun(sc, 0);
 		/* sc_suppress ensures the demotion is applied only once. */
 		if (!sc->sc_suppress)
 			carp_demote_adj(V_carp_ifdown_adj, "interface down");
 		sc->sc_suppress = 1;
 	} else {
 		carp_set_state(sc, INIT, "hardware interface up");
 		carp_setrun(sc, 0);
 		if (sc->sc_suppress)
 			carp_demote_adj(-V_carp_ifdown_adj, "interface up");
 		sc->sc_suppress = 0;
 	}
 }
 
 /*
  * Add adj (which may be negative) to the demotion counter, log the
  * change with reason, and queue carp_sendall_task on taskqueue_swi.
  */
 static void
 carp_demote_adj(int adj, char *reason)
 {
 	atomic_add_int(&V_carp_demotion, adj);
 	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
 	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
 }
 
 static int
 carp_allow_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 	struct carp_softc *sc;
 
 	new = V_carp_allow;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (V_carp_allow != new) {
 		V_carp_allow = new;
 
 		mtx_lock(&carp_mtx);
 		LIST_FOREACH(sc, &carp_list, sc_next) {
 			CARP_LOCK(sc);
 			if (curvnet == sc->sc_carpdev->if_vnet)
 				carp_sc_state(sc);
 			CARP_UNLOCK(sc);
 		}
 		mtx_unlock(&carp_mtx);
 	}
 
 	return (0);
 }
 
 static int
 carp_dscp_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 
 	new = V_carp_dscp;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (new < 0 || new > 63)
 		return (EINVAL);
 
 	V_carp_dscp = new;
 
 	return (0);
 }
 
 static int
 carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int new, error;
 
 	new = V_carp_demotion;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	carp_demote_adj(new, "sysctl");
 
 	return (0);
 }
 
 /*
  * Undo carp_mod_load(): unregister the protocol handlers, clear every
  * exported function pointer, drain the send-all task and destroy the
  * global locks.
  *
  * The function unlocks carp_mtx before destroying it, so the caller is
  * expected to hold carp_mtx on entry.
  */
 static void
 carp_mod_cleanup(void)
 {
 
 #ifdef INET
 	(void)ipproto_unregister(IPPROTO_CARP);
 	carp_iamatch_p = NULL;
 #endif
 #ifdef INET6
 	(void)ip6proto_unregister(IPPROTO_CARP);
 	carp_iamatch6_p = NULL;
 	carp_macmatch6_p = NULL;
 #endif
 	carp_ioctl_p = NULL;
 	carp_attach_p = NULL;
 	carp_detach_p = NULL;
 	carp_get_vhid_p = NULL;
 	carp_linkstate_p = NULL;
 	carp_forus_p = NULL;
 	carp_output_p = NULL;
 	carp_demote_adj_p = NULL;
 	carp_master_p = NULL;
 	/* Drop the mutex before waiting for the task to finish. */
 	mtx_unlock(&carp_mtx);
 	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
 	mtx_destroy(&carp_mtx);
 	sx_destroy(&carp_sx);
 }
 
 /*
  * Per-vnet initialisation, registered through VNET_SYSINIT below: seed
  * V_carp_allow from the "net.inet.carp.allow" tunable.
  */
 static void
 ipcarp_sysinit(void)
 {
 
 	/* Load allow as tunable so to postpone carp start after module load */
 	TUNABLE_INT_FETCH("net.inet.carp.allow", &V_carp_allow);
 }
 VNET_SYSINIT(ip_carp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, ipcarp_sysinit, NULL);
 
 /*
  * Module load: initialise the global locks and list, publish the carp
  * entry points through their function pointers and register the protocol
  * handlers.
  *
  * Returns 0 on success or the error from ip6proto_register() /
  * ipproto_register(); in that case everything done so far is undone via
  * carp_mod_cleanup().
  */
 static int
 carp_mod_load(void)
 {
 	int err;
 
 	mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
 	sx_init(&carp_sx, "carp_sx");
 	LIST_INIT(&carp_list);
 	carp_get_vhid_p = carp_get_vhid;
 	carp_forus_p = carp_forus;
 	carp_output_p = carp_output;
 	carp_linkstate_p = carp_linkstate;
 	carp_ioctl_p = carp_ioctl;
 	carp_attach_p = carp_attach;
 	carp_detach_p = carp_detach;
 	carp_demote_adj_p = carp_demote_adj;
 	carp_master_p = carp_master;
 #ifdef INET6
 	carp_iamatch6_p = carp_iamatch6;
 	carp_macmatch6_p = carp_macmatch6;
 	err = ip6proto_register(IPPROTO_CARP, carp6_input, NULL);
 	if (err) {
 		printf("carp: error %d registering with INET6\n", err);
 		/*
 		 * carp_mod_cleanup() unlocks carp_mtx before destroying
 		 * it, so take the mutex first, as the MOD_UNLOAD path does.
 		 */
 		mtx_lock(&carp_mtx);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 #ifdef INET
 	carp_iamatch_p = carp_iamatch;
 	err = ipproto_register(IPPROTO_CARP, carp_input, NULL);
 	if (err) {
 		printf("carp: error %d registering with INET\n", err);
 		/* See above: cleanup expects carp_mtx to be held. */
 		mtx_lock(&carp_mtx);
 		carp_mod_cleanup();
 		return (err);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Module event handler.  MOD_LOAD runs carp_mod_load().  MOD_UNLOAD
  * fails with EBUSY while any softc is still on carp_list, and otherwise
  * runs carp_mod_cleanup() with carp_mtx held.  Every other event type
  * yields EINVAL.
  */
 static int
 carp_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD)
 		return carp_mod_load();
 	if (type != MOD_UNLOAD)
 		return (EINVAL);
 
 	mtx_lock(&carp_mtx);
 	if (!LIST_EMPTY(&carp_list)) {
 		mtx_unlock(&carp_mtx);
 		return (EBUSY);
 	}
 	/* carp_mod_cleanup() releases and destroys carp_mtx. */
 	carp_mod_cleanup();
 
 	return (0);
 }
 
 static moduledata_t carp_mod = {
 	"carp",
 	carp_modevent,
 	0
 };
 
 DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index dd76a1792325..e570418ebc8e 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -1,766 +1,767 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <net/vnet.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_divert.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 /*
  * Divert sockets
  */
 
 /*
  * Socket buffer reservations: enough space to hold a full IP packet
  * (64 KiB) plus 100 bytes of slack.
  */
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
 /*
  * Usually a system has very few divert ports.  Previous implementation
  * used a linked list.
  *
  * DIVHASH() maps a port number to a bucket index, DCBHASH() does the same
  * for the port stored in a struct divcb.  Macro arguments are fully
  * parenthesized so that expressions such as DIVHASH(a + b) hash the whole
  * value rather than only the last operand.
  */
 #define	DIVHASHSIZE	(1 << 3)	/* 8 entries, one cache line. */
 #define	DIVHASH(port)	((port) % DIVHASHSIZE)
 #define	DCBHASH(dcb)	((dcb)->dcb_port % DIVHASHSIZE)
 
 /*
  * Divert sockets work in conjunction with ipfw or other packet filters,
  * see the divert(4) manpage for features.
  * Packets are selected by the packet filter and tagged with an
  * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
  * the packet filter) and information on the matching filter rule for
  * subsequent reinjection. The divert_port is used to put the packet
  * on the corresponding divert socket, while the rule number is passed
  * up (at least partially) as the sin_port in the struct sockaddr.
  *
  * Packets written to the divert socket carry in sin_addr a
  * destination address, and in sin_port the number of the filter rule
  * after which to continue processing.
  * If the destination address is INADDR_ANY, the packet is treated as
  * outgoing and sent to ip_output(); otherwise it is treated as
  * incoming and sent to ip_input().
  * Further, sin_zero carries some information on the interface,
  * which can be used in the reinject -- see comments in the code.
  *
  * On reinjection, processing in ip_input() and ip_output()
  * will be exactly the same as for the original packet, except that
  * packet filter processing will start at the rule number after the one
  * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
  * will apply the entire ruleset to the packet).
  */
 /* Root of the net.inet.divert sysctl subtree. */
 static SYSCTL_NODE(_net_inet, OID_AUTO, divert, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "divert(4)");
 
 /*
  * divert(4) statistics (struct divstat), defined with the VNET per-CPU
  * statistics macros and exported as net.inet.divert.stats.
  */
 VNET_PCPUSTAT_DEFINE_STATIC(struct divstat, divstat);
 VNET_PCPUSTAT_SYSINIT(divstat);
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(divstat);
 #endif
 SYSCTL_VNET_PCPUSTAT(_net_inet_divert, OID_AUTO, stats, struct divstat,
     divstat, "divert(4) socket statistics");
 /* Bump the div_<name> counter of struct divstat by one. */
 #define	DIVSTAT_INC(name)	\
     VNET_PCPUSTAT_ADD(struct divstat, divstat, div_ ## name, 1)
 
 /* Socket buffer sizes reserved for every divert socket in div_attach(). */
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
 /* Reinjection helpers used by div_send(); both always consume the mbuf. */
 static int div_output_inbound(int family, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin);
 static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
 
 /*
  * Divert protocol control block.  One is allocated per divert socket in
  * div_attach() and stored in so_pcb.
  */
 struct divcb {
 	/*
 	 * div_attach() stores DCB_UNBOUND in dcb_bound.  Once div_bind()
 	 * links the block into a hash chain, the same storage serves as the
 	 * list linkage (dcb_next).
 	 */
 	union {
 		SLIST_ENTRY(divcb)	dcb_next;
 		intptr_t		dcb_bound;
 #define	DCB_UNBOUND	((intptr_t)-1)
 	};
 	struct socket		*dcb_socket;	/* socket owning this block */
 	uint16_t		 dcb_port;	/* port, as given in sin_port */
 	uint64_t		 dcb_gencnt;	/* generation assigned at attach */
 	struct epoch_context	 dcb_epochctx;	/* for NET_EPOCH_CALL() free */
 };
 
 /* Head type of one hash chain of control blocks. */
 SLIST_HEAD(divhashhead, divcb);
 
 /* Per-vnet hash table of bound control blocks, indexed by DIVHASH(port). */
 VNET_DEFINE_STATIC(struct divhashhead, divhash[DIVHASHSIZE]) = {};
 #define	V_divhash	VNET(divhash)
 /* Per-vnet number of attached divert sockets (bound or not). */
 VNET_DEFINE_STATIC(uint64_t, dcb_count) = 0;
 #define	V_dcb_count	VNET(dcb_count)
 /* Per-vnet generation counter, bumped on every attach and detach. */
 VNET_DEFINE_STATIC(uint64_t, dcb_gencnt) = 0;
 #define	V_dcb_gencnt	VNET(dcb_gencnt)
 
 /* Global mutex taken around updates of the hash table and the counters. */
 static struct mtx divert_mtx;
 MTX_SYSINIT(divert, &divert_mtx, "divert(4) socket pcb lists", MTX_DEF);
 #define	DIVERT_LOCK()	mtx_lock(&divert_mtx)
 #define	DIVERT_UNLOCK()	mtx_unlock(&divert_mtx)
 
 /*
  * Divert a packet by passing it up to the divert socket bound to the port
  * recorded in the packet's MTAG_IPFW_RULE tag.
  *
  * @m is the packet; without an MTAG_IPFW_RULE tag it is dropped.
  * @incoming selects whether the IPv4 address of the receive interface is
  * recorded in the sockaddr_in handed to the socket.
  *
  * The mbuf is either appended to the matching socket's receive buffer or
  * freed.  The caller must be inside the network epoch (asserted below).
  */
 static void
 divert_packet(struct mbuf *m, bool incoming)
 {
 	struct divcb *dcb;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
 	struct m_tag *mtag;
 
 	NET_EPOCH_ASSERT();
 
 	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		m_freem(m);
 		return;
 	}
 	/*
 	 * Make sure a full IP header is contiguous in the first mbuf.
 	 * NOTE(review): on failure nothing is freed here; m_pullup() is
 	 * expected to have released the chain -- see mbuf(9).
 	 */
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 		return;
 #ifdef INET
 	/* Delayed checksums are currently not compatible with divert. */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
 		struct ip *ip;
 
 		ip = mtod(m, struct ip *);
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 #endif
 #ifdef INET6
 	/* Same treatment for the IPv6 flavours of the delayed checksums. */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 		in6_delayed_cksum(m, m->m_pkthdr.len -
 		    sizeof(struct ip6_hdr), sizeof(struct ip6_hdr));
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 	}
 #endif
 #endif /* INET6 */
 	/* Build the source sockaddr that accompanies the packet. */
 	bzero(&divsrc, sizeof(divsrc));
 	divsrc.sin_len = sizeof(divsrc);
 	divsrc.sin_family = AF_INET;
 	/* record matching rule, in host format */
 	divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
 	/*
 	 * Record receive interface address, if any.
 	 * But only for incoming packets.
 	 */
 	if (incoming) {
 		struct ifaddr *ifa;
 		struct ifnet *ifp;
 
 		/* Sanity check */
 		M_ASSERTPKTHDR(m);
 
 		/*
 		 * Find IP address for receive interface: the first AF_INET
 		 * entry on the interface address list is used.
 		 */
 		ifp = m->m_pkthdr.rcvif;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			divsrc.sin_addr =
 			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 			break;
 		}
 	}
 	/*
 	 * Record the incoming interface name whenever we have one.
 	 */
 	if (m->m_pkthdr.rcvif) {
 		/*
 		 * Hide the actual interface name in there in the
 		 * sin_zero array. XXX This needs to be moved to a
 		 * different sockaddr type for divert, e.g.
 		 * sockaddr_div with multiple fields like
 		 * sockaddr_dl. Presently we have only 7 bytes
 		 * but that will do for now as most interfaces
 		 * are 4 or less + 2 or less bytes for unit.
 		 * There is probably a faster way of doing this,
 		 * possibly taking it from the sockaddr_dl on the iface.
 		 * This solves the problem of a P2P link and a LAN interface
 		 * having the same address, which can result in the wrong
 		 * interface being assigned to the packet when fed back
 		 * into the divert socket. Theoretically if the daemon saves
 		 * and re-uses the sockaddr_in as suggested in the man pages,
 		 * this iface name will come along for the ride.
 		 * (see div_output for the other half of this.)
 		 */
 		strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
 		    sizeof(divsrc.sin_zero));
 	}
 
 	/*
 	 * Put packet on socket queue, if any.  The low 16 bits of the tag's
 	 * info field are passed through htons() and compared with dcb_port.
 	 */
 	nport = htons((uint16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
 	SLIST_FOREACH(dcb, &V_divhash[DIVHASH(nport)], dcb_next)
 		if (dcb->dcb_port == nport)
 			break;
 
 	if (dcb != NULL) {
 		struct socket *sa = dcb->dcb_socket;
 
 		/*
 		 * NOTE(review): no explicit SOCKBUF_UNLOCK() follows; the
 		 * *_locked helpers below are presumably expected to drop
 		 * the lock -- confirm.
 		 */
 		SOCKBUF_LOCK(&sa->so_rcv);
 		if (sbappendaddr_locked(&sa->so_rcv,
 		    (struct sockaddr *)&divsrc, m, NULL) == 0) {
 			/* Could not append: account an overflow and drop. */
 			soroverflow_locked(sa);
 			m_freem(m);
 		} else {
 			sorwakeup_locked(sa);
 			DIVSTAT_INC(diverted);
 		}
 	} else {
 		/* Nobody is bound to this divert port. */
 		DIVSTAT_INC(noport);
 		m_freem(m);
 	}
 }
 
 /*
  * Deliver packet back into the IP processing machinery.
  *
  * If no address specified, or address is 0.0.0.0, send to ip_output();
  * otherwise, send to ip_input() and mark as having been received on
  * the interface with that address.
  *
  * @so is the divert socket, @m the packet to reinject and @nam an optional
  * struct sockaddr_in (family AF_INET, exact length) describing where and
  * how to reinject.  @control is discarded; @flags and @td are not used.
  *
  * Returns 0 on success or an errno value.  @m is consumed on every path.
  */
 static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 	const struct ip *ip;
 	struct m_tag *mtag;
 	struct ipfw_rule_ref *dt;
 	int error, family;
 
 	if (control)
 		m_freem(control);
 
 	/*
 	 * Packet must have a header (but that's about it).  When m_pullup()
 	 * fails it has already released the chain and m is NULL, so there
 	 * is nothing left to free here.
 	 */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL)
 		return (EINVAL);
 
 	if (sin != NULL) {
 		if (sin->sin_family != AF_INET) {
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 		if (sin->sin_len != sizeof(*sin)) {
 			m_freem(m);
 			return (EINVAL);
 		}
 	}
 
 	/*
 	 * The mbuf may not have come from userland, but we pretend
 	 * that it has.
 	 */
 	m->m_pkthdr.rcvif = NULL;
 	m->m_nextpkt = NULL;
 	M_SETFIB(m, so->so_fibnum);
 
 	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		/* this should be normal */
 		mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
 		if (mtag == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 		m_tag_prepend(m, mtag);
 	}
 	dt = (struct ipfw_rule_ref *)(mtag+1);
 
 	/* Loopback avoidance and state recovery */
 	if (sin) {
 		int i;
 
 		/* set the starting point. We provide a non-zero slot,
 		 * but a non_matching chain_id to skip that info and use
 		 * the rulenum/rule_id.
 		 */
 		dt->slot = 1; /* dummy, chain_id is invalid */
 		dt->chain_id = 0;
 		dt->rulenum = sin->sin_port+1; /* host format ? */
 		dt->rule_id = 0;
 		/* XXX: broken for IPv6 */
 		/*
 		 * Find receive interface with the given name, stuffed
 		 * (if it exists) in the sin_zero[] field.
 		 * The name is user supplied data so don't trust its size
 		 * or that it is zero terminated.
 		 */
 		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
 			;
 		/* Only a non-empty, NUL-terminated name is looked up. */
 		if ( i > 0 && i < sizeof(sin->sin_zero))
 			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 	}
 
 	/* Pick the address family from the IP version nibble. */
 	ip = mtod(m, struct ip *);
 	switch (ip->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		family = AF_INET;
 		break;
 #endif
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		family = AF_INET6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* Reinject packet into the system as incoming or outgoing */
 	NET_EPOCH_ENTER(et);
 	if (!sin || sin->sin_addr.s_addr == 0) {
 		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
 		error = div_output_outbound(family, so, m);
 	} else {
 		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
 		error = div_output_inbound(family, so, m, sin);
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Hands mbuf @m to ip_output() or ip6_output(), depending on @family.
  *
  * Before transmission the length field of the IP header is compared with
  * the length of the packet actually held; a larger header value is
  * rejected with EINVAL.
  *
  * Returns 0 on success or an errno value on failure.  @m is always consumed.
  */
 static int
 div_output_outbound(int family, struct socket *so, struct mbuf *m)
 {
 	int rv;
 
 #ifdef INET
 	if (family == AF_INET) {
 		struct ip *const iph = mtod(m, struct ip *);
 
 		/* A header claiming more data than we hold would crash. */
 		if ((u_short)ntohs(iph->ip_len) > m->m_pkthdr.len) {
 			m_freem(m);
 			return (EINVAL);
 		}
 	}
 #endif
 #ifdef INET6
 	if (family == AF_INET6) {
 		struct ip6_hdr *const ip6h = mtod(m, struct ip6_hdr *);
 
 		/* Likewise for the IPv6 payload length. */
 		if ((u_short)ntohs(ip6h->ip6_plen) > m->m_pkthdr.len) {
 			m_freem(m);
 			return (EINVAL);
 		}
 	}
 #endif
 
 #ifdef MAC
 	mac_socket_create_mbuf(so, m);
 #endif
 
 	rv = 0;
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		rv = ip_output(m, NULL, NULL,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
 		    | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		rv = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 		break;
 #endif
 	}
 
 	/* Only successful transmissions are counted. */
 	if (rv == 0)
 		DIVSTAT_INC(outbound);
 
 	return (rv);
 }
 
 /*
  * Queues mbuf @m for local input processing on the IPv4 or IPv6 netisr.
  *
  * If no receive interface has been set on the packet yet, it is derived
  * from the address in @sin; EADDRNOTAVAIL is returned when no interface
  * owns that address.  An unsupported @family yields EINVAL.
  *
  * Returns 0 on success or an errno value on failure.  @m is always consumed.
  */
 static int
 div_output_inbound(int family, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin)
 {
 
 	if (m->m_pkthdr.rcvif == NULL) {
 		struct ifaddr *match;
 
 		/*
 		 * The interface name gave nothing, so fall back to the IP
 		 * address.  Port and name bytes are wiped first so that
 		 * ifa_ifwithaddr() compares the address only.
 		 */
 
 		/* XXX: broken for IPv6 */
 		bzero(sin->sin_zero, sizeof(sin->sin_zero));
 		sin->sin_port = 0;
 		match = ifa_ifwithaddr((struct sockaddr *)sin);
 		if (match == NULL) {
 			m_freem(m);
 			return (EADDRNOTAVAIL);
 		}
 		m->m_pkthdr.rcvif = match->ifa_ifp;
 	}
 #ifdef MAC
 	mac_socket_create_mbuf(so, m);
 #endif
 
 	/* Send packet to input processing via netisr */
 #ifdef INET
 	if (family == AF_INET) {
 		const struct ip *iph = mtod(m, struct ip *);
 
 		/*
 		 * Restore the M_MCAST/M_BCAST flag matching the destination
 		 * address.  It is expected by ip_tryforward().
 		 */
 		if (IN_MULTICAST(ntohl(iph->ip_dst.s_addr)))
 			m->m_flags |= M_MCAST;
 		else if (in_broadcast(iph->ip_dst, m->m_pkthdr.rcvif))
 			m->m_flags |= M_BCAST;
 		netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
 		DIVSTAT_INC(inbound);
 		return (0);
 	}
 #endif
 #ifdef INET6
 	if (family == AF_INET6) {
 		netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m);
 		DIVSTAT_INC(inbound);
 		return (0);
 	}
 #endif
 
 	/* Address family not compiled in or not recognised. */
 	m_freem(m);
 	return (EINVAL);
 }
 
 /*
  * Attach a new divert socket: check PRIV_NETINET_DIVERT when a thread is
  * given, reserve socket buffer space and allocate an unbound control block.
  *
  * Returns 0 on success or the error from priv_check()/soreserve().
  */
 static int
 div_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct divcb *pcb;
 	int rc;
 
 	if (td != NULL && (rc = priv_check(td, PRIV_NETINET_DIVERT)) != 0)
 		return (rc);
 	if ((rc = soreserve(so, div_sendspace, div_recvspace)) != 0)
 		return (rc);
 
 	pcb = malloc(sizeof(*pcb), M_PCB, M_WAITOK);
 	pcb->dcb_bound = DCB_UNBOUND;
 	pcb->dcb_socket = so;
 
 	/* Account for the new block and stamp it with a fresh generation. */
 	DIVERT_LOCK();
 	V_dcb_count++;
 	pcb->dcb_gencnt = ++V_dcb_gencnt;
 	DIVERT_UNLOCK();
 
 	so->so_pcb = pcb;
 	return (0);
 }
 
 /*
  * Epoch callback: release the control block that embeds @ctx.
  */
 static void
 div_free(epoch_context_t ctx)
 {
 
 	free(__containerof(ctx, struct divcb, dcb_epochctx), M_PCB);
 }
 
 /*
  * Detach a divert socket: unhook its control block from the hash table
  * (if it was bound), update the counters and schedule the block to be
  * freed through NET_EPOCH_CALL().
  */
 static void
 div_detach(struct socket *so)
 {
 	struct divcb *pcb;
 
 	pcb = so->so_pcb;
 	so->so_pcb = NULL;
 
 	DIVERT_LOCK();
 	if (pcb->dcb_bound != DCB_UNBOUND) {
 		SLIST_REMOVE(&V_divhash[DCBHASH(pcb)], pcb, divcb, dcb_next);
 	}
 	V_dcb_count--;
 	V_dcb_gencnt++;
 	DIVERT_UNLOCK();
 
 	/* Defer the free so the block is not released immediately. */
 	NET_EPOCH_CALL(div_free, &pcb->dcb_epochctx);
 }
 
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct divcb *dcb;
 	uint16_t port;
 
 	if (nam->sa_family != AF_INET)
 		return EAFNOSUPPORT;
 	if (nam->sa_len != sizeof(struct sockaddr_in))
 		return EINVAL;
 	port = ((struct sockaddr_in *)nam)->sin_port;
 	DIVERT_LOCK();
 	SLIST_FOREACH(dcb, &V_divhash[DIVHASH(port)], dcb_next)
 		if (dcb->dcb_port == port) {
 			DIVERT_UNLOCK();
 			return (EADDRINUSE);
 		}
 	dcb = so->so_pcb;
 	if (dcb->dcb_bound != DCB_UNBOUND)
 		SLIST_REMOVE(&V_divhash[DCBHASH(dcb)], dcb, divcb, dcb_next);
 	dcb->dcb_port = port;
 	SLIST_INSERT_HEAD(&V_divhash[DIVHASH(port)], dcb, dcb_next);
 	DIVERT_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Shutdown handler: mark the socket as unable to send any more data.
  * Always returns 0.
  */
 static int
 div_shutdown(struct socket *so)
 {
 
 	socantsendmore(so);
 
 	return (0);
 }
 
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct xinpgen xig;
 	struct divcb *dcb;
 	int error;
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	if (req->oldptr == 0) {
 		u_int n;
 
 		n = V_dcb_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_dcb_count;
 	xig.xig_gen = V_dcb_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	DIVERT_LOCK();
 	for (int i = 0; i < DIVHASHSIZE; i++)
 		SLIST_FOREACH(dcb, &V_divhash[i], dcb_next) {
 			if (dcb->dcb_gencnt <= xig.xig_gen) {
 				struct xinpcb xi;
 
 				bzero(&xi, sizeof(xi));
 				xi.xi_len = sizeof(struct xinpcb);
 				sotoxsocket(dcb->dcb_socket, &xi.xi_socket);
 				xi.inp_gencnt = dcb->dcb_gencnt;
 				xi.inp_vflag = INP_IPV4; /* XXX: netstat(1) */
 				xi.inp_inc.inc_ie.ie_lport = dcb->dcb_port;
 				error = SYSCTL_OUT(req, &xi, sizeof xi);
 				if (error)
 					goto errout;
 			}
 		}
 
 	/*
 	 * Give the user an updated idea of our state.
 	 * If the generation differs from what we told
 	 * her before, she knows that something happened
 	 * while we were processing this request, and it
 	 * might be necessary to retry.
 	 */
 	xig.xig_gen = V_dcb_gencnt;
 	xig.xig_sogen = so_gencnt;
 	xig.xig_count = V_dcb_count;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 
 errout:
 	DIVERT_UNLOCK();
 
 	return (error);
 }
 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, div_pcblist,
     "S,xinpcb", "List of active divert sockets");
 
 /*
  * Protocol switch entry for divert sockets: a SOCK_RAW protocol with the
  * PR_ATOMIC and PR_ADDR flags, wired to the div_*() handlers above.
  */
 static struct protosw div_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_attach =		div_attach,
 	.pr_bind =		div_bind,
 	.pr_detach =		div_detach,
 	.pr_send =		div_send,
 	.pr_shutdown =		div_shutdown,
 };
 
 /* The PF_DIVERT domain, containing div_protosw as its only protocol. */
 static struct domain divertdomain = {
 	.dom_family =	PF_DIVERT,
 	.dom_name =	"divert",
 	.dom_nprotosw =	1,
 	.dom_protosw =	{ &div_protosw },
 };
 
 /*
  * Module event handler for ipdivert.
  *
  * MOD_LOAD registers the divert domain and installs divert_packet() as
  * ip_divert_ptr.  MOD_QUIESCE always answers EPERM.  MOD_UNLOAD answers
  * EBUSY while divert sockets exist, otherwise it undoes MOD_LOAD.
  * Every other event yields EOPNOTSUPP.
  */
 static int
 div_modevent(module_t mod, int type, void *unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		domain_add(&divertdomain);
 		ip_divert_ptr = divert_packet;
 		return (0);
 
 	case MOD_QUIESCE:
 		/*
 		 * Because of potential race conditions IPDIVERT should
 		 * normally stay loaded.  Report to kldunload that an
 		 * unload is only possible when forced.
 		 */
 		return (EPERM);
 
 	case MOD_UNLOAD:
 		/*
 		 * Forced unload.
 		 *
 		 * Unloading is only possible while no sockets are
 		 * connected.  Forcefully disconnecting open sockets might
 		 * be added later.
 		 *
 		 * XXXRW: There is a slight race: a request to open a new
 		 * socket could be spinning on the lock at the time the
 		 * lock is destroyed.
 		 *
 		 * XXXGL: The code is also incorrect in that only the
 		 * current vnet is checked.
 		 */
 		DIVERT_LOCK();
 		if (V_dcb_count != 0) {
 			DIVERT_UNLOCK();
 			return (EBUSY);
 		}
 		DIVERT_UNLOCK();
 		ip_divert_ptr = NULL;
 		domain_remove(&divertdomain);
 		return (0);
 
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Module description for "ipdivert": module name, event handler and an
  * unused (zero) third slot.
  */
 static moduledata_t ipdivertmod = {
         "ipdivert",
         div_modevent,
         0
 };
 
 /* Register the module at SI_SUB_PROTO_FIREWALL and publish version 1. */
 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
 MODULE_VERSION(ipdivert, 1);
diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c
index ee97cb4af65a..121db3ddee3a 100644
--- a/sys/netinet/ip_fastfwd.c
+++ b/sys/netinet/ip_fastfwd.c
@@ -1,555 +1,556 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2003 Andre Oppermann, Internet Business Solutions AG
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * ip_fastforward gets its speed from processing the forwarded packet to
  * completion (if_output on the other side) without any queues or netisr's.
  * The receiving interface DMAs the packet into memory, the upper half of
  * driver calls ip_fastforward, we do our routing table lookup and directly
  * send it off to the outgoing interface, which DMAs the packet to the
  * network card. The only part of the packet we touch with the CPU is the
  * IP header (unless there are complex firewall rules touching other parts
  * of the packet, but that is up to you). We are essentially limited by bus
  * bandwidth and how fast the network card/driver can set up receives and
  * transmits.
  *
  * We handle basic errors, IP header errors, checksum errors,
  * destination unreachable, fragmentation and fragmentation needed and
  * report them via ICMP to the sender.
  *
  * Else if something is not pure IPv4 unicast forwarding we fall back to
  * the normal ip_input processing path. We should only be called from
  * interfaces connected to the outside world.
  *
  * Firewalling is fully supported including divert, ipfw fwd and ipfilter
  * ipnat and address rewrite.
  *
  * IPSEC is not supported if this host is a tunnel broker. IPSEC is
  * supported for connections to/from local host.
  *
  * We try to do the least expensive (in CPU ops) checks and operations
  * first to catch junk with as little overhead as possible.
  *
  * We take full advantage of hardware support for IP checksum and
  * fragmentation offloading.
  */
 
 /*
  * Many thanks to Matt Thomas of NetBSD for basic structure of ip_flow.c which
  * is being followed here.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipstealth.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_options.h>
 
 #include <machine/in_cksum.h>
 
 #define	V_ipsendredirects	VNET(ipsendredirects)
 
 /*
  * Decide whether an ICMP redirect may be sent for packet @m forwarded via
  * nexthop @nh and, if so, prepare a copy of the packet for it.
  *
  * @ip_len bounds the number of bytes copied from @m, @osrc is the original
  * source address of the packet and @newgw receives the gateway address of
  * @nh.  Note that *@newgw may already have been written when NULL is
  * returned by one of the later checks.
  *
  * Returns a newly allocated mbuf holding the packet header and up to
  * @ip_len bytes of @m, or NULL if no redirect should be sent or an
  * allocation failed.  @m itself is left untouched.
  */
 static struct mbuf *
 ip_redir_alloc(struct mbuf *m, struct nhop_object *nh, u_short ip_len,
     struct in_addr *osrc, struct in_addr *newgw)
 {
 	struct in_ifaddr *nh_ia;
 	struct mbuf *mcopy;
 
 	KASSERT(nh != NULL, ("%s: m %p nh is NULL\n", __func__, m));
 
 	/*
 	 * Only send a redirect if:
 	 * - Redirects are not disabled (must be checked by caller),
 	 * - We have not applied NAT (must be checked by caller as possible),
 	 * - Neither a MCAST or BCAST packet (must be checked by caller)
 	 *   [RFC1009 Appendix A.2].
 	 * - The packet does not do IP source routing or having any other
 	 *   IP options (this case was handled already by ip_input() calling
 	 *   ip_dooptions() [RFC792, p13],
 	 * - The packet is being forwarded out the same physical interface
 	 *   that it was received from [RFC1812, 5.2.7.2].
 	 */
 
 	/*
 	 * - The forwarding route was not created by a redirect
 	 *   [RFC1812, 5.2.7.2], or
 	 *   if it was to follow a default route (see below).
 	 * - The next-hop is reachable by us [RFC1009 Appendix A.2].
 	 */
 	if ((nh->nh_flags & (NHF_DEFAULT | NHF_REDIRECT |
 	    NHF_BLACKHOLE | NHF_REJECT)) != 0)
 		return (NULL);
 
 	/* Get the new gateway; only IPv4 gateway nexthops qualify. */
 	if ((nh->nh_flags & NHF_GATEWAY) == 0 || nh->gw_sa.sa_family != AF_INET)
 		return (NULL);
 	newgw->s_addr = nh->gw4_sa.sin_addr.s_addr;
 
 	/*
 	 * - The resulting forwarding destination is not "This host on this
 	 *   network" [RFC1122, Section 3.2.1.3] (default route check above).
 	 */
 	if (newgw->s_addr == 0)
 		return (NULL);
 
 	/*
 	 * - We know how to reach the sender and the source address is
 	 *   directly connected to us [RFC792, p13].
 	 * + The new gateway address and the source address are on the same
 	 *   subnet [RFC1009 Appendix A.2, RFC1122 3.2.2.2, RFC1812, 5.2.7.2].
 	 * NB: if you think multiple logical subnets on the same wire should
 	 *     receive redirects read [RFC1812, APPENDIX C (14->15)].
 	 */
 	nh_ia = (struct in_ifaddr *)nh->nh_ifa;
 	if ((ntohl(osrc->s_addr) & nh_ia->ia_subnetmask) != nh_ia->ia_subnet)
 		return (NULL);
 
 	/* Prepare for sending the redirect. */
 
 	/*
 	 * Make a copy of as much as we need of the packet as the original
 	 * one will be forwarded but we need (a portion) for icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy == NULL)
 		return (NULL);
 
 	if (m_dup_pkthdr(mcopy, m, M_NOWAIT) == 0) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		return (NULL);
 	}
 	/* Copy no more than fits into the single mbuf just allocated. */
 	mcopy->m_len = min(ip_len, M_TRAILINGSPACE(mcopy));
 	mcopy->m_pkthdr.len = mcopy->m_len;
 	m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 
 	return (mcopy);
 }
 
 
 /*
  * Look up the nexthop towards @dest in the FIB selected by @m.
  *
  * On success the nexthop is stored in *@pnh and 0 is returned.  Otherwise
  * @m is disposed of, either by handing it to icmp_error() or by freeing it,
  * and EHOSTUNREACH is returned; *@pnh is left alone in that case.
  */
 static int
 ip_findroute(struct nhop_object **pnh, struct in_addr dest, struct mbuf *m)
 {
 	struct nhop_object *nhop;
 
 	nhop = fib4_lookup(M_GETFIB(m), dest, 0, NHR_NONE,
 	    m->m_pkthdr.flowid);
 
 	/* No route at all: tell the sender. */
 	if (nhop == NULL) {
 		IPSTAT_INC(ips_noroute);
 		IPSTAT_INC(ips_cantforward);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return (EHOSTUNREACH);
 	}
 
 	/* Blackholed traffic and directed broadcasts are dropped silently. */
 	if ((nhop->nh_flags & (NHF_BLACKHOLE | NHF_BROADCAST)) != 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return (EHOSTUNREACH);
 	}
 
 	/* Reject routes are answered with a host unreachable error. */
 	if ((nhop->nh_flags & NHF_REJECT) != 0) {
 		IPSTAT_INC(ips_cantforward);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return (EHOSTUNREACH);
 	}
 
 	*pnh = nhop;
 	return (0);
 }
 
 /*
  * Try to forward a packet based on the destination address.
  * This is a fast path optimized for the plain forwarding case.
  * If the packet is handled (and consumed) here then we return NULL;
  * otherwise mbuf is returned and the packet should be delivered
  * to ip_input for full processing.
  */
 struct mbuf *
 ip_tryforward(struct mbuf *m)
 {
 	struct ip *ip;
 	struct mbuf *m0 = NULL;
 	struct nhop_object *nh = NULL;
 	struct route ro;
 	struct sockaddr_in *dst;
 	const struct sockaddr *gw;
 	struct in_addr dest, odest, rtdest, osrc;
 	uint16_t ip_len, ip_off;
 	int error = 0;
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *mcopy = NULL;
 	struct in_addr redest;
 	/*
 	 * Are we active and forwarding packets?
 	 */
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	/*
 	 * Only IP packets without options
 	 */
 	ip = mtod(m, struct ip *);
 
 	if (ip->ip_hl != (sizeof(struct ip) >> 2)) {
 		if (V_ip_doopts == 1)
 			return m;
 		else if (V_ip_doopts == 2) {
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
 				0, 0);
 			return NULL;	/* mbuf already free'd */
 		}
 		/* else ignore IP options and continue */
 	}
 
 	/*
 	 * Only unicast IP, not from loopback, no L2 or IP broadcast,
 	 * no multicast, no INADDR_ANY
 	 *
 	 * XXX: Probably some of these checks could be direct drop
 	 * conditions.  However it is not clear whether there are some
 	 * hacks or obscure behaviours which make it necessary to
 	 * let ip_input handle it.  We play safe here and let ip_input
 	 * deal with it until it is proven that we can directly drop it.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) ||
 	    (m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
 	    ntohl(ip->ip_src.s_addr) == (u_long)INADDR_BROADCAST ||
 	    ntohl(ip->ip_dst.s_addr) == (u_long)INADDR_BROADCAST ||
 	    IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
 	    ip->ip_src.s_addr == INADDR_ANY ||
 	    ip->ip_dst.s_addr == INADDR_ANY )
 		return m;
 
 	/*
 	 * Is it for a local address on this host?
 	 */
 	if (in_localip(ip->ip_dst))
 		return m;
 
 	IPSTAT_INC(ips_total);
 
 	/*
 	 * Step 3: incoming packet firewall processing
 	 */
 
 	odest.s_addr = dest.s_addr = ip->ip_dst.s_addr;
 	osrc.s_addr = ip->ip_src.s_addr;
 
 	/*
 	 * Run through list of ipfilter hooks for input packets
 	 */
 	if (!PFIL_HOOKED_IN(V_inet_pfil_head))
 		goto passin;
 
 	if (pfil_mbuf_in(V_inet_pfil_head, &m, m->m_pkthdr.rcvif,
 	    NULL) != PFIL_PASS)
 		goto drop;
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	ip = mtod(m, struct ip *);	/* m may have changed by pfil hook */
 	dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Destination address changed?
 	 */
 	if (odest.s_addr != dest.s_addr) {
 		/*
 		 * Is it now for a local address on this host?
 		 */
 		if (in_localip(dest))
 			goto forwardlocal;
 		/*
 		 * Go on with new destination address
 		 */
 	}
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * ipfw changed it for a local address on this host.
 		 */
 		goto forwardlocal;
 	}
 
 passin:
 	/*
 	 * Step 4: decrement TTL and look up route
 	 */
 
 	/*
 	 * Check TTL
 	 */
 #ifdef IPSTEALTH
 	if (!V_ipstealth) {
 #endif
 	if (ip->ip_ttl <= IPTTLDEC) {
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
 		return NULL;	/* mbuf already free'd */
 	}
 
 	/*
 	 * Decrement the TTL and incrementally change the IP header checksum.
 	 * Don't bother doing this with hw checksum offloading, it's faster
 	 * doing it right here.
 	 */
 	ip->ip_ttl -= IPTTLDEC;
 	if (ip->ip_sum >= (u_int16_t) ~htons(IPTTLDEC << 8))
 		ip->ip_sum -= ~htons(IPTTLDEC << 8);
 	else
 		ip->ip_sum += htons(IPTTLDEC << 8);
 #ifdef IPSTEALTH
 	}
 #endif
 
 	/*
 	 * Next hop forced by pfil(9) hook?
 	 */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		/*
 		 * Now we will find route to forced destination.
 		 */
 		dest.s_addr = ((struct sockaddr_in *)
 			    (fwd_tag + 1))->sin_addr.s_addr;
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP_NEXTHOP;
 	}
 
 	/*
 	 * Find route to destination.
 	 */
 	if (ip_findroute(&nh, dest, m) != 0)
 		return (NULL);	/* icmp unreach already sent */
 
 	/*
 	 * Avoid second route lookup by caching destination.
 	 */
 	rtdest.s_addr = dest.s_addr;
 
 	/*
 	 * Step 5: outgoing firewall packet processing
 	 */
 	if (!PFIL_HOOKED_OUT(V_inet_pfil_head))
 		goto passout;
 
 	if (pfil_mbuf_out(V_inet_pfil_head, &m, nh->nh_ifp,
 	    NULL) != PFIL_PASS)
 		goto drop;
 
 	M_ASSERTVALID(m);
 	M_ASSERTPKTHDR(m);
 
 	ip = mtod(m, struct ip *);
 	dest.s_addr = ip->ip_dst.s_addr;
 
 	/*
 	 * Destination address changed?
 	 */
 	if (m->m_flags & M_IP_NEXTHOP)
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 	else
 		fwd_tag = NULL;
 	if (odest.s_addr != dest.s_addr || fwd_tag != NULL) {
 		/*
 		 * Is it now for a local address on this host?
 		 */
 		if (m->m_flags & M_FASTFWD_OURS || in_localip(dest)) {
 forwardlocal:
 			/*
 			 * Return packet for processing by ip_input().
 			 */
 			m->m_flags |= M_FASTFWD_OURS;
 			return (m);
 		}
 		/*
 		 * Redo route lookup with new destination address
 		 */
 		if (fwd_tag) {
 			dest.s_addr = ((struct sockaddr_in *)
 				    (fwd_tag + 1))->sin_addr.s_addr;
 			m_tag_delete(m, fwd_tag);
 			m->m_flags &= ~M_IP_NEXTHOP;
 		}
 		if (dest.s_addr != rtdest.s_addr &&
 		    ip_findroute(&nh, dest, m) != 0)
 			return (NULL);	/* icmp unreach already sent */
 	}
 
 passout:
 	/*
 	 * Step 6: send off the packet
 	 */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	bzero(&ro, sizeof(ro));
 	dst = (struct sockaddr_in *)&ro.ro_dst;
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr = dest;
 	if (nh->nh_flags & NHF_GATEWAY) {
 		gw = &nh->gw_sa;
 		ro.ro_flags |= RT_HAS_GW;
 	} else
 		gw = (const struct sockaddr *)dst;
 
 	/* Handle redirect case. */
 	redest.s_addr = 0;
 	if (V_ipsendredirects && osrc.s_addr == ip->ip_src.s_addr &&
 	    nh->nh_ifp == m->m_pkthdr.rcvif)
 		mcopy = ip_redir_alloc(m, nh, ip_len, &osrc, &redest);
 
 	/*
 	 * Check if packet fits MTU or if hardware will fragment for us
 	 */
 	if (ip_len <= nh->nh_mtu) {
 		/*
 		 * Avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		/*
 		 * Send off the packet via outgoing interface
 		 */
 		IP_PROBE(send, NULL, NULL, ip, nh->nh_ifp, ip, NULL);
 		error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m, gw, &ro);
 	} else {
 		/*
 		 * Handle EMSGSIZE with icmp reply needfrag for TCP MTU discovery
 		 */
 		if (ip_off & IP_DF) {
 			IPSTAT_INC(ips_cantfrag);
 			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
 				0, nh->nh_mtu);
 			goto consumed;
 		} else {
 			/*
 			 * We have to fragment the packet
 			 */
 			m->m_pkthdr.csum_flags |= CSUM_IP;
 			if (ip_fragment(ip, &m, nh->nh_mtu,
 			    nh->nh_ifp->if_hwassist) != 0)
 				goto drop;
 			KASSERT(m != NULL, ("null mbuf and no error"));
 			/*
 			 * Send off the fragments via outgoing interface
 			 */
 			error = 0;
 			do {
 				m0 = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				/*
 				 * Avoid confusing lower layers.
 				 */
 				m_clrprotoflags(m);
 
 				IP_PROBE(send, NULL, NULL,
 				    mtod(m, struct ip *), nh->nh_ifp,
 				    mtod(m, struct ip *), NULL);
 				error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m,
 				    gw, &ro);
 				if (error)
 					break;
 			} while ((m = m0) != NULL);
 			if (error) {
 				/* Reclaim remaining fragments */
 				for (m = m0; m; m = m0) {
 					m0 = m->m_nextpkt;
 					m_freem(m);
 				}
 			} else
 				IPSTAT_INC(ips_fragmented);
 		}
 	}
 
 	if (error != 0)
 		IPSTAT_INC(ips_odropped);
 	else {
 		IPSTAT_INC(ips_forward);
 		IPSTAT_INC(ips_fastforward);
 	}
 
 	/* Send required redirect */
 	if (mcopy != NULL) {
 		icmp_error(mcopy, ICMP_REDIRECT, ICMP_REDIRECT_HOST, redest.s_addr, 0);
 		mcopy = NULL; /* Was consumed by callee. */
 	}
 
 consumed:
 	if (mcopy != NULL)
 		m_freem(mcopy);
 	return NULL;
 drop:
 	if (m)
 		m_freem(m);
 	return NULL;
 }
diff --git a/sys/netinet/ip_gre.c b/sys/netinet/ip_gre.c
index 12c13cbce2e7..f4be4252a822 100644
--- a/sys/netinet/ip_gre.c
+++ b/sys/netinet/ip_gre.c
@@ -1,583 +1,584 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-NetBSD
  *
  * Copyright (c) 1998 The NetBSD Foundation, Inc.
  * Copyright (c) 2014, 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Heiko W.Rupp <hwr@pilhuhn.de>
  *
  * IPv6-over-GRE contributed by Gert Doering <gert@greenie.muc.de>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $NetBSD: ip_gre.c,v 1.29 2003/09/05 23:02:43 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 
 #include <net/if_gre.h>
 #include <machine/in_cksum.h>
 
 /*
  * Default TTL written into the outer IPv4 header by in_gre_output().
  * Kept per VNET and exposed read-write as the "grettl" sysctl under
  * _net_inet_ip.
  */
 #define	GRE_TTL			30
 VNET_DEFINE(int, ip_gre_ttl) = GRE_TTL;
 #define	V_ip_gre_ttl		VNET(ip_gre_ttl)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, grettl, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_gre_ttl), 0, "Default TTL value for encapsulated packets");
 
 /*
  * IPv4 wrapper around the generic gre_socket: "addr" is the local address
  * the socket is looked up by (see in_gre_lookup_socket()) and bound to
  * (see in_gre_setup_socket()).
  */
 struct in_gre_socket {
 	struct gre_socket		base;
 	in_addr_t			addr;
 };
 /*
  * Per-VNET lookup tables.  All three start out NULL and are allocated
  * together by the first SIOCSIFPHYADDR handled in in_gre_ioctl().
  */
 VNET_DEFINE_STATIC(struct gre_sockets *, ipv4_sockets) = NULL;
 VNET_DEFINE_STATIC(struct gre_list *, ipv4_hashtbl) = NULL;
 VNET_DEFINE_STATIC(struct gre_list *, ipv4_srchashtbl) = NULL;
 #define	V_ipv4_sockets		VNET(ipv4_sockets)
 #define	V_ipv4_hashtbl		VNET(ipv4_hashtbl)
 #define	V_ipv4_srchashtbl	VNET(ipv4_srchashtbl)
 /*
  * Bucket selectors.  Each one masks a hash with GRE_HASH_SIZE - 1:
  *   GRE_HASH      - tunnels, keyed by the (src, dst) pair;
  *   GRE_SRCHASH   - tunnels, keyed by source address only;
  *   GRE_SOCKHASH  - UDP sockets, keyed by local address.
  * GRE_SRCHASH and GRE_SOCKHASH take the address of their argument, so
  * it must be an lvalue.
  */
 #define	GRE_HASH(src, dst)	(V_ipv4_hashtbl[\
     in_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)])
 #define	GRE_SRCHASH(src)	(V_ipv4_srchashtbl[\
     fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)])
 #define	GRE_SOCKHASH(src)	(V_ipv4_sockets[\
     fnv_32_buf(&(src), sizeof(src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)])
 /* GRE_HASH bucket for the outer addresses already stored in a softc. */
 #define	GRE_HASH_SC(sc)		GRE_HASH((sc)->gre_oip.ip_src.s_addr,\
     (sc)->gre_oip.ip_dst.s_addr)
 
 /*
  * Hash an outer IPv4 address pair: fnv_32_buf() over src starting from
  * FNV1_32_INIT, then over dst starting from that intermediate value.
  */
 static uint32_t
 in_gre_hashval(in_addr_t src, in_addr_t dst)
 {
 	const uint32_t seed = fnv_32_buf(&src, sizeof(src), FNV1_32_INIT);
 
 	return (fnv_32_buf(&dst, sizeof(dst), seed));
 }
 
 /*
  * Find the gre_socket registered for local address addr.
  * Returns the matching entry from the GRE_SOCKHASH bucket, or NULL when
  * the bucket holds no socket with that address.
  */
 static struct gre_socket*
 in_gre_lookup_socket(in_addr_t addr)
 {
 	struct gre_socket *cur;
 	struct in_gre_socket *entry;
 
 	CK_LIST_FOREACH(cur, &GRE_SOCKHASH(addr), chain) {
 		entry = __containerof(cur, struct in_gre_socket, base);
 		if (entry->addr == addr)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Check whether configuring sc with (src, dst, opts) is redundant or
  * would clash with another tunnel.
  *
  * Returns EEXIST when sc already has exactly this IPv4 address pair and
  * the same GRE_UDPENCAP setting, EADDRNOTAVAIL when a different softc in
  * the relevant list uses the same address pair, and 0 otherwise.
  */
 static int
 in_gre_checkdup(const struct gre_softc *sc, in_addr_t src, in_addr_t dst,
     uint32_t opts)
 {
 	struct gre_list *bucket;
 	struct gre_softc *other;
 	struct gre_socket *sock;
 	const uint32_t udpencap = opts & GRE_UDPENCAP;
 
 	if (sc->gre_family == AF_INET &&
 	    sc->gre_oip.ip_src.s_addr == src &&
 	    sc->gre_oip.ip_dst.s_addr == dst &&
 	    (sc->gre_options & GRE_UDPENCAP) == udpencap)
 		return (EEXIST);
 
 	if (udpencap == 0) {
 		/* Plain GRE tunnels live in the (src, dst) hash. */
 		bucket = &GRE_HASH(src, dst);
 	} else {
 		/* UDP tunnels hang off the socket for their source address. */
 		sock = in_gre_lookup_socket(src);
 		if (sock == NULL)
 			return (0);
 		bucket = &sock->list;
 	}
 
 	CK_LIST_FOREACH(other, bucket, chain) {
 		if (other != sc &&
 		    other->gre_oip.ip_src.s_addr == src &&
 		    other->gre_oip.ip_dst.s_addr == dst)
 			return (EADDRNOTAVAIL);
 	}
 	return (0);
 }
 
 /*
  * Encap lookup callback: find the tunnel an inbound packet belongs to.
  *
  * The packet's destination is matched against the softc's source address
  * and the packet's source against the softc's destination.  On a match
  * with an interface that is IFF_UP, *arg is set to the softc and
  * ENCAP_DRV_LOOKUP is returned; in every other case the result is 0.
  */
 static int
 in_gre_lookup(const struct mbuf *m, int off, int proto, void **arg)
 {
 	const struct ip *iph;
 	struct gre_softc *cand;
 
 	if (V_ipv4_hashtbl == NULL)
 		return (0);
 
 	NET_EPOCH_ASSERT();
 	iph = mtod(m, const struct ip *);
 	CK_LIST_FOREACH(cand, &GRE_HASH(iph->ip_dst.s_addr,
 	    iph->ip_src.s_addr), chain) {
 		/* Inbound packet: its ip_dst is our side, ip_src the peer. */
 		if (cand->gre_oip.ip_src.s_addr != iph->ip_dst.s_addr ||
 		    cand->gre_oip.ip_dst.s_addr != iph->ip_src.s_addr)
 			continue;
 		/* The first address match decides the outcome. */
 		if ((GRE2IFP(cand)->if_flags & IFF_UP) == 0)
 			return (0);
 		*arg = cand;
 		return (ENCAP_DRV_LOOKUP);
 	}
 	return (0);
 }
 
 /*
  * Make IFF_DRV_RUNNING reflect whether the tunnel's outer source address
  * is currently a local address according to in_localip().
  */
 static void
 in_gre_set_running(struct gre_softc *sc)
 {
 
 	if (!in_localip(sc->gre_oip.ip_src)) {
 		GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return;
 	}
 	GRE2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
 }
 
 /*
  * Source-address event callback (registered in in_gre_init()).
  * Re-evaluates IFF_DRV_RUNNING for every tunnel whose outer source
  * address equals the address carried in sa, so that a tunnel stops
  * running once its ingress address is gone and cannot be used for
  * source address spoofing.
  */
 static void
 in_gre_srcaddr(void *arg __unused, const struct sockaddr *sa,
     int event __unused)
 {
 	struct gre_softc *tun;
 	in_addr_t changed;
 
 	/* Nothing to do until the per-VNET tables exist. */
 	if (V_ipv4_hashtbl == NULL)
 		return;
 
 	NET_EPOCH_ASSERT();
 	changed = ((const struct sockaddr_in *)sa)->sin_addr.s_addr;
 	CK_LIST_FOREACH(tun, &GRE_SRCHASH(changed), srchash) {
 		if (tun->gre_oip.ip_src.s_addr == changed)
 			in_gre_set_running(tun);
 	}
 }
 
 static bool
 in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct gre_socket *gs;
 	struct gre_softc *sc;
 	in_addr_t dst;
 
 	NET_EPOCH_ASSERT();
 
 	gs = (struct gre_socket *)ctx;
 	dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr;
 	CK_LIST_FOREACH(sc, &gs->list, chain) {
 		if (sc->gre_oip.ip_dst.s_addr == dst)
 			break;
 	}
 	if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){
 		gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc);
 		return (true);
 	}
 	m_freem(m);
 
 	return (true);
 }
 
 /*
  * Attach sc to the UDP socket that serves its outer source address,
  * creating, configuring, binding and registering that socket when no
  * socket for the address exists yet.
  *
  * If sc still references a socket for a different address, that socket
  * is torn down here once its softc list is empty.
  *
  * Returns 0 on success, with sc linked on gs->list and sc->gre_so set.
  * Otherwise returns the error from socreate(),
  * udp_set_kernel_tunneling(), sosetopt() or sobind(); whatever was
  * allocated for the new socket in this call is released before
  * returning.
  */
 static int
 in_gre_setup_socket(struct gre_softc *sc)
 {
 	struct sockopt sopt;
 	struct sockaddr_in sin;
 	struct in_gre_socket *s;
 	struct gre_socket *gs;
 	in_addr_t addr;
 	int error, value;
 
 	/*
 	 * NOTE: we are protected with gre_ioctl_sx lock.
 	 *
 	 * First check that socket is already configured.
 	 * If so, check that source address was not changed.
 	 * If address is different, check that there are no other tunnels
 	 * and close socket.
 	 */
 	addr = sc->gre_oip.ip_src.s_addr;
 	gs = sc->gre_so;
 	if (gs != NULL) {
 		s = __containerof(gs, struct in_gre_socket, base);
 		if (s->addr != addr) {
 			if (CK_LIST_EMPTY(&gs->list)) {
 				/*
 				 * Unhash and close the old socket; the
 				 * structure itself is handed to gre_sofree
 				 * through NET_EPOCH_CALL().
 				 */
 				CK_LIST_REMOVE(gs, chain);
 				soclose(gs->so);
 				NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx);
 			}
 			/* Either way sc no longer refers to the old socket. */
 			gs = sc->gre_so = NULL;
 		}
 	}
 
 	if (gs == NULL) {
 		/*
 		 * Check that socket for given address is already
 		 * configured.
 		 */
 		gs = in_gre_lookup_socket(addr);
 		if (gs == NULL) {
 			/* M_WAITOK allocation; no NULL check follows. */
 			s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO);
 			s->addr = addr;
 			gs = &s->base;
 
 			error = socreate(sc->gre_family, &gs->so,
 			    SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred,
 			    curthread);
 			if (error != 0) {
 				/* No socket exists yet, so only s is freed. */
 				if_printf(GRE2IFP(sc),
 				    "cannot create socket: %d\n", error);
 				free(s, M_GRE);
 				return (error);
 			}
 
 			/* Route received datagrams to in_gre_udp_input(). */
 			error = udp_set_kernel_tunneling(gs->so,
 			    in_gre_udp_input, NULL, gs);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot set UDP tunneling: %d\n", error);
 				goto fail;
 			}
 
 			/* Set IP_BINDANY (value 1) before binding to addr. */
 			memset(&sopt, 0, sizeof(sopt));
 			sopt.sopt_dir = SOPT_SET;
 			sopt.sopt_level = IPPROTO_IP;
 			sopt.sopt_name = IP_BINDANY;
 			sopt.sopt_val = &value;
 			sopt.sopt_valsize = sizeof(value);
 			value = 1;
 			error = sosetopt(gs->so, &sopt);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot set IP_BINDANY opt: %d\n", error);
 				goto fail;
 			}
 
 			memset(&sin, 0, sizeof(sin));
 			sin.sin_family = AF_INET;
 			sin.sin_len = sizeof(sin);
 			sin.sin_addr.s_addr = addr;
 			sin.sin_port = htons(GRE_UDPPORT);
 			error = sobind(gs->so, (struct sockaddr *)&sin,
 			    curthread);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot bind socket: %d\n", error);
 				goto fail;
 			}
 			/* Add socket to the chain */
 			CK_LIST_INSERT_HEAD(&GRE_SOCKHASH(addr), gs, chain);
 		}
 	}
 
 	/* Add softc to the socket's list */
 	CK_LIST_INSERT_HEAD(&gs->list, sc, chain);
 	sc->gre_so = gs;
 	return (0);
 fail:
 	/*
 	 * Only reached from the creation branch above, after socreate()
 	 * succeeded and before the socket was hashed: s is the structure
 	 * allocated there and gs->so the socket it owns.
 	 */
 	soclose(gs->so);
 	free(s, M_GRE);
 	return (error);
 }
 
 /*
  * Finish the outer header template of sc and link sc into the lookup
  * structures.
  *
  * With GRE_UDPENCAP the outer protocol is UDP and sc is linked through
  * in_gre_setup_socket(); otherwise the outer protocol is GRE and sc goes
  * into the (src, dst) hash.  In both cases sc is also added to the
  * source-address hash and IFF_DRV_RUNNING is recomputed.
  *
  * Returns 0, or the error from in_gre_setup_socket(), in which case sc
  * has not been linked anywhere.
  */
 static int
 in_gre_attach(struct gre_softc *sc)
 {
 	struct epoch_tracker et;
 	struct grehdr *gh;
 	int error;
 
 	if (sc->gre_options & GRE_UDPENCAP) {
 		sc->gre_csumflags = CSUM_UDP;
 		sc->gre_hlen = sizeof(struct greudp);
 		sc->gre_oip.ip_p = IPPROTO_UDP;
 		gh = &sc->gre_udphdr->gi_gre;
 		/* The UDP header is seeded with the pseudo-header sum. */
 		gre_update_udphdr(sc, &sc->gre_udp,
 		    in_pseudo(sc->gre_oip.ip_src.s_addr,
 		    sc->gre_oip.ip_dst.s_addr, 0));
 	} else {
 		sc->gre_hlen = sizeof(struct greip);
 		sc->gre_oip.ip_p = IPPROTO_GRE;
 		gh = &sc->gre_iphdr->gi_gre;
 	}
 	sc->gre_oip.ip_v = IPVERSION;
 	/* ip_hl counts 32-bit words. */
 	sc->gre_oip.ip_hl = sizeof(struct ip) >> 2;
 	gre_update_hdr(sc, gh);
 
 	/*
 	 * If we return error, this means that sc is not linked,
 	 * and caller should reset gre_family and free(sc->gre_hdr).
 	 */
 	if (sc->gre_options & GRE_UDPENCAP) {
 		error = in_gre_setup_socket(sc);
 		if (error != 0)
 			return (error);
 	} else
 		CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain);
 	CK_LIST_INSERT_HEAD(&GRE_SRCHASH(sc->gre_oip.ip_src.s_addr),
 	    sc, srchash);
 
 	/* Set IFF_DRV_RUNNING if interface is ready */
 	NET_EPOCH_ENTER(et);
 	in_gre_set_running(sc);
 	NET_EPOCH_EXIT(et);
 	return (0);
 }
 
 /*
  * Apply a GRESKEY, GRESOPTS or GRESPORT change to an IPv4 tunnel: unlink
  * sc from its lists, GRE_WAIT(), store the new value and re-attach with
  * in_gre_attach().
  *
  * Returns EEXIST when a GRESOPTS request flips GRE_UDPENCAP and
  * in_gre_checkdup() reports a clashing tunnel (nothing is modified in
  * that case); otherwise returns the result of in_gre_attach().  If the
  * re-attach fails, gre_family is cleared and the header is freed.
  */
 int
 in_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value)
 {
 	int error;
 
 	/* NOTE: we are protected with gre_ioctl_sx lock */
 	MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT);
 	MPASS(sc->gre_family == AF_INET);
 
 	/*
 	 * If we are going to change the encapsulation protocol, check
 	 * for duplicate tunnels. Return EEXIST here so as not to confuse
 	 * the user.
 	 */
 	if (cmd == GRESOPTS &&
 	    (sc->gre_options & GRE_UDPENCAP) != (value & GRE_UDPENCAP) &&
 	    in_gre_checkdup(sc, sc->gre_oip.ip_src.s_addr,
 		sc->gre_oip.ip_dst.s_addr, value) == EADDRNOTAVAIL)
 		return (EEXIST);
 
 	/* Unlink before modifying sc; it is re-linked by in_gre_attach(). */
 	CK_LIST_REMOVE(sc, chain);
 	CK_LIST_REMOVE(sc, srchash);
 	GRE_WAIT();
 	switch (cmd) {
 	case GRESKEY:
 		sc->gre_key = value;
 		break;
 	case GRESOPTS:
 		sc->gre_options = value;
 		break;
 	case GRESPORT:
 		sc->gre_port = value;
 		break;
 	}
 	error = in_gre_attach(sc);
 	if (error != 0) {
 		/*
 		 * NOTE(review): gre_hdr is freed but not reset to NULL;
 		 * presumably gre_family == 0 keeps it from being used
 		 * again -- confirm against the callers.
 		 */
 		sc->gre_family = 0;
 		free(sc->gre_hdr, M_GRE);
 	}
 	return (error);
 }
 
 /*
  * IPv4 handling of the tunnel address ioctls for a GRE softc.
  *
  * SIOCSIFPHYADDR: validate the outer addresses passed in an
  *   in_aliasreq, create the per-VNET hash tables on first use, refuse
  *   duplicates, then detach any existing configuration and attach sc
  *   with a newly allocated header template.
  * SIOCGIFPSRCADDR / SIOCGIFPDSTADDR: report the outer source or
  *   destination address in ifr_addr, subject to prison_if().
  *
  * Returns 0 or an errno value; any other command yields EINVAL.
  */
 int
 in_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct sockaddr_in *dst, *src;
 	struct ip *ip;
 	int error;
 
 	/* NOTE: we are protected with gre_ioctl_sx lock */
 	error = EINVAL;
 	switch (cmd) {
 	case SIOCSIFPHYADDR:
 		src = &((struct in_aliasreq *)data)->ifra_addr;
 		dst = &((struct in_aliasreq *)data)->ifra_dstaddr;
 
 		/* sanity checks */
 		/* A failure here leaves error at EINVAL. */
 		if (src->sin_family != dst->sin_family ||
 		    src->sin_family != AF_INET ||
 		    src->sin_len != dst->sin_len ||
 		    src->sin_len != sizeof(*src))
 			break;
 		if (src->sin_addr.s_addr == INADDR_ANY ||
 		    dst->sin_addr.s_addr == INADDR_ANY) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		/* First configured tunnel in this VNET: create the tables. */
 		if (V_ipv4_hashtbl == NULL) {
 			V_ipv4_hashtbl = gre_hashinit();
 			V_ipv4_srchashtbl = gre_hashinit();
 			V_ipv4_sockets = (struct gre_sockets *)gre_hashinit();
 		}
 		error = in_gre_checkdup(sc, src->sin_addr.s_addr,
 		    dst->sin_addr.s_addr, sc->gre_options);
 		if (error == EADDRNOTAVAIL)
 			break;
 		if (error == EEXIST) {
 			/* Addresses are the same. Just return. */
 			error = 0;
 			break;
 		}
 		/*
 		 * NOTE(review): sized for struct greudp plus three 32-bit
 		 * words, presumably room for optional GRE header fields --
 		 * confirm against gre_update_hdr().
 		 */
 		ip = malloc(sizeof(struct greudp) + 3 * sizeof(uint32_t),
 		    M_GRE, M_WAITOK | M_ZERO);
 		ip->ip_src.s_addr = src->sin_addr.s_addr;
 		ip->ip_dst.s_addr = dst->sin_addr.s_addr;
 		if (sc->gre_family != 0) {
 			/* Detach existing tunnel first */
 			CK_LIST_REMOVE(sc, chain);
 			CK_LIST_REMOVE(sc, srchash);
 			GRE_WAIT();
 			free(sc->gre_hdr, M_GRE);
 			/* XXX: should we notify about link state change? */
 		}
 		sc->gre_family = AF_INET;
 		sc->gre_hdr = ip;
 		sc->gre_oseq = 0;
 		sc->gre_iseq = UINT32_MAX;
 		error = in_gre_attach(sc);
 		if (error != 0) {
 			/* Attach failed: sc is unlinked, drop the new header. */
 			sc->gre_family = 0;
 			free(sc->gre_hdr, M_GRE);
 		}
 		break;
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 		if (sc->gre_family != AF_INET) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		src = (struct sockaddr_in *)&ifr->ifr_addr;
 		memset(src, 0, sizeof(*src));
 		src->sin_family = AF_INET;
 		src->sin_len = sizeof(*src);
 		src->sin_addr = (cmd == SIOCGIFPSRCADDR) ?
 		    sc->gre_oip.ip_src: sc->gre_oip.ip_dst;
 		error = prison_if(curthread->td_ucred, (struct sockaddr *)src);
 		/* Do not hand back the address when prison_if() refuses. */
 		if (error != 0)
 			memset(src, 0, sizeof(*src));
 		break;
 	}
 	return (error);
 }
 
 /*
  * Complete the outer IPv4 header at the front of m and send the packet
  * with ip_output().
  *
  * af is the family of the encapsulated packet and hlen the offset of
  * its header inside m.  TOS and ID of the outer header are copied from
  * an inner IPv4 header; for an inner IPv6 packet (INET6 kernels only)
  * TOS is zeroed and the ID comes from ip_fillid().  TTL is taken from
  * V_ip_gre_ttl and ip_len from the packet header length.
  *
  * Returns the result of ip_output().
  */
 int
 in_gre_output(struct mbuf *m, int af, int hlen)
 {
 	struct greip *outer;
 
 	outer = mtod(m, struct greip *);
 	if (af == AF_INET) {
 		/*
 		 * The headers were prepended by gre_transmit() using
 		 * M_PREPEND(), which does not guarantee that more than
 		 * hlen bytes are contiguous at m_data.  Read the inner
 		 * fields with m_copydata() so no m_pullup() is needed.
 		 */
 		m_copydata(m, hlen + offsetof(struct ip, ip_tos),
 		    sizeof(u_char), &outer->gi_ip.ip_tos);
 		m_copydata(m, hlen + offsetof(struct ip, ip_id),
 		    sizeof(u_short), (caddr_t)&outer->gi_ip.ip_id);
 	}
 #ifdef INET6
 	if (af == AF_INET6) {
 		outer->gi_ip.ip_tos = 0; /* XXX */
 		ip_fillid(&outer->gi_ip);
 	}
 #endif
 	outer->gi_ip.ip_ttl = V_ip_gre_ttl;
 	outer->gi_ip.ip_len = htons(m->m_pkthdr.len);
 	return (ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL));
 }
 
 /*
  * Handles for the two registrations made in in_gre_init() and undone in
  * in_gre_uninit().  They are plain file-scope statics, not per-VNET.
  */
 static const struct srcaddrtab *ipv4_srcaddrtab = NULL;
 static const struct encaptab *ecookie = NULL;
 /*
  * Encap configuration for IPPROTO_GRE: in_gre_lookup() picks the softc
  * and gre_input() is the input handler.  min_length is one greip header
  * followed by one IPv4 header.
  */
 static const struct encap_config ipv4_encap_cfg = {
 	.proto = IPPROTO_GRE,
 	.min_length = sizeof(struct greip) + sizeof(struct ip),
 	.exact_match = ENCAP_DRV_LOOKUP,
 	.lookup = in_gre_lookup,
 	.input = gre_input
 };
 
 /*
  * Register the source-address callback and the IPPROTO_GRE encap
  * handler.  Both registrations are made only when running in the
  * default VNET; in any other VNET this function does nothing.
  */
 void
 in_gre_init(void)
 {
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ipv4_srcaddrtab = ip_encap_register_srcaddr(in_gre_srcaddr,
 		    NULL, M_WAITOK);
 		ecookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK);
 	}
 }
 
 /*
  * Undo in_gre_init() (default VNET only) and release the current VNET's
  * hash tables if they were ever allocated.
  */
 void
 in_gre_uninit(void)
 {
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ip_encap_detach(ecookie);
 		ip_encap_unregister_srcaddr(ipv4_srcaddrtab);
 	}
 	/* The tables are created lazily; nothing to free if unused. */
 	if (V_ipv4_hashtbl == NULL)
 		return;
 
 	gre_hashdestroy(V_ipv4_hashtbl);
 	/* Cleared before GRE_WAIT(); other functions test it for NULL. */
 	V_ipv4_hashtbl = NULL;
 	GRE_WAIT();
 	gre_hashdestroy(V_ipv4_srchashtbl);
 	gre_hashdestroy((struct gre_list *)V_ipv4_sockets);
 }
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index c108a8f3ab05..98f290486ec5 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -1,1169 +1,1170 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/sctp.h>
 #include <netinet/tcp.h>
 #include <netinet/tcpip.h>
 #include <netinet/icmp_var.h>
 
 #ifdef INET
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 #endif /* INET */
 
 /* Only declared here; the array is defined outside this file. */
 extern ipproto_ctlinput_t	*ip_ctlprotox[];
 
 /*
  * ICMP routines: error generation, receive packet processing, and
  * routines to turnaround packets back to the originator, and
  * host table maintenance routines.
  */
 /* Per-VNET response rate limit, default 200 (sysctl "icmplim"). */
 VNET_DEFINE_STATIC(int, icmplim) = 200;
 #define	V_icmplim			VNET(icmplim)
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_ICMPLIM, icmplim, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim), 0,
 	"Maximum number of ICMP responses per second");
 
 /*
  * icmplim_curr_jitter has no SYSCTL entry in this group, unlike
  * icmplim_jitter (default 16) that follows it.
  */
 VNET_DEFINE_STATIC(int, icmplim_curr_jitter) = 0;
 #define V_icmplim_curr_jitter		VNET(icmplim_curr_jitter)
 VNET_DEFINE_STATIC(int, icmplim_jitter) = 16;
 #define	V_icmplim_jitter		VNET(icmplim_jitter)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_jitter, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim_jitter), 0,
 	"Random icmplim jitter adjustment limit");
 
 /* Logging of rate limiting is enabled by default (1). */
 VNET_DEFINE_STATIC(int, icmplim_output) = 1;
 #define	V_icmplim_output		VNET(icmplim_output)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, icmplim_output, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmplim_output), 0,
 	"Enable logging of ICMP response rate limiting");
 
 #ifdef INET
 /* Per-VNET ICMP statistics, exported through the "stats" sysctl. */
 VNET_PCPUSTAT_DEFINE(struct icmpstat, icmpstat);
 VNET_PCPUSTAT_SYSINIT(icmpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_icmp, ICMPCTL_STATS, stats, struct icmpstat,
     icmpstat, "ICMP statistics (struct icmpstat, netinet/icmp_var.h)");
 
 /* The matching SYSUNINIT is only emitted for VIMAGE kernels. */
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmpstat);
 #endif /* VIMAGE */
 
 /*
  * The tunables below are per-VNET (CTLFLAG_VNET) and read-write
  * (CTLFLAG_RW) under _net_inet_icmp; the initializer of each variable is
  * its default value.
  */
 VNET_DEFINE_STATIC(int, icmpmaskrepl) = 0;
 #define	V_icmpmaskrepl			VNET(icmpmaskrepl)
 SYSCTL_INT(_net_inet_icmp, ICMPCTL_MASKREPL, maskrepl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskrepl), 0,
 	"Reply to ICMP Address Mask Request packets");
 
 VNET_DEFINE_STATIC(u_int, icmpmaskfake) = 0;
 #define	V_icmpmaskfake			VNET(icmpmaskfake)
 SYSCTL_UINT(_net_inet_icmp, OID_AUTO, maskfake, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpmaskfake), 0,
 	"Fake reply to ICMP Address Mask Request packets");
 
 /* Defined with VNET_DEFINE rather than VNET_DEFINE_STATIC, i.e. not file-local. */
 VNET_DEFINE(int, drop_redirect) = 0;
 #define	V_drop_redirect			VNET(drop_redirect)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, drop_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(drop_redirect), 0,
 	"Ignore ICMP redirects");
 
 VNET_DEFINE_STATIC(int, log_redirect) = 0;
 #define	V_log_redirect			VNET(log_redirect)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, log_redirect, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(log_redirect), 0,
 	"Log ICMP redirects to the console");
 
 VNET_DEFINE_STATIC(int, redirtimeout) = 60 * 10; /* 10 minutes */
 #define	V_redirtimeout			VNET(redirtimeout)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(redirtimeout), 0,
 	"Delay in seconds before expiring redirect route");
 
 /*
  * String tunable stored in an IFNAMSIZ-byte buffer; presumably it holds
  * an interface name -- confirm at the place it is consumed.
  */
 VNET_DEFINE_STATIC(char, reply_src[IFNAMSIZ]);
 #define	V_reply_src			VNET(reply_src)
 SYSCTL_STRING(_net_inet_icmp, OID_AUTO, reply_src, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(reply_src), IFNAMSIZ,
 	"ICMP reply source for non-local packets");
 
 VNET_DEFINE_STATIC(int, icmp_rfi) = 0;
 #define	V_icmp_rfi			VNET(icmp_rfi)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, reply_from_interface, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_rfi), 0,
 	"ICMP reply from incoming interface for non-local packets");
 /* Router requirements RFC 1812 section 4.3.2.3 requires 576 - 28. */
 VNET_DEFINE_STATIC(int, icmp_quotelen) = 548;
 #define	V_icmp_quotelen			VNET(icmp_quotelen)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, quotelen, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp_quotelen), 0,
 	"Number of bytes from original packet to quote in ICMP reply");
 
 VNET_DEFINE_STATIC(int, icmpbmcastecho) = 0;
 #define	V_icmpbmcastecho		VNET(icmpbmcastecho)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, bmcastecho, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmpbmcastecho), 0,
 	"Reply to multicast ICMP Echo Request and Timestamp packets");
 
 VNET_DEFINE_STATIC(int, icmptstamprepl) = 1;
 #define	V_icmptstamprepl		VNET(icmptstamprepl)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, tstamprepl, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmptstamprepl), 0,
 	"Respond to ICMP Timestamp packets");
 
 VNET_DEFINE_STATIC(int, error_keeptags) = 0;
 #define	V_error_keeptags		VNET(error_keeptags)
 SYSCTL_INT(_net_inet_icmp, OID_AUTO, error_keeptags, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(error_keeptags), 0,
 	"ICMP error response keeps copy of mbuf_tags of original packet");
 
 /* Debug printf switch; only compiled in when ICMPPRINTFS is defined. */
 #ifdef ICMPPRINTFS
 int	icmpprintfs = 0;
 #endif
 
 /* Forward declarations of file-local helpers. */
 static void	icmp_reflect(struct mbuf *);
 static void	icmp_send(struct mbuf *, struct mbuf *);
 static int	icmp_verify_redirect_gateway(struct sockaddr_in *,
     struct sockaddr_in *, struct sockaddr_in *, u_int);
 
 /*
  * Kernel module interface for updating icmpstat.  The argument is an index
  * into icmpstat treated as an array of u_long.  While this encodes the
  * general layout of icmpstat into the caller, it doesn't encode its
  * location, so that future changes to add, for example, per-CPU stats
  * support won't cause binary compatibility problems for kernel modules.
  *
  * statnum is used as the array index without any range check, and the
  * selected slot is incremented by one through counter_u64_add().
  * NOTE(review): the text above speaks of u_long slots while the code
  * uses the counter_u64 API -- confirm which description is current.
  */
 void
 kmod_icmpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmpstat)[statnum], 1);
 }
 
 /*
  * Generate an error packet of type error
  * in response to bad packet ip.
  */
 void
 icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu)
 {
 	struct ip *oip, *nip;
 	struct icmp *icp;
 	struct mbuf *m;
 	unsigned icmplen, icmpelen, nlen, oiphlen;
 
 	KASSERT((u_int)type <= ICMP_MAXTYPE, ("%s: illegal ICMP type",
 	    __func__));
 
 	if (type != ICMP_REDIRECT)
 		ICMPSTAT_INC(icps_error);
 	/*
 	 * Don't send error:
 	 *  if the original packet was encrypted.
 	 *  if not the first fragment of message.
 	 *  in response to a multicast or broadcast packet.
 	 *  if the old packet protocol was an ICMP error message.
 	 */
 	if (n->m_flags & M_DECRYPTED)
 		goto freeit;
 	if (n->m_flags & (M_BCAST|M_MCAST))
 		goto freeit;
 
 	/* Drop if IP header plus 8 bytes is not contiguous in first mbuf. */
 	if (n->m_len < sizeof(struct ip) + ICMP_MINLEN)
 		goto freeit;
 	oip = mtod(n, struct ip *);
 	oiphlen = oip->ip_hl << 2;
 	if (n->m_len < oiphlen + ICMP_MINLEN)
 		goto freeit;
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_error(%p, %x, %d)\n", oip, type, code);
 #endif
 	if (oip->ip_off & htons(~(IP_MF|IP_DF)))
 		goto freeit;
 	if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
 	    !ICMP_INFOTYPE(((struct icmp *)((caddr_t)oip +
 		oiphlen))->icmp_type)) {
 		ICMPSTAT_INC(icps_oldicmp);
 		goto freeit;
 	}
 	/*
 	 * Calculate length to quote from original packet and
 	 * prevent the ICMP mbuf from overflowing.
 	 * Unfortunately this is non-trivial since ip_forward()
 	 * sends us truncated packets.
 	 */
 	nlen = m_length(n, NULL);
 	if (oip->ip_p == IPPROTO_TCP) {
 		struct tcphdr *th;
 		int tcphlen;
 
 		if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
 		    n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + sizeof(struct tcphdr) &&
 		    (n = m_pullup(n, oiphlen + sizeof(struct tcphdr))) == NULL)
 			goto freeit;
 		oip = mtod(n, struct ip *);
 		th = mtodo(n, oiphlen);
 		tcphlen = th->th_off << 2;
 		if (tcphlen < sizeof(struct tcphdr))
 			goto freeit;
 		if (ntohs(oip->ip_len) < oiphlen + tcphlen)
 			goto freeit;
 		if (oiphlen + tcphlen > n->m_len && n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + tcphlen &&
 		    (n = m_pullup(n, oiphlen + tcphlen)) == NULL)
 			goto freeit;
 		oip = mtod(n, struct ip *);
 		icmpelen = max(tcphlen, min(V_icmp_quotelen,
 		    ntohs(oip->ip_len) - oiphlen));
 	} else if (oip->ip_p == IPPROTO_SCTP) {
 		struct sctphdr *sh;
 		struct sctp_chunkhdr *ch;
 
 		if (ntohs(oip->ip_len) < oiphlen + sizeof(struct sctphdr))
 			goto stdreply;
 		if (oiphlen + sizeof(struct sctphdr) > n->m_len &&
 		    n->m_next == NULL)
 			goto stdreply;
 		if (n->m_len < oiphlen + sizeof(struct sctphdr) &&
 		    (n = m_pullup(n, oiphlen + sizeof(struct sctphdr))) == NULL)
 			goto freeit;
 		oip = mtod(n, struct ip *);
 		icmpelen = max(sizeof(struct sctphdr),
 		    min(V_icmp_quotelen, ntohs(oip->ip_len) - oiphlen));
 		sh = mtodo(n, oiphlen);
 		if (ntohl(sh->v_tag) == 0 &&
 		    ntohs(oip->ip_len) >= oiphlen +
 		    sizeof(struct sctphdr) + 8 &&
 		    (n->m_len >= oiphlen + sizeof(struct sctphdr) + 8 ||
 		     n->m_next != NULL)) {
 			if (n->m_len < oiphlen + sizeof(struct sctphdr) + 8 &&
 			    (n = m_pullup(n, oiphlen +
 			    sizeof(struct sctphdr) + 8)) == NULL)
 				goto freeit;
 			oip = mtod(n, struct ip *);
 			sh = mtodo(n, oiphlen);
 			ch = (struct sctp_chunkhdr *)(sh + 1);
 			if (ch->chunk_type == SCTP_INITIATION) {
 				icmpelen = max(sizeof(struct sctphdr) + 8,
 				    min(V_icmp_quotelen, ntohs(oip->ip_len) -
 				    oiphlen));
 			}
 		}
 	} else
 stdreply:	icmpelen = max(8, min(V_icmp_quotelen, ntohs(oip->ip_len) -
 		    oiphlen));
 
 	icmplen = min(oiphlen + icmpelen, nlen);
 	if (icmplen < sizeof(struct ip))
 		goto freeit;
 
 	if (MHLEN > sizeof(struct ip) + ICMP_MINLEN + icmplen)
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	else
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto freeit;
 #ifdef MAC
 	mac_netinet_icmp_reply(n, m);
 #endif
 	icmplen = min(icmplen, M_TRAILINGSPACE(m) -
 	    sizeof(struct ip) - ICMP_MINLEN);
 	m_align(m, sizeof(struct ip) + ICMP_MINLEN + icmplen);
 	m->m_data += sizeof(struct ip);
 	m->m_len = ICMP_MINLEN + icmplen;
 
 	/* XXX MRT  make the outgoing packet use the same FIB
 	 * that was associated with the incoming packet
 	 */
 	M_SETFIB(m, M_GETFIB(n));
 	icp = mtod(m, struct icmp *);
 	ICMPSTAT_INC(icps_outhist[type]);
 	icp->icmp_type = type;
 	if (type == ICMP_REDIRECT)
 		icp->icmp_gwaddr.s_addr = dest;
 	else {
 		icp->icmp_void = 0;
 		/*
 		 * The following assignments assume an overlay with the
 		 * just zeroed icmp_void field.
 		 */
 		if (type == ICMP_PARAMPROB) {
 			icp->icmp_pptr = code;
 			code = 0;
 		} else if (type == ICMP_UNREACH &&
 			code == ICMP_UNREACH_NEEDFRAG && mtu) {
 			icp->icmp_nextmtu = htons(mtu);
 		}
 	}
 	icp->icmp_code = code;
 
 	/*
 	 * Copy the quotation into ICMP message and
 	 * convert quoted IP header back to network representation.
 	 */
 	m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
 	nip = &icp->icmp_ip;
 
 	/*
 	 * Set up ICMP message mbuf and copy old IP header (without options
 	 * in front of ICMP message.
 	 * If the original mbuf was meant to bypass the firewall, the error
 	 * reply should bypass as well.
 	 */
 	m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
 	KASSERT(M_LEADINGSPACE(m) >= sizeof(struct ip),
 	    ("insufficient space for ip header"));
 	m->m_data -= sizeof(struct ip);
 	m->m_len += sizeof(struct ip);
 	m->m_pkthdr.len = m->m_len;
 	m->m_pkthdr.rcvif = n->m_pkthdr.rcvif;
 	nip = mtod(m, struct ip *);
 	bcopy((caddr_t)oip, (caddr_t)nip, sizeof(struct ip));
 	nip->ip_len = htons(m->m_len);
 	nip->ip_v = IPVERSION;
 	nip->ip_hl = 5;
 	nip->ip_p = IPPROTO_ICMP;
 	nip->ip_tos = 0;
 	nip->ip_off = 0;
 
 	if (V_error_keeptags)
 		m_tag_copy_chain(m, n, M_NOWAIT);
 
 	icmp_reflect(m);
 
 freeit:
 	m_freem(n);
 }
 
 /*
  * Translate the type/code pair of an ICMP error message into an errno
  * value.  Any type or code without a mapping below yields 0.
  */
 int
 icmp_errmap(const struct icmp *icp)
 {
 	int error = 0;
 
 	if (icp->icmp_type == ICMP_UNREACH) {
 		switch (icp->icmp_code) {
 		case ICMP_UNREACH_NEEDFRAG:
 			error = EMSGSIZE;
 			break;
 		case ICMP_UNREACH_PROTOCOL:
 		case ICMP_UNREACH_PORT:
 		case ICMP_UNREACH_NET_PROHIB:
 		case ICMP_UNREACH_HOST_PROHIB:
 		case ICMP_UNREACH_FILTER_PROHIB:
 			error = ECONNREFUSED;
 			break;
 		case ICMP_UNREACH_NET:
 		case ICMP_UNREACH_HOST:
 		case ICMP_UNREACH_SRCFAIL:
 		case ICMP_UNREACH_NET_UNKNOWN:
 		case ICMP_UNREACH_HOST_UNKNOWN:
 		case ICMP_UNREACH_ISOLATED:
 		case ICMP_UNREACH_TOSNET:
 		case ICMP_UNREACH_TOSHOST:
 		case ICMP_UNREACH_HOST_PRECEDENCE:
 		case ICMP_UNREACH_PRECEDENCE_CUTOFF:
 			error = EHOSTUNREACH;
 			break;
 		default:
 			break;
 		}
 	} else if (icp->icmp_type == ICMP_TIMXCEED) {
 		/* Only "exceeded in transit" is mapped. */
 		if (icp->icmp_code == ICMP_TIMXCEED_INTRANS)
 			error = EHOSTUNREACH;
 	} else if (icp->icmp_type == ICMP_PARAMPROB) {
 		if (icp->icmp_code == ICMP_PARAMPROB_ERRATPTR ||
 		    icp->icmp_code == ICMP_PARAMPROB_OPTABSENT)
 			error = ENOPROTOOPT;
 	}
 
 	return (error);
 }
 
 /*
  * Process a received ICMP message.
  *
  * mp    - in/out: points at the mbuf chain holding the IP packet.  It is
  *         cleared on entry and only set again at the "raw" label, just
  *         before the packet is handed to rip_input().
  * offp  - offset of the ICMP header from the start of the packet; it is
  *         used here as the length of the IP header.
  * proto - passed through unchanged to rip_input().
  *
  * Every return in this function returns IPPROTO_DONE.
  */
 int
 icmp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct icmp *icp;
 	struct in_ifaddr *ia;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct sockaddr_in icmpsrc, icmpdst, icmpgw;
 	int hlen = *offp;
 	int icmplen = ntohs(ip->ip_len) - *offp;
 	int i, code;
 	int fibnum;
 
 	NET_EPOCH_ASSERT();
 
 	/* The caller's pointer is restored only on the "raw" path below. */
 	*mp = NULL;
 
 	/*
 	 * Locate icmp structure in mbuf, and check
 	 * that not corrupted and of at least minimum length.
 	 */
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char srcbuf[INET_ADDRSTRLEN];
 		char dstbuf[INET_ADDRSTRLEN];
 
 		printf("icmp_input from %s to %s, len %d\n",
 		    inet_ntoa_r(ip->ip_src, srcbuf),
 		    inet_ntoa_r(ip->ip_dst, dstbuf), icmplen);
 	}
 #endif
 	if (icmplen < ICMP_MINLEN) {
 		ICMPSTAT_INC(icps_tooshort);
 		goto freeit;
 	}
 	/* Make the IP header plus up to ICMP_ADVLENMIN ICMP bytes contiguous. */
 	i = hlen + min(icmplen, ICMP_ADVLENMIN);
 	if (m->m_len < i && (m = m_pullup(m, i)) == NULL)  {
 		ICMPSTAT_INC(icps_tooshort);
 		return (IPPROTO_DONE);
 	}
 	ip = mtod(m, struct ip *);
 	/*
 	 * Step over the IP header while checksumming so that in_cksum()
 	 * starts at the ICMP header, then undo the adjustment.
 	 */
 	m->m_len -= hlen;
 	m->m_data += hlen;
 	icp = mtod(m, struct icmp *);
 	if (in_cksum(m, icmplen)) {
 		ICMPSTAT_INC(icps_checksum);
 		goto freeit;
 	}
 	m->m_len += hlen;
 	m->m_data -= hlen;
 
 #ifdef ICMPPRINTFS
 	if (icmpprintfs)
 		printf("icmp_input, type %d code %d\n", icp->icmp_type,
 		    icp->icmp_code);
 #endif
 
 	/*
 	 * Message type specific processing.
 	 */
 	/* Types above ICMP_MAXTYPE skip the switch and the inhist counter. */
 	if (icp->icmp_type > ICMP_MAXTYPE)
 		goto raw;
 
 	/* Initialize */
 	bzero(&icmpsrc, sizeof(icmpsrc));
 	icmpsrc.sin_len = sizeof(struct sockaddr_in);
 	icmpsrc.sin_family = AF_INET;
 	bzero(&icmpdst, sizeof(icmpdst));
 	icmpdst.sin_len = sizeof(struct sockaddr_in);
 	icmpdst.sin_family = AF_INET;
 	bzero(&icmpgw, sizeof(icmpgw));
 	icmpgw.sin_len = sizeof(struct sockaddr_in);
 	icmpgw.sin_family = AF_INET;
 
 	ICMPSTAT_INC(icps_inhist[icp->icmp_type]);
 	code = icp->icmp_code;
 	switch (icp->icmp_type) {
 	case ICMP_UNREACH:
 		if (code > ICMP_UNREACH_PRECEDENCE_CUTOFF)
 			goto badcode;
 		else
 			goto deliver;
 
 	case ICMP_TIMXCEED:
 		if (code > ICMP_TIMXCEED_REASS)
 			goto badcode;
 		else
 			goto deliver;
 
 	case ICMP_PARAMPROB:
 		if (code > ICMP_PARAMPROB_LENGTH)
 			goto badcode;
 		/* A valid code falls through into "deliver". */
 
 	deliver:
 		/*
 		 * Problem with datagram; advise higher level routines.
 		 */
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			goto freeit;
 		}
 		/* Discard ICMP's in response to multicast packets */
 		if (IN_MULTICAST(ntohl(icp->icmp_ip.ip_dst.s_addr)))
 			goto badcode;
 		/* Filter out responses to INADDR_ANY, protocols ignore it. */
 		if (icp->icmp_ip.ip_dst.s_addr == INADDR_ANY)
 			goto freeit;
 #ifdef ICMPPRINTFS
 		if (icmpprintfs)
 			printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
 #endif
 		/*
 		 * XXX if the packet contains [IPv4 AH TCP], we can't make a
 		 * notification to TCP layer.
 		 */
 		i = sizeof(struct ip) + min(icmplen, ICMP_ADVLENPREF(icp));
 		ip_stripoptions(m);
 		if (m->m_len < i && (m = m_pullup(m, i)) == NULL) {
 			/* This should actually not happen */
 			ICMPSTAT_INC(icps_tooshort);
 			return (IPPROTO_DONE);
 		}
 		/* Re-derive both pointers: the data may have moved above. */
 		ip = mtod(m, struct ip *);
 		icp = (struct icmp *)(ip + 1);
 		/*
 		 * The upper layer handler can rely on:
 		 * - The outer IP header has no options.
 		 * - The outer IP header, the ICMP header, the inner IP header,
 		 *   and the first n bytes of the inner payload are contiguous.
 		 *   n is at least 8, but might be larger based on
 		 *   ICMP_ADVLENPREF. See its definition in ip_icmp.h.
 		 */
 		if (ip_ctlprotox[icp->icmp_ip.ip_p] != NULL)
 			ip_ctlprotox[icp->icmp_ip.ip_p](icp);
 		break;
 
 	badcode:
 		/* Counted as a bad code; the packet still reaches "raw". */
 		ICMPSTAT_INC(icps_badcode);
 		break;
 
 	case ICMP_ECHO:
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcastecho);
 			break;
 		}
 		if (badport_bandlim(BANDLIM_ICMP_ECHO) < 0)
 			goto freeit;
 		/* Rewrite the request into a reply in place. */
 		icp->icmp_type = ICMP_ECHOREPLY;
 		goto reflect;
 
 	case ICMP_TSTAMP:
 		if (V_icmptstamprepl == 0)
 			break;
 		if (!V_icmpbmcastecho
 		    && (m->m_flags & (M_MCAST | M_BCAST)) != 0) {
 			ICMPSTAT_INC(icps_bmcasttstamp);
 			break;
 		}
 		if (icmplen < ICMP_TSLEN) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		if (badport_bandlim(BANDLIM_ICMP_TSTAMP) < 0)
 			goto freeit;
 		icp->icmp_type = ICMP_TSTAMPREPLY;
 		icp->icmp_rtime = iptime();
 		icp->icmp_ttime = icp->icmp_rtime;	/* bogus, do later! */
 		goto reflect;
 
 	case ICMP_MASKREQ:
 		if (V_icmpmaskrepl == 0)
 			break;
 		/*
 		 * We are not able to respond with all ones broadcast
 		 * unless we receive it over a point-to-point interface.
 		 */
 		if (icmplen < ICMP_MASKLEN)
 			break;
 		switch (ip->ip_dst.s_addr) {
 		case INADDR_BROADCAST:
 		case INADDR_ANY:
 			icmpdst.sin_addr = ip->ip_src;
 			break;
 
 		default:
 			icmpdst.sin_addr = ip->ip_dst;
 		}
 		ia = (struct in_ifaddr *)ifaof_ifpforaddr(
 			    (struct sockaddr *)&icmpdst, m->m_pkthdr.rcvif);
 		if (ia == NULL)
 			break;
 		if (ia->ia_ifp == NULL)
 			break;
 		icp->icmp_type = ICMP_MASKREPLY;
 		/* A non-zero V_icmpmaskfake overrides the interface's real mask. */
 		if (V_icmpmaskfake == 0)
 			icp->icmp_mask = ia->ia_sockmask.sin_addr.s_addr;
 		else
 			icp->icmp_mask = V_icmpmaskfake;
 		if (ip->ip_src.s_addr == 0) {
 			if (ia->ia_ifp->if_flags & IFF_BROADCAST)
 			    ip->ip_src = satosin(&ia->ia_broadaddr)->sin_addr;
 			else if (ia->ia_ifp->if_flags & IFF_POINTOPOINT)
 			    ip->ip_src = satosin(&ia->ia_dstaddr)->sin_addr;
 		}
 		/*
 		 * Shared tail for the ECHO, TSTAMP and MASKREQ replies.  The
 		 * function returns right after icmp_reflect(), so these
 		 * packets never reach the "raw" label.
 		 */
 reflect:
 		ICMPSTAT_INC(icps_reflect);
 		ICMPSTAT_INC(icps_outhist[icp->icmp_type]);
 		icmp_reflect(m);
 		return (IPPROTO_DONE);
 
 	case ICMP_REDIRECT:
 		if (V_log_redirect) {
 			u_long src, dst, gw;
 
 			src = ntohl(ip->ip_src.s_addr);
 			dst = ntohl(icp->icmp_ip.ip_dst.s_addr);
 			gw = ntohl(icp->icmp_gwaddr.s_addr);
 			printf("icmp redirect from %d.%d.%d.%d: "
 			       "%d.%d.%d.%d => %d.%d.%d.%d\n",
 			       (int)(src >> 24), (int)((src >> 16) & 0xff),
 			       (int)((src >> 8) & 0xff), (int)(src & 0xff),
 			       (int)(dst >> 24), (int)((dst >> 16) & 0xff),
 			       (int)((dst >> 8) & 0xff), (int)(dst & 0xff),
 			       (int)(gw >> 24), (int)((gw >> 16) & 0xff),
 			       (int)((gw >> 8) & 0xff), (int)(gw & 0xff));
 		}
 		/*
 		 * RFC1812 says we must ignore ICMP redirects if we
 		 * are acting as router.
 		 */
 		if (V_drop_redirect || V_ipforwarding)
 			break;
 		if (code > 3)
 			goto badcode;
 		if (icmplen < ICMP_ADVLENMIN || icmplen < ICMP_ADVLEN(icp) ||
 		    icp->icmp_ip.ip_hl < (sizeof(struct ip) >> 2)) {
 			ICMPSTAT_INC(icps_badlen);
 			break;
 		}
 		/*
 		 * Short circuit routing redirects to force
 		 * immediate change in the kernel's routing
 		 * tables.  The message is also handed to anyone
 		 * listening on a raw socket (e.g. the routing
 		 * daemon for use in updating its tables).
 		 */
 		icmpgw.sin_addr = ip->ip_src;
 		icmpdst.sin_addr = icp->icmp_gwaddr;
 #ifdef	ICMPPRINTFS
 		if (icmpprintfs) {
 			char dstbuf[INET_ADDRSTRLEN];
 			char gwbuf[INET_ADDRSTRLEN];
 
 			printf("redirect dst %s to %s\n",
 			       inet_ntoa_r(icp->icmp_ip.ip_dst, dstbuf),
 			       inet_ntoa_r(icp->icmp_gwaddr, gwbuf));
 		}
 #endif
 		icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
 
 		/*
 		 * RFC 1122 says network (code 0,2) redirects SHOULD
 		 * be treated identically to the host redirects.
 		 * Given that, ignore network masks.
 		 */
 
 		/*
 		 * Variable values:
 		 * icmpsrc: route destination
 		 * icmpdst: route gateway
 		 * icmpgw: message source
 		 */
 
 		if (icmp_verify_redirect_gateway(&icmpgw, &icmpsrc, &icmpdst,
 		    M_GETFIB(m)) != 0) {
 			/* TODO: increment bad redirects here */
 			break;
 		}
 
 		/*
 		 * The redirect was verified against the packet's own FIB
 		 * above, but it is installed in every FIB.
 		 */
 		for ( fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			rib_add_redirect(fibnum, (struct sockaddr *)&icmpsrc,
 			    (struct sockaddr *)&icmpdst,
 			    (struct sockaddr *)&icmpgw, m->m_pkthdr.rcvif,
 			    RTF_GATEWAY, V_redirtimeout);
 		}
 		break;
 
 	/*
 	 * No kernel processing for the following;
 	 * just fall through to send to raw listener.
 	 */
 	case ICMP_ECHOREPLY:
 	case ICMP_ROUTERADVERT:
 	case ICMP_ROUTERSOLICIT:
 	case ICMP_TSTAMPREPLY:
 	case ICMP_IREQREPLY:
 	case ICMP_MASKREPLY:
 	case ICMP_SOURCEQUENCH:
 	default:
 		break;
 	}
 
 	/* Give the packet back through *mp and pass it to rip_input(). */
 raw:
 	*mp = m;
 	rip_input(mp, offp, proto);
 	return (IPPROTO_DONE);
 
 	/* Drop path: the packet goes neither to raw sockets nor back out. */
 freeit:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Reflect the ip packet back to the source
  *
  * The IP header at the front of m is rewritten in place: the old source
  * becomes the destination and a local address, chosen by the steps
  * below, becomes the new source.  Record-route, timestamp and security
  * options found in the header are copied into a separate options mbuf,
  * the options are stripped from m, and the result goes to icmp_send().
  * If the source address is rejected or no route back is found, m is
  * freed and nothing is sent.
  */
 static void
 icmp_reflect(struct mbuf *m)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in_ifaddr *ia;
 	struct in_addr t;
 	struct nhop_object *nh;
 	struct mbuf *opts = NULL;
 	int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
 
 	NET_EPOCH_ASSERT();
 
 	if (IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 	    (IN_EXPERIMENTAL(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net240) ||
 	    (IN_ZERONET(ntohl(ip->ip_src.s_addr)) && !V_ip_allow_net0) ) {
 		m_freem(m);	/* Bad return address */
 		ICMPSTAT_INC(icps_badaddr);
 		goto done;	/* Ip_output() will check for broadcast */
 	}
 
 	/*
 	 * t starts out as the address the packet was sent to and, by the
 	 * time "match" is reached, holds the source address for the reply.
 	 */
 	t = ip->ip_dst;
 	ip->ip_dst = ip->ip_src;
 
 	/*
 	 * Source selection for ICMP replies:
 	 *
 	 * If the incoming packet was addressed directly to one of our
 	 * own addresses, use dst as the src for the reply.
 	 */
 	CK_LIST_FOREACH(ia, INADDR_HASH(t.s_addr), ia_hash) {
 		if (t.s_addr == IA_SIN(ia)->sin_addr.s_addr) {
 			t = IA_SIN(ia)->sin_addr;
 			goto match;
 		}
 	}
 
 	/*
 	 * If the incoming packet was addressed to one of our broadcast
 	 * addresses, use the first non-broadcast address which corresponds
 	 * to the incoming interface.
 	 */
 	ifp = m->m_pkthdr.rcvif;
 	if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    t.s_addr) {
 				t = IA_SIN(ia)->sin_addr;
 				goto match;
 			}
 		}
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface the packet came through in.  If that interface
 	 * doesn't have a suitable IP address, the normal selection
 	 * criteria apply.
 	 */
 	if (V_icmp_rfi && ifp != NULL) {
 		/* The first AF_INET address on the interface is taken. */
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			goto match;
 		}
 	}
 	/*
 	 * If the incoming packet was not addressed directly to us, use
 	 * designated interface for icmp replies specified by sysctl
 	 * net.inet.icmp.reply_src (default not set). Otherwise continue
 	 * with normal source selection.
 	 */
 	if (V_reply_src[0] != '\0' && (ifp = ifunit(V_reply_src))) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			t = IA_SIN(ia)->sin_addr;
 			goto match;
 		}
 	}
 	/*
 	 * If the packet was transiting through us, use the address of
 	 * the interface that is the closest to the packet source.
 	 * When we don't have a route back to the packet source, stop here
 	 * and drop the packet.
 	 */
 	nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE, 0);
 	if (nh == NULL) {
 		m_freem(m);
 		ICMPSTAT_INC(icps_noroute);
 		goto done;
 	}
 	t = IA_SIN(ifatoia(nh->nh_ifa))->sin_addr;
 match:
 #ifdef MAC
 	mac_netinet_icmp_replyinplace(m);
 #endif
 	ip->ip_src = t;
 	ip->ip_ttl = V_ip_defttl;
 
 	/* optlen > 0 means the received IP header carried options. */
 	if (optlen > 0) {
 		u_char *cp;
 		int opt, cnt;
 		u_int len;
 
 		/*
 		 * Retrieve any source routing from the incoming packet;
 		 * add on any record-route or timestamp options.
 		 */
 		cp = (u_char *) (ip + 1);
 		/*
 		 * When ip_srcroute() returns nothing, start a fresh options
 		 * mbuf that holds a single zeroed struct in_addr.
 		 */
 		if ((opts = ip_srcroute(m)) == NULL &&
 		    (opts = m_gethdr(M_NOWAIT, MT_DATA))) {
 			opts->m_len = sizeof(struct in_addr);
 			mtod(opts, struct in_addr *)->s_addr = 0;
 		}
 		if (opts) {
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("icmp_reflect optlen %d rt %d => ",
 				optlen, opts->m_len);
 #endif
 		    /*
 		     * Walk the received options.  The walk stops at EOL, or
 		     * when an option's length byte is missing, too small, or
 		     * larger than the bytes that remain.
 		     */
 		    for (cnt = optlen; cnt > 0; cnt -= len, cp += len) {
 			    opt = cp[IPOPT_OPTVAL];
 			    if (opt == IPOPT_EOL)
 				    break;
 			    if (opt == IPOPT_NOP)
 				    len = 1;
 			    else {
 				    if (cnt < IPOPT_OLEN + sizeof(*cp))
 					    break;
 				    len = cp[IPOPT_OLEN];
 				    if (len < IPOPT_OLEN + sizeof(*cp) ||
 				        len > cnt)
 					    break;
 			    }
 			    /*
 			     * Should check for overflow, but it "can't happen"
 			     */
 			    if (opt == IPOPT_RR || opt == IPOPT_TS ||
 				opt == IPOPT_SECURITY) {
 				    bcopy((caddr_t)cp,
 					mtod(opts, caddr_t) + opts->m_len, len);
 				    opts->m_len += len;
 			    }
 		    }
 		    /* Terminate & pad, if necessary */
 		    cnt = opts->m_len % 4;
 		    if (cnt) {
 			    for (; cnt < 4; cnt++) {
 				    *(mtod(opts, caddr_t) + opts->m_len) =
 					IPOPT_EOL;
 				    opts->m_len++;
 			    }
 		    }
 #ifdef ICMPPRINTFS
 		    if (icmpprintfs)
 			    printf("%d\n", opts->m_len);
 #endif
 		}
 		ip_stripoptions(m);
 	}
 	m_tag_delete_nonpersistent(m);
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	icmp_send(m, opts);
 done:
 	/* Release the options mbuf, if one was built above. */
 	if (opts)
 		(void)m_free(opts);
 }
 
 /*
  * Check an ICMP redirect for validity, following RFC 1122.
  *
  * @src: sockaddr with address of the router that sent the redirect
  * @dst: sockaddr with the destination the redirect is about
  * @gateway: the new gateway proposed by the redirect
  * @fibnum: FIB used for the network and route lookups
  *
  * Returns 0 when the redirect is acceptable, otherwise an errno value
  * (ENETUNREACH, EHOSTUNREACH, EINVAL or EEXIST) naming the failed check.
  */
 static int
 icmp_verify_redirect_gateway(struct sockaddr_in *src, struct sockaddr_in *dst,
     struct sockaddr_in *gateway, u_int fibnum)
 {
 	struct sockaddr *gwaddr = (struct sockaddr *)gateway;
 	struct nhop_object *nhop;
 	struct ifaddr *gw_ifa;
 
 	NET_EPOCH_ASSERT();
 
 	/* The proposed gateway has to sit on a directly attached network. */
 	gw_ifa = ifa_ifwithnet(gwaddr, 0, fibnum);
 	if (gw_ifa == NULL)
 		return (ENETUNREACH);
 
 	/* TODO: fib-aware. */
 	if (ifa_ifwithaddr_check(gwaddr))
 		return (EHOSTUNREACH);
 
 	nhop = fib4_lookup(fibnum, dst->sin_addr, 0, NHR_NONE, 0);
 	if (nhop == NULL)
 		return (EINVAL);
 
 	/*
 	 * A redirect that does not come from the router we currently use
 	 * for this destination is either stale or wrong.  One that points
 	 * back at ourselves indicates a routing loop, perhaps because an
 	 * interface went down recently.
 	 */
 	if (!sa_equal((struct sockaddr *)src, &nhop->gw_sa))
 		return (EINVAL);
 	if (nhop->nh_ifa != gw_ifa && gw_ifa->ifa_addr->sa_family != AF_LINK)
 		return (EINVAL);
 
 	/*
 	 * Nothing to do when a host route already exists or when the
 	 * prefix is directly reachable (the nexthop is not a gateway).
 	 */
 	if ((nhop->nh_flags & NHF_HOST) || !(nhop->nh_flags & NHF_GATEWAY))
 		return (EEXIST);
 
 	return (0);
 }
 
 /*
  * Fill in the ICMP checksum of the message in m and pass the packet,
  * together with the optional IP options mbuf, to ip_output().
  */
 static void
 icmp_send(struct mbuf *m, struct mbuf *opts)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	const int iphlen = ip->ip_hl << 2;
 	const int icmplen = ntohs(ip->ip_len) - iphlen;
 	struct icmp *icp;
 
 	/* Step past the IP header so the checksum covers the ICMP part only. */
 	m->m_data += iphlen;
 	m->m_len -= iphlen;
 	icp = mtod(m, struct icmp *);
 	/* The checksum field has to be zero while the sum is computed. */
 	icp->icmp_cksum = 0;
 	icp->icmp_cksum = in_cksum(m, icmplen);
 	/* Put the IP header back in front. */
 	m->m_data -= iphlen;
 	m->m_len += iphlen;
 
 	m->m_pkthdr.rcvif = NULL;
 #ifdef ICMPPRINTFS
 	if (icmpprintfs) {
 		char dst_str[INET_ADDRSTRLEN];
 		char src_str[INET_ADDRSTRLEN];
 
 		printf("icmp_send dst %s src %s\n",
 		    inet_ntoa_r(ip->ip_dst, dst_str),
 		    inet_ntoa_r(ip->ip_src, src_str));
 	}
 #endif
 	(void)ip_output(m, opts, NULL, 0, NULL, NULL);
 }
 
 /*
  * Return milliseconds since 00:00 UTC in network format.
  */
 uint32_t
 iptime(void)
 {
 	struct timeval atv;
 	u_long t;
 
 	getmicrotime(&atv);
 	t = (atv.tv_sec % (24*60*60)) * 1000 + atv.tv_usec / 1000;
 	return (htonl(t));
 }
 
 /*
  * Look up the neighbouring MTU plateau (table from RFC 1191) for the
  * current value MTU.  A negative DIR asks for the next larger plateau,
  * any other DIR for the next smaller one.  Returns 0 when the table has
  * no such plateau, except that asking for a larger plateau at the top
  * entry returns the top entry itself.
  */
 int
 ip_next_mtu(int mtu, int dir)
 {
 	static const int plateaus[] = {
 		65535, 32000, 17914, 8166, 4352, 2002, 1492, 1280, 1006, 508,
 		296, 68, 0
 	};
 	const int nplateaus = (int)(sizeof(plateaus) / sizeof(plateaus[0]));
 	int idx;
 
 	if (dir < 0) {
 		/* Scan upward from the smallest entry. */
 		idx = nplateaus;
 		while (idx-- > 0) {
 			if (plateaus[idx] > mtu)
 				return (plateaus[idx]);
 		}
 		/* Nothing larger exists; stay put if already at the top. */
 		return (mtu == plateaus[0] ? plateaus[0] : 0);
 	}
 
 	/* Scan downward from the largest entry. */
 	for (idx = 0; idx < nplateaus; idx++) {
 		if (plateaus[idx] < mtu)
 			return (plateaus[idx]);
 	}
 	return (0);
 }
 #endif /* INET */
 
 /*
  * badport_bandlim() - check for ICMP bandwidth limit
  *
  *	Return 0 if it is ok to send an ICMP error response, -1 if we have
  *	hit our bandwidth limit and it is not ok.
  *
  *	If icmplim is <= 0, the feature is disabled and 0 is returned.
  *
  *	For now we separate the TCP and UDP subsystems w/ different 'which'
  *	values.  We may eventually remove this separation (and simplify the
  *	code further).
  *
  *	Note that the printing of the error message is delayed so we can
  *	properly print the icmp error rate that the system was trying to do
  *	(i.e. 22000/100 pps, etc...).  This can cause long delays in printing
  *	the 'final' error, but it doesn't make sense to solve the printing
  *	delay with more complex code.
  */
 struct icmp_rate {
 	const char *descr;	/* label printed in the rate-limit log line */
 	struct counter_rate cr;	/* state passed to counter_ratecheck() */
 };
 /*
  * One entry per limit class, indexed by the 'which' argument of
  * badport_bandlim().  Only the descriptions are set here; the counter
  * state is filled in by icmp_bandlimit_init().
  * NOTE(review): the entry order presumably mirrors the BANDLIM_*
  * constants -- confirm against their definition.
  */
 VNET_DEFINE_STATIC(struct icmp_rate, icmp_rates[BANDLIM_MAX]) = {
 	{ "icmp unreach response" },
 	{ "icmp ping response" },
 	{ "icmp tstamp response" },
 	{ "closed port RST response" },
 	{ "open port RST response" },
 	{ "icmp6 unreach response" },
 	{ "sctp ootb response" }
 };
 #define	V_icmp_rates	VNET(icmp_rates)
 
 static void
 icmp_bandlimit_init(void)
 {
 
 	for (int i = 0; i < BANDLIM_MAX; i++) {
 		V_icmp_rates[i].cr.cr_rate = counter_u64_alloc(M_WAITOK);
 		V_icmp_rates[i].cr.cr_ticks = ticks;
 	}
 }
 VNET_SYSINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY,
     icmp_bandlimit_init, NULL);
 
 /*
  * Free the rate counters allocated by icmp_bandlimit_init().
  */
 static void
 icmp_bandlimit_uninit(void)
 {
 	int idx = 0;
 
 	while (idx < BANDLIM_MAX) {
 		counter_u64_free(V_icmp_rates[idx].cr.cr_rate);
 		idx++;
 	}
 }
 VNET_SYSUNINIT(icmp_bandlimit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     icmp_bandlimit_uninit, NULL);
 
 /*
  * Decide whether a rate-limited response of class 'which' may be sent.
  *
  * Returns 0 when sending is allowed, which includes V_icmplim being 0
  * and 'which' being BANDLIM_UNLIMITED, and -1 when counter_ratecheck()
  * returns -1.
  */
 int
 badport_bandlim(int which)
 {
 	int64_t pps;
 
 	if (which == BANDLIM_UNLIMITED || V_icmplim == 0)
 		return (0);
 
 	KASSERT(which >= 0 && which < BANDLIM_MAX,
 	    ("%s: which %d", __func__, which));
 
 	/* Clamp the jitter so that limit + jitter does not drop below 1. */
 	if ((V_icmplim + V_icmplim_curr_jitter) <= 0)
 		V_icmplim_curr_jitter = -V_icmplim + 1;
 
 	pps = counter_ratecheck(&V_icmp_rates[which].cr, V_icmplim +
 	    V_icmplim_curr_jitter);
 	if (pps == -1)
 		return (-1);
 	if (pps <= 0)
 		return (0);
 
 	/*
 	 * Adjust limit +/- to jitter the measurement to deny a
 	 * side-channel port scan as in CVE-2020-25705
 	 */
 	if (V_icmplim_jitter > 0) {
 		int32_t delta =
 		    arc4random_uniform(V_icmplim_jitter * 2 +1)
 		    - V_icmplim_jitter;
 
 		V_icmplim_curr_jitter = delta;
 	}
 	/* The logged limit already includes the jitter picked just above. */
 	if (V_icmplim_output)
 		log(LOG_NOTICE, "Limiting %s from %jd to %d packets/sec\n",
 		    V_icmp_rates[which].descr, (intmax_t )pps, V_icmplim +
 		    V_icmplim_curr_jitter);
 	return (0);
 }
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index b33d1e1b6697..b8fb3861c5b8 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -1,1366 +1,1367 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bootp.h"
 #include "opt_inet.h"
 #include "opt_ipstealth.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/hhook.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/igmp_var.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
 #include <netinet/in_rss.h>
 #ifdef SCTP
 #include <netinet/sctp_var.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <sys/socketvar.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef CTASSERT
 CTASSERT(sizeof(struct ip) == 20);
 #endif
 
 /* IP reassembly functions are defined in ip_reass.c. */
 extern void ipreass_init(void);
 extern void ipreass_vnet_init(void);
 #ifdef VIMAGE
 extern void ipreass_destroy(void);
 #endif
 
 VNET_DEFINE(int, rsvp_on);
 
 VNET_DEFINE(int, ipforwarding);
 SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipforwarding), 0,
     "Enable IP forwarding between interfaces");
 
 /*
  * Respond with an ICMP host redirect when we forward a packet out of
  * the same interface on which it was received.  See RFC 792.
  */
 VNET_DEFINE(int, ipsendredirects) = 1;
 SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipsendredirects), 0,
     "Enable sending IP redirects");
 
 VNET_DEFINE_STATIC(bool, ip_strong_es) = false;
 #define	V_ip_strong_es	VNET(ip_strong_es)
 SYSCTL_BOOL(_net_inet_ip, OID_AUTO, rfc1122_strong_es,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_strong_es), false,
     "Packet's IP destination address must match address on arrival interface");
 
 VNET_DEFINE_STATIC(bool, ip_sav) = true;
 #define	V_ip_sav	VNET(ip_sav)
 SYSCTL_BOOL(_net_inet_ip, OID_AUTO, source_address_validation,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_sav), true,
     "Drop incoming packets with source address that is a local address");
 
 VNET_DEFINE(pfil_head_t, inet_pfil_head);	/* Packet filter hooks */
 
 /*
  * netisr handler description for NETISR_IP: names ip_input() as the
  * handler.  When RSS is compiled in, rss_soft_m2cpuid_v4 is supplied as
  * the mbuf-to-CPU hook with the CPU policy and hybrid dispatch;
  * otherwise the flow policy is used.
  */
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
 #ifdef	RSS
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 #else
 	.nh_policy = NETISR_POLICY_FLOW,
 #endif
 };
 
 #ifdef	RSS
 /*
  * Directly dispatched frames are currently assumed
  * to have a flowid already calculated.
  *
  * There should probably be something here that asserts
  * the frame actually has valid flow details.
  *
  * This handler is registered under NETISR_IP_DIRECT and names
  * ip_direct_input() as its handler.
  */
 static struct netisr_handler ip_direct_nh = {
 	.nh_name = "ip_direct",
 	.nh_handler = ip_direct_input,
 	.nh_proto = NETISR_IP_DIRECT,
 	.nh_m2cpuid = rss_soft_m2cpuid_v4,
 	.nh_policy = NETISR_POLICY_CPU,
 	.nh_dispatch = NETISR_DISPATCH_HYBRID,
 };
 #endif
 
 /*
  * Input and control-input dispatch tables with IPPROTO_MAX slots each.
  * Every slot starts out pointing at the raw IP handlers (rip_input /
  * rip_ctlinput); the "[first ... last]" range designator used for that
  * is a GNU C extension.
  */
 ipproto_input_t		*ip_protox[IPPROTO_MAX] = {
 			    [0 ... IPPROTO_MAX - 1] = rip_input };
 ipproto_ctlinput_t	*ip_ctlprotox[IPPROTO_MAX] = {
 			    [0 ... IPPROTO_MAX - 1] = rip_ctlinput };
 
 VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead);  /* first inet address */
 VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table  */
 VNET_DEFINE(u_long, in_ifaddrhmask);		/* mask for hash table */
 
 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
 CTASSERT(sizeof(struct in_ifaddrhashhead) == sizeof(LIST_HEAD(, in_addr)));
 
 #ifdef IPCTL_DEFMTU
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW,
     &ip_mtu, 0, "Default MTU");
 #endif
 
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ipstealth);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ipstealth), 0,
     "IP stealth mode, no TTL decrementation on forwarding");
 #endif
 
 /*
  * IP statistics are stored in the "array" of counter(9)s.
  */
 VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat);
 VNET_PCPUSTAT_SYSINIT(ipstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat,
     "IP statistics (struct ipstat, netinet/ip_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(ipstat);
 #endif /* VIMAGE */
 
 /*
  * Kernel module interface for updating ipstat.  The argument is an index
  * into ipstat treated as an array.
  *
  * Adds one to the counter at that index.  statnum is not range-checked
  * here.
  */
 void
 kmod_ipstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], 1);
 }
 
 /*
  * Subtracts one from the ipstat counter at index statnum (ipstat treated
  * as an array).  statnum is not range-checked here.
  */
 void
 kmod_ipstat_dec(int statnum)
 {
 
 	counter_u64_add(VNET(ipstat)[statnum], -1);
 }
 
 static int
 sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
     sysctl_netinet_intr_queue_maxlen, "I",
     "Maximum size of the IP input queue");
 
 static int
 sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t qdrops_long;
 	int error, qdrops;
 
 	netisr_getqdrops(&ip_nh, &qdrops_long);
 	qdrops = qdrops_long;
 	error = sysctl_handle_int(oidp, &qdrops, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qdrops != 0)
 		return (EINVAL);
 	netisr_clearqdrops(&ip_nh);
 	return (0);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
 #ifdef	RSS
 static int
 sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&ip_direct_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&ip_direct_nh, qlimit));
 }
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_netinet_intr_direct_queue_maxlen,
     "I", "Maximum size of the IP direct input queue");
 
 /*
  * Sysctl handler for the drop counter of the IP direct netisr queue.
  * The 64-bit counter is narrowed to an int for reporting.  If a new
  * value is supplied it must be 0, which clears the counter; any other
  * value yields EINVAL.
  *
  * NOTE(review): the OID below is declared CTLFLAG_RD, so the clearing
  * path may not be reachable through a plain sysctl write -- confirm.
  */
 static int
 sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
 {
 	u_int64_t drops64;
 	int drops, rv;
 
 	netisr_getqdrops(&ip_direct_nh, &drops64);
 	drops = drops64;
 	rv = sysctl_handle_int(oidp, &drops, 0, req);
 	if (rv != 0)
 		return (rv);
 	if (!req->newptr)
 		return (0);
 	if (drops == 0) {
 		netisr_clearqdrops(&ip_direct_nh);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_netinet_intr_direct_queue_drops, "I",
     "Number of packets dropped from the IP direct input queue");
 #endif	/* RSS */
 
 /*
  * Per-VNET IP initialization.  Sets up the IPv4 interface address list
  * and hash table, the reassembly queue, the inet packet filter head and
  * the IPsec input/output helper hook heads.  Under VIMAGE it also
  * registers the IP netisr handler(s) for this vnet.
  */
 static void
 ip_vnet_init(void *arg __unused)
 {
 	struct pfil_head_args args;
 
 	/* Interface address list and lookup hash for this vnet. */
 	CK_STAILQ_INIT(&V_in_ifaddrhead);
 	V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask);
 
 	/* Initialize IP reassembly queue. */
 	ipreass_vnet_init();
 
 	/* Initialize packet filter hooks. */
 	args.pa_version = PFIL_VERSION;
 	args.pa_flags = PFIL_IN | PFIL_OUT;
 	args.pa_type = PFIL_TYPE_IP4;
 	args.pa_headname = PFIL_INET_NAME;
 	V_inet_pfil_head = pfil_head_register(&args);
 
 	/*
 	 * Register the IPsec helper hook heads.  A failure is only
 	 * reported on the console; initialization carries on regardless.
 	 */
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET,
 	    &V_ipsec_hhh_in[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register input helper hook\n",
 		    __func__);
 	if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET,
 	    &V_ipsec_hhh_out[HHOOK_IPSEC_INET],
 	    HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register output helper hook\n",
 		    __func__);
 
 #ifdef VIMAGE
 	netisr_register_vnet(&ip_nh);
 #ifdef	RSS
 	netisr_register_vnet(&ip_direct_nh);
 #endif
 #endif
 }
 VNET_SYSINIT(ip_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     ip_vnet_init, NULL);
 
 /*
  * Global IP initialization, run once via SYSINIT: initializes the
  * reassembly code, registers the input (and, for SCTP, control input)
  * handlers of the statically compiled protocols, and registers the IP
  * netisr handler(s).
  */
 static void
 ip_init(const void *unused __unused)
 {
 
 	ipreass_init();
 
 	/*
 	 * Register statically compiled protocols, that are unlikely to
 	 * ever become dynamic.
 	 */
 	IPPROTO_REGISTER(IPPROTO_ICMP, icmp_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_IGMP, igmp_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_RSVP, rsvp_input, NULL);
 	/* The tunnelling protocols below all share encap4_input(). */
 	IPPROTO_REGISTER(IPPROTO_IPV4, encap4_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_MOBILE, encap4_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_ETHERIP, encap4_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_GRE, encap4_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_IPV6, encap4_input, NULL);
 	IPPROTO_REGISTER(IPPROTO_PIM, encap4_input, NULL);
 #ifdef SCTP	/* XXX: has a loadable & static version */
 	IPPROTO_REGISTER(IPPROTO_SCTP, sctp_input, sctp_ctlinput);
 #endif
 
 	netisr_register(&ip_nh);
 #ifdef	RSS
 	netisr_register(&ip_direct_nh);
 #endif
 }
 SYSINIT(ip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_init, NULL);
 
 #ifdef VIMAGE
 /*
  * Per-VNET IP teardown (VIMAGE only).  Unregisters the netisr handlers,
  * the pfil head and the IPsec helper hook heads, then removes IPv4
  * addresses and routes, destroys the reassembly queue and finally frees
  * the interface address hash table.
  */
 static void
 ip_destroy(void *unused __unused)
 {
 	int error;
 
 #ifdef	RSS
 	netisr_unregister_vnet(&ip_direct_nh);
 #endif
 	netisr_unregister_vnet(&ip_nh);
 
 	pfil_head_unregister(V_inet_pfil_head);
 	/* Hook deregistration failures are reported but not fatal. */
 	error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister input helper hook "
 		    "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 	error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister output helper hook "
 		    "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: "
 		    "error %d returned\n", __func__, error);
 	}
 
 	/* Remove the IPv4 addresses from all interfaces. */
 	in_ifscrub_all();
 
 	/* Make sure the IPv4 routes are gone as well. */
 	rib_flush_routes_family(AF_INET);
 
 	/* Destroy IP reassembly queue. */
 	ipreass_destroy();
 
 	/* Cleanup in_ifaddr hash table; should be empty. */
 	hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask);
 }
 
 VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL);
 #endif
 
 #ifdef	RSS
 /*
  * IP direct input routine.
  *
  * Entry point used when completed fragments are reinjected: all of the
  * earlier checking and book-keeping has already been done, so only the
  * IPsec input step and the hand-off to the protocol's input routine
  * remain.
  */
 void
 ip_direct_input(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	int hlen = iph->ip_hl << 2;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/* Stop here when IPsec input processing returns non-zero. */
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_INPUT(ipv4, m, hlen, iph->ip_p) != 0)
 		return;
 #endif /* IPSEC */
 	IPSTAT_INC(ips_delivered);
 	ip_protox[iph->ip_p](&m, &hlen, iph->ip_p);
 }
 #endif
 
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
  *
  * Must be called with a packet-header mbuf from within the network
  * epoch (both are asserted on entry).  Three labels structure the
  * function: "passin" skips packet filtering, "ours" starts local
  * delivery, and "bad" frees the packet.
  */
 void
 ip_input(struct mbuf *m)
 {
 	struct ip *ip = NULL;
 	struct in_ifaddr *ia = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int hlen = 0;
 	uint16_t sum, ip_len;
 	int dchg = 0;				/* dest changed after fw */
 	struct in_addr odst;			/* original dst address */
 	bool strong_es;
 
 	M_ASSERTPKTHDR(m);
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * A packet already marked as ours skips all validation and
 	 * filtering below and goes straight to local delivery.
 	 */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
 	IPSTAT_INC(ips_total);
 
 	if (__predict_false(m->m_pkthdr.len < sizeof(struct ip)))
 		goto tooshort;
 
 	/* Make the fixed part of the header contiguous in the first mbuf. */
 	if (m->m_len < sizeof(struct ip)) {
 		m = m_pullup(m, sizeof(struct ip));
 		if (__predict_false(m == NULL)) {
 			IPSTAT_INC(ips_toosmall);
 			return;
 		}
 	}
 	ip = mtod(m, struct ip *);
 
 	if (__predict_false(ip->ip_v != IPVERSION)) {
 		IPSTAT_INC(ips_badvers);
 		goto bad;
 	}
 
 	hlen = ip->ip_hl << 2;
 	if (__predict_false(hlen < sizeof(struct ip))) {	/* minimum header length */
 		IPSTAT_INC(ips_badhlen);
 		goto bad;
 	}
 	/* Same again for the full header, including any options. */
 	if (hlen > m->m_len) {
 		m = m_pullup(m, hlen);
 		if (__predict_false(m == NULL)) {
 			IPSTAT_INC(ips_badhlen);
 			return;
 		}
 		ip = mtod(m, struct ip *);
 	}
 
 	IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL);
 
 	/* IN_LOOPBACK must not appear on the wire - RFC1122 */
 	ifp = m->m_pkthdr.rcvif;
 	if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 	}
 
 	/*
 	 * Header checksum: trust the result recorded in csum_flags when
 	 * the packet is flagged as already checked, otherwise compute it
 	 * here.  A non-zero sum means the header is bad.
 	 */
 	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
 		sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
 	} else {
 		if (hlen == sizeof(struct ip)) {
 			sum = in_cksum_hdr(ip);
 		} else {
 			sum = in_cksum(m, hlen);
 		}
 	}
 	if (__predict_false(sum)) {
 		IPSTAT_INC(ips_badsum);
 		goto bad;
 	}
 
 	ip_len = ntohs(ip->ip_len);
 	if (__predict_false(ip_len < hlen)) {
 		IPSTAT_INC(ips_badlen);
 		goto bad;
 	}
 
 	/*
 	 * Check that the amount of data in the buffers
 	 * is at least as much as the IP header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	if (__predict_false(m->m_pkthdr.len < ip_len)) {
 tooshort:
 		IPSTAT_INC(ips_tooshort);
 		goto bad;
 	}
 	if (m->m_pkthdr.len > ip_len) {
 		if (m->m_len == m->m_pkthdr.len) {
 			/* Single mbuf: just shorten the recorded lengths. */
 			m->m_len = ip_len;
 			m->m_pkthdr.len = ip_len;
 		} else
 			/* m_adj() is handed the (negative) excess length. */
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
 
 	/*
 	 * Try to forward the packet, but if we fail continue.
 	 * ip_tryforward() may generate redirects these days.
 	 * XXX the logic below falling through to normal processing
 	 * if redirects are required should be revisited as well.
 	 * ip_tryforward() does inbound and outbound packet firewall
 	 * processing. If firewall has decided that destination becomes
 	 * our local address, it sets M_FASTFWD_OURS flag. In this
 	 * case skip another inbound firewall processing and update
 	 * ip pointer.
 	 */
 	if (V_ipforwarding != 0
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	    && (!IPSEC_ENABLED(ipv4) ||
 	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0)
 #endif
 	    ) {
 		/*
 		 * ip_dooptions() was run so we can ignore the source route (or
 		 * any IP options case) case for redirects in ip_tryforward().
 		 */
 		if ((m = ip_tryforward(m)) == NULL)
 			return;
 		if (m->m_flags & M_FASTFWD_OURS) {
 			m->m_flags &= ~M_FASTFWD_OURS;
 			ip = mtod(m, struct ip *);
 			goto ours;
 		}
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
 	 */
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0)
 			goto passin;
 #endif
 
 	/*
 	 * Run through list of hooks for input packets.
 	 *
 	 * NB: Beware of the destination address changing (e.g.
 	 *     by NAT rewriting).  When this happens, tell
 	 *     ip_forward to do the right thing.
 	 */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED_IN(V_inet_pfil_head))
 		goto passin;
 
 	odst = ip->ip_dst;
 	if (pfil_mbuf_in(V_inet_pfil_head, &m, ifp, NULL) !=
 	    PFIL_PASS)
 		return;
 	if (m == NULL)			/* consumed by filter */
 		return;
 
 	/* The filter may have replaced the mbuf; reload the header. */
 	ip = mtod(m, struct ip *);
 	dchg = (odst.s_addr != ip->ip_dst.s_addr);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
 		m->m_flags &= ~M_FASTFWD_OURS;
 		goto ours;
 	}
 	if (m->m_flags & M_IP_NEXTHOP) {
 		if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) {
 			/*
 			 * Directly ship the packet on.  This allows
 			 * forwarding packets originally destined to us
 			 * to some other directly connected host.
 			 */
 			ip_forward(m, 1);
 			return;
 		}
 	}
 passin:
 
 	/*
 	 * Process options and, if not destined for us,
 	 * ship it on.  ip_dooptions returns 1 when an
 	 * error was detected (causing an icmp message
 	 * to be sent and the original packet to be freed).
 	 */
 	if (hlen > sizeof (struct ip) && ip_dooptions(m, 0))
 		return;
 
 	/*
 	 * Greedy RSVP: snatches any PATH packet of the RSVP protocol, no
 	 * matter if it is destined to another node or whether it is a
 	 * multicast one.  RSVP wants it and prevents it from being
 	 * forwarded anywhere else.  Also checks that the rsvp daemon is
 	 * running before grabbing the packet.
 	 */
 	if (ip->ip_p == IPPROTO_RSVP && V_rsvp_on)
 		goto ours;
 
 	/*
 	 * Check our list of addresses, to see if the packet is for us.
 	 * If we don't have any addresses, assume any unicast packet
 	 * we receive might be for us (and let the upper layers deal
 	 * with it).
 	 */
 	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) &&
 	    (m->m_flags & (M_MCAST|M_BCAST)) == 0)
 		goto ours;
 
 	/*
 	 * Enable a consistency check between the destination address
 	 * and the arrival interface for a unicast packet (the RFC 1122
 	 * strong ES model) with a list of additional predicates:
 	 * - if IP forwarding is disabled
 	 * - the packet is not locally generated
 	 * - the packet is not subject to 'ipfw fwd'
 	 * - Interface is not running CARP. If the packet got here, we already
 	 *   checked it with carp_iamatch() and carp_forus().
 	 */
 	strong_es = V_ip_strong_es && (V_ipforwarding == 0) &&
 	    ((ifp->if_flags & IFF_LOOPBACK) == 0) &&
 	    ifp->if_carp == NULL && (dchg == 0);
 
 	/*
 	 * Check for exact addresses in the hash bucket.
 	 */
 	CK_LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
 		if (IA_SIN(ia)->sin_addr.s_addr != ip->ip_dst.s_addr)
 			continue;
 
 		/*
 		 * net.inet.ip.rfc1122_strong_es: the address matches, verify
 		 * that the packet arrived via the correct interface.
 		 */
 		if (__predict_false(strong_es && ia->ia_ifp != ifp)) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 
 		/*
 		 * net.inet.ip.source_address_validation: drop incoming
 		 * packets that pretend to be ours.
 		 */
 		if (V_ip_sav && !(ifp->if_flags & IFF_LOOPBACK) &&
 		    __predict_false(in_localip_fib(ip->ip_src, ifp->if_fib))) {
 			IPSTAT_INC(ips_badaddr);
 			goto bad;
 		}
 
 		/* Account the packet against the matching address. */
 		counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 		counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len);
 		goto ours;
 	}
 
 	/*
 	 * Check for broadcast addresses.
 	 *
 	 * Only accept broadcast packets that arrive via the matching
 	 * interface.  Reception of forwarded directed broadcasts would
 	 * be handled via ip_forward() and ether_output() with the loopback
 	 * into the stack for SIMPLEX interfaces handled by ether_output().
 	 */
 	if (ifp->if_flags & IFF_BROADCAST) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			ia = ifatoia(ifa);
 			if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr ==
 			    ip->ip_dst.s_addr) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				goto ours;
 			}
 #ifdef BOOTP_COMPAT
 			/* Also accept when the address is still INADDR_ANY. */
 			if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) {
 				counter_u64_add(ia->ia_ifa.ifa_ipackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_ibytes,
 				    m->m_pkthdr.len);
 				goto ours;
 			}
 #endif
 		}
 		ia = NULL;
 	}
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		/*
 		 * RFC 3927 2.7: Do not forward multicast packets from
 		 * IN_LINKLOCAL.
 		 */
 		if (V_ip_mrouter && !IN_LINKLOCAL(ntohl(ip->ip_src.s_addr))) {
 			/*
 			 * If we are acting as a multicast router, all
 			 * incoming multicast packets are passed to the
 			 * kernel-level multicast forwarding function.
 			 * The packet is returned (relatively) intact; if
 			 * ip_mforward() returns a non-zero value, the packet
 			 * must be discarded, else it may be accepted below.
 			 */
 			if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) {
 				IPSTAT_INC(ips_cantforward);
 				m_freem(m);
 				return;
 			}
 
 			/*
 			 * The process-level routing daemon needs to receive
 			 * all multicast IGMP packets, whether or not this
 			 * host belongs to their destination groups.
 			 */
 			if (ip->ip_p == IPPROTO_IGMP) {
 				goto ours;
 			}
 			IPSTAT_INC(ips_forward);
 		}
 		/*
 		 * Assume the packet is for us, to avoid prematurely taking
 		 * a lock on the in_multi hash. Protocols must perform
 		 * their own filtering and update statistics accordingly.
 		 */
 		goto ours;
 	}
 	if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST)
 		goto ours;
 	if (ip->ip_dst.s_addr == INADDR_ANY)
 		goto ours;
 	/* RFC 3927 2.7: Do not forward packets to or from IN_LINKLOCAL. */
 	if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LINKLOCAL(ntohl(ip->ip_src.s_addr))) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 
 	/*
 	 * Not for us; forward if possible and desirable.
 	 */
 	if (V_ipforwarding == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 	} else {
 		ip_forward(m, dchg);
 	}
 	return;
 
 ours:
 #ifdef IPSTEALTH
 	/*
 	 * IPSTEALTH: Process non-routing options only
 	 * if the packet is destined for us.
 	 */
 	if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1))
 		return;
 #endif /* IPSTEALTH */
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 */
 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		/* XXXGL: shouldn't we save & set m_flags? */
 		m = ip_reass(m);
 		if (m == NULL)
 			return;
 		ip = mtod(m, struct ip *);
 		/* Get the header length of the reassembled packet */
 		hlen = ip->ip_hl << 2;
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0)
 			return;
 	}
 #endif /* IPSEC */
 
 	/*
 	 * Switch out to protocol's input routine.
 	 */
 	IPSTAT_INC(ips_delivered);
 
 	ip_protox[ip->ip_p](&m, &hlen, ip->ip_p);
 	return;
 bad:
 	m_freem(m);
 }
 
 /*
  * Install the input and control-input handlers for IP protocol number
  * proto.  A slot that still points at rip_input() counts as unused;
  * any other value means the protocol is already taken.
  *
  * Returns 0 on success or EEXIST when the slot is occupied.
  */
 int
 ipproto_register(uint8_t proto, ipproto_input_t input, ipproto_ctlinput_t ctl)
 {
 
 	MPASS(proto > 0);
 
 	if (ip_protox[proto] != rip_input)
 		return (EEXIST);
 
 	ip_protox[proto] = input;
 	ip_ctlprotox[proto] = ctl;
 	return (0);
 }
 
 /*
  * Release the handler slot of IP protocol number proto by pointing it
  * back at the raw IP handlers.
  *
  * Returns 0 on success or ENOENT when nothing was registered.
  */
 int
 ipproto_unregister(uint8_t proto)
 {
 
 	MPASS(proto > 0);
 
 	if (ip_protox[proto] == rip_input)
 		return (ENOENT);
 
 	ip_protox[proto] = rip_input;
 	ip_ctlprotox[proto] = rip_ctlinput;
 	return (0);
 }
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advancing
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * The srcrt parameter indicates whether the packet is being forwarded
  * via a source route.
  */
 void
 ip_forward(struct mbuf *m, int srcrt)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct in_ifaddr *ia;
 	struct mbuf *mcopy;
 	struct sockaddr_in *sin;
 	struct in_addr dest;
 	struct route ro;
 	uint32_t flowid;
 	int error, type = 0, code = 0, mtu = 0;
 
 	NET_EPOCH_ASSERT();
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
 		m_freem(m);
 		return;
 	}
 	if (
 #ifdef IPSTEALTH
 	    V_ipstealth == 0 &&
 #endif
 	    ip->ip_ttl <= IPTTLDEC) {
 		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
 		return;
 	}
 
 	bzero(&ro, sizeof(ro));
 	sin = (struct sockaddr_in *)&ro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = ip->ip_dst;
 	flowid = m->m_pkthdr.flowid;
 	ro.ro_nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_REF, flowid);
 	if (ro.ro_nh != NULL) {
 		ia = ifatoia(ro.ro_nh->nh_ifa);
 	} else
 		ia = NULL;
 	/*
 	 * Save the IP header and at most 8 bytes of the payload,
 	 * in case we need to generate an ICMP message to the src.
 	 *
 	 * XXX this can be optimized a lot by saving the data in a local
 	 * buffer on the stack (72 bytes at most), and only allocating the
 	 * mbuf if really necessary. The vast majority of the packets
 	 * are forwarded without having to send an ICMP back (either
 	 * because unnecessary, or because rate limited), so we are
 	 * really we are wasting a lot of work here.
 	 *
 	 * We don't use m_copym() because it might return a reference
 	 * to a shared cluster. Both this function and ip_output()
 	 * assume exclusive access to the IP header in `m', so any
 	 * data in a cluster may change before we reach icmp_error().
 	 */
 	mcopy = m_gethdr(M_NOWAIT, m->m_type);
 	if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) {
 		/*
 		 * It's probably ok if the pkthdr dup fails (because
 		 * the deep copy of the tag chain failed), but for now
 		 * be conservative and just discard the copy since
 		 * code below may some day want the tags.
 		 */
 		m_free(mcopy);
 		mcopy = NULL;
 	}
 	if (mcopy != NULL) {
 		mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy));
 		mcopy->m_pkthdr.len = mcopy->m_len;
 		m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t));
 	}
 #ifdef IPSTEALTH
 	if (V_ipstealth == 0)
 #endif
 		ip->ip_ttl -= IPTTLDEC;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if ((error = IPSEC_FORWARD(ipv4, m)) != 0) {
 			/* mbuf consumed by IPsec */
 			RO_NHFREE(&ro);
 			m_freem(mcopy);
 			if (error != EINPROGRESS)
 				IPSTAT_INC(ips_cantforward);
 			return;
 		}
 		/* No IPsec processing required */
 	}
 #endif /* IPSEC */
 	/*
 	 * If forwarding packet using same interface that it came in on,
 	 * perhaps should send a redirect to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
 	    ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) {
 		struct nhop_object *nh;
 
 		nh = ro.ro_nh;
 
 		if (nh != NULL && ((nh->nh_flags & (NHF_REDIRECT|NHF_DEFAULT)) == 0)) {
 			struct in_ifaddr *nh_ia = (struct in_ifaddr *)(nh->nh_ifa);
 			u_long src = ntohl(ip->ip_src.s_addr);
 
 			if (nh_ia != NULL &&
 			    (src & nh_ia->ia_subnetmask) == nh_ia->ia_subnet) {
 				/* Router requirements says to only send host redirects */
 				type = ICMP_REDIRECT;
 				code = ICMP_REDIRECT_HOST;
 				if (nh->nh_flags & NHF_GATEWAY) {
 				    if (nh->gw_sa.sa_family == AF_INET)
 					dest.s_addr = nh->gw4_sa.sin_addr.s_addr;
 				    else /* Do not redirect in case gw is AF_INET6 */
 					type = 0;
 				} else
 					dest.s_addr = ip->ip_dst.s_addr;
 			}
 		}
 	}
 
 	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_nh)
 		mtu = ro.ro_nh->nh_mtu;
 	RO_NHFREE(&ro);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
 	else {
 		IPSTAT_INC(ips_forward);
 		if (type)
 			IPSTAT_INC(ips_redirectsent);
 		else {
 			if (mcopy)
 				m_freem(mcopy);
 			return;
 		}
 	}
 	if (mcopy == NULL)
 		return;
 
 	switch (error) {
 	case 0:				/* forwarded, but need redirect */
 		/* type, code set above */
 		break;
 
 	case ENETUNREACH:
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 		/*
 		 * If the MTU was set before make sure we are below the
 		 * interface MTU.
 		 * If the MTU wasn't set before use the interface mtu or
 		 * fall back to the next smaller mtu step compared to the
 		 * current packet size.
 		 */
 		if (mtu != 0) {
 			if (ia != NULL)
 				mtu = min(mtu, ia->ia_ifp->if_mtu);
 		} else {
 			if (ia != NULL)
 				mtu = ia->ia_ifp->if_mtu;
 			else
 				mtu = ip_next_mtu(ntohs(ip->ip_len), 0);
 		}
 		IPSTAT_INC(ips_cantfrag);
 		break;
 
 	case ENOBUFS:
 	case EACCES:			/* ipfw denied packet */
 		m_freem(mcopy);
 		return;
 	}
 	icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 /*
  * Evaluates to 1 when socket sp has SO_TIMESTAMP set and its timestamp
  * clock selector equals ct, and to 0 otherwise.  Both arguments are
  * parenthesized so that arbitrary expressions may be passed; sp is
  * evaluated up to twice.
  */
 #define	CHECK_SO_CT(sp, ct) \
     ((((sp)->so_options & SO_TIMESTAMP) && \
     ((sp)->so_ts_clock == (ct))) ? 1 : 0)
 
 /*
  * Build the control messages requested for a received packet.
  *
  * inp	inpcb whose socket options and inp_flags/inp_flags2 select
  *	which messages are generated.
  * mp	tail pointer of the control mbuf chain; every message that is
  *	successfully created is stored through it and the pointer is
  *	advanced to the new mbuf's m_next.
  * ip	IP header the address, TTL and TOS values are taken from.
  * m	the received packet (timestamp, receive interface, flow id).
  *
  * All allocations use M_NOWAIT; a message whose allocation fails is
  * simply left out.
  */
 void
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
 	bool stamped;
 
 	stamped = false;
 	/*
 	 * Timestamps.  When the mbuf carries its own timestamp (M_TSTMP)
 	 * it is converted and used, otherwise the current time is read.
 	 */
 	if ((inp->inp_socket->so_options & SO_BINTIME) ||
 	    CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) {
 		struct bintime boottimebin, bt;
 		struct timespec ts1;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts1);
 			timespec2bintime(&ts1, &bt);
 			getboottimebin(&boottimebin);
 			bintime_add(&bt, &boottimebin);
 		} else {
 			bintime(&bt);
 		}
 		*mp = sbcreatecontrol(&bt, sizeof(bt), SCM_BINTIME,
 		    SOL_SOCKET, M_NOWAIT);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	}
 	/* At most one of the three clock formats below is emitted. */
 	if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) {
 		struct bintime boottimebin, bt1;
 		struct timespec ts1;
 		struct timeval tv;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts1);
 			timespec2bintime(&ts1, &bt1);
 			getboottimebin(&boottimebin);
 			bintime_add(&bt1, &boottimebin);
 			bintime2timeval(&bt1, &tv);
 		} else {
 			microtime(&tv);
 		}
 		*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP,
 		    SOL_SOCKET, M_NOWAIT);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	} else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) {
 		struct bintime boottimebin;
 		struct timespec ts, ts1;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP)) {
 			mbuf_tstmp2timespec(m, &ts);
 			getboottimebin(&boottimebin);
 			bintime2timespec(&boottimebin, &ts1);
 			timespecadd(&ts, &ts1, &ts);
 		} else {
 			nanotime(&ts);
 		}
 		*mp = sbcreatecontrol(&ts, sizeof(ts), SCM_REALTIME,
 		    SOL_SOCKET, M_NOWAIT);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	} else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) {
 		struct timespec ts;
 
 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 		    M_TSTMP))
 			mbuf_tstmp2timespec(m, &ts);
 		else
 			nanouptime(&ts);
 		*mp = sbcreatecontrol(&ts, sizeof(ts), SCM_MONOTONIC,
 		    SOL_SOCKET, M_NOWAIT);
 		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
 			stamped = true;
 		}
 	}
 	/*
 	 * If a timestamp message was added and the mbuf carried its own
 	 * timestamp, describe that timestamp in an SCM_TIME_INFO message.
 	 */
 	if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
 	    M_TSTMP)) {
 		struct sock_timestamp_info sti;
 
 		bzero(&sti, sizeof(sti));
 		sti.st_info_flags = ST_INFO_HW;
 		if ((m->m_flags & M_TSTMP_HPREC) != 0)
 			sti.st_info_flags |= ST_INFO_HW_HPREC;
 		*mp = sbcreatecontrol(&sti, sizeof(sti), SCM_TIME_INFO,
 		    SOL_SOCKET, M_NOWAIT);
 		if (*mp != NULL)
 			mp = &(*mp)->m_next;
 	}
 	/* Destination address of the packet. */
 	if (inp->inp_flags & INP_RECVDSTADDR) {
 		*mp = sbcreatecontrol(&ip->ip_dst, sizeof(struct in_addr),
 		    IP_RECVDSTADDR, IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* TTL of the packet. */
 	if (inp->inp_flags & INP_RECVTTL) {
 		*mp = sbcreatecontrol(&ip->ip_ttl, sizeof(u_char), IP_RECVTTL,
 		    IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #ifdef notyet
 	/* XXX
 	 * Moving these out of udp_input() made them even more broken
 	 * than they already were.
 	 */
 	/* options were tossed already */
 	if (inp->inp_flags & INP_RECVOPTS) {
 		*mp = sbcreatecontrol(opts_deleted_above,
 		    sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* ip_srcroute doesn't do what we want here, need to fix */
 	if (inp->inp_flags & INP_RECVRETOPTS) {
 		*mp = sbcreatecontrol(ip_srcroute(m), sizeof(struct in_addr),
 		    IP_RECVRETOPTS, IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 #endif
 	/*
 	 * Receive interface, reported as a link-level sockaddr.  The
 	 * interface's own sockaddr_dl is copied when it fits into the
 	 * local buffer; otherwise, or when no receive interface is
 	 * recorded, an empty AF_LINK address with index 0 is built.
 	 */
 	if (inp->inp_flags & INP_RECVIF) {
 		struct ifnet *ifp;
 		struct sdlbuf {
 			struct sockaddr_dl sdl;
 			u_char	pad[32];
 		} sdlbuf;
 		struct sockaddr_dl *sdp;
 		struct sockaddr_dl *sdl2 = &sdlbuf.sdl;
 
 		if ((ifp = m->m_pkthdr.rcvif)) {
 			sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr;
 			/*
 			 * Change our mind and don't try copy.
 			 */
 			if (sdp->sdl_family != AF_LINK ||
 			    sdp->sdl_len > sizeof(sdlbuf)) {
 				goto makedummy;
 			}
 			bcopy(sdp, sdl2, sdp->sdl_len);
 		} else {
 makedummy:
 			sdl2->sdl_len =
 			    offsetof(struct sockaddr_dl, sdl_data[0]);
 			sdl2->sdl_family = AF_LINK;
 			sdl2->sdl_index = 0;
 			sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0;
 		}
 		*mp = sbcreatecontrol(sdl2, sdl2->sdl_len, IP_RECVIF,
 		    IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 	/* TOS byte of the packet. */
 	if (inp->inp_flags & INP_RECVTOS) {
 		*mp = sbcreatecontrol(&ip->ip_tos, sizeof(u_char), IP_RECVTOS,
 		    IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 	/* Flow id and flow (hash) type, as two separate messages. */
 	if (inp->inp_flags2 & INP_RECVFLOWID) {
 		uint32_t flowid, flow_type;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		/*
 		 * XXX should handle the failure of one or the
 		 * other - don't populate both?
 		 */
 		*mp = sbcreatecontrol(&flowid, sizeof(uint32_t), IP_FLOWID,
 		    IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 		*mp = sbcreatecontrol(&flow_type, sizeof(uint32_t),
 		    IP_FLOWTYPE, IPPROTO_IP, M_NOWAIT);
 		if (*mp)
 			mp = &(*mp)->m_next;
 	}
 
 #ifdef	RSS
 	/* RSS bucket id, only when the flow hash maps to a bucket. */
 	if (inp->inp_flags2 & INP_RECVRSSBUCKETID) {
 		uint32_t flowid, flow_type;
 		uint32_t rss_bucketid;
 
 		flowid = m->m_pkthdr.flowid;
 		flow_type = M_HASHTYPE_GET(m);
 
 		if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) {
 			*mp = sbcreatecontrol(&rss_bucketid, sizeof(uint32_t),
 			    IP_RSSBUCKETID, IPPROTO_IP, M_NOWAIT);
 			if (*mp)
 				mp = &(*mp)->m_next;
 		}
 	}
 #endif
 }
 
 /*
  * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the
  * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on
  * locking.  This code remains in ip_input.c as ip_mroute.c is optionally
  * compiled.
  */
 /* Non-zero while this vnet's RSVP socket contributes to V_rsvp_on. */
 VNET_DEFINE_STATIC(int, ip_rsvp_on);
 /* Socket registered by ip_rsvp_init(), or NULL when none is. */
 VNET_DEFINE(struct socket *, ip_rsvpd);
 
 #define	V_ip_rsvp_on		VNET(ip_rsvp_on)
 
 /*
  * Register so as the RSVP daemon socket.  Only one socket may be
  * registered at a time; a second attempt fails with EADDRINUSE.
  * Returns 0 on success.
  */
 int
 ip_rsvp_init(struct socket *so)
 {
 
 	if (V_ip_rsvpd != NULL)
 		return (EADDRINUSE);
 	V_ip_rsvpd = so;
 
 	/*
 	 * The private flag guards the shared RSVP counter: it is bumped
 	 * only on the first transition, so it can never be incremented
 	 * twice should something slip up.
 	 */
 	if (V_ip_rsvp_on)
 		return (0);
 	V_ip_rsvp_on = 1;
 	V_rsvp_on++;
 	return (0);
 }
 
 /*
  * Forget the registered RSVP daemon socket.  Always returns 0.
  */
 int
 ip_rsvp_done(void)
 {
 
 	V_ip_rsvpd = NULL;
 
 	/*
 	 * The private flag guards the shared RSVP counter: it is dropped
 	 * only if this code raised it earlier, so it can never be
 	 * decremented twice should something slip up.
 	 */
 	if (!V_ip_rsvp_on)
 		return (0);
 	V_ip_rsvp_on = 0;
 	V_rsvp_on--;
 	return (0);
 }
 
 /*
  * Input routine for IPPROTO_RSVP.  The packet is detached from *mp and
  * then either passed on (with *mp restored) or freed.  Always returns
  * IPPROTO_DONE.
  */
 int
 rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *pkt = *mp;
 
 	*mp = NULL;
 
 	if (rsvp_input_p) {
 		/* Call the real one if loaded. */
 		*mp = pkt;
 		rsvp_input_p(mp, offp, proto);
 	} else if (V_rsvp_on && V_ip_rsvpd != NULL) {
 		/* RSVP is on and a daemon socket exists: use raw IP input. */
 		*mp = pkt;
 		rip_input(mp, offp, proto);
 	} else {
 		/*
 		 * Packets can still arrive with rsvp_on = 0 if there is a
 		 * local member of the group the RSVP packet is addressed
 		 * to; those are thrown away, as are packets for which no
 		 * daemon socket is registered.
 		 */
 		m_freem(pkt);
 	}
 	return (IPPROTO_DONE);
 }
diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
index 383f2dfc076e..6cabef8a1b16 100644
--- a/sys/netinet/ip_mroute.c
+++ b/sys/netinet/ip_mroute.c
@@ -1,2904 +1,2905 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989 Stephen Deering
  * Copyright (c) 1992, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
  */
 
 /*
  * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  * Modified by Van Jacobson, LBL, January 1993
  * Modified by Ajit Thyagarajan, PARC, August 1993
  * Modified by Bill Fenner, PARC, April 1995
  * Modified by Ahmed Helmy, SGI, June 1996
  * Modified by George Edmond Eddy (Rusty), ISI, February 1998
  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
  * Modified by Hitoshi Asaeda, WIDE, August 2000
  * Modified by Pavlin Radoslavov, ICSI, October 2002
  * Modified by Wojciech Macek, Semihalf, May 2021
  *
  * MROUTING Revision: 3.5
  * and PIM-SMv2 and PIM-DM support, advanced API support,
  * bandwidth metering and signaling
  */
 
 /*
  * TODO: Prefix functions with ipmf_.
  * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol
  * domain attachment (if_afdata) so we can track consumers of that service.
  * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT,
  * move it to socket options.
  * TODO: Cleanup LSRR removal further.
  * TODO: Push RSVP stubs into raw_ip.c.
  * TODO: Use bitstring.h for vif set.
  * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded.
  * TODO: Sync ip6_mroute.c with this file.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_mrouting.h"
 
 #define _PIM_VT 1
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/stddef.h>
 #include <sys/condvar.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/counter.h>
 #include <machine/atomic.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/igmp.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/pim.h>
 #include <netinet/pim_var.h>
 #include <netinet/udp.h>
 
 #include <machine/in_cksum.h>
 
 #ifndef KTR_IPMF
 #define KTR_IPMF KTR_INET
 #endif
 
 #define		VIFI_INVALID	((vifi_t) -1)
 
 static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache");
 
 /*
  * Locking.  We use two locks: one for the virtual interface table and
  * one for the forwarding table.  These locks may be nested in which case
  * the VIF lock must always be taken first.  Note that each lock is used
  * to cover not only the specific data structure but also related data
  * structures.
  */
 
 /*
  * The reader/writer lock behind every MRW_* wrapper below.
  * NOTE(review): the comment above speaks of two locks, but this is the only
  * rwlock defined in this part of the file -- confirm the comment is current.
  */
 static struct rwlock mrouter_mtx;
 /* Shared (read) and exclusive (write) acquisition. */
 #define	MRW_RLOCK()		rw_rlock(&mrouter_mtx)
 #define	MRW_WLOCK()		rw_wlock(&mrouter_mtx)
 /* Release; MRW_UNLOCK() uses rw_unlock() instead of a mode-specific call. */
 #define	MRW_RUNLOCK()	rw_runlock(&mrouter_mtx)
 #define	MRW_WUNLOCK()	rw_wunlock(&mrouter_mtx)
 #define	MRW_UNLOCK()	rw_unlock(&mrouter_mtx)
 /* Assertions: held in any mode (RA_LOCKED) / held exclusively (RA_WLOCKED). */
 #define	MRW_LOCK_ASSERT()	rw_assert(&mrouter_mtx, RA_LOCKED)
 #define	MRW_WLOCK_ASSERT()	rw_assert(&mrouter_mtx, RA_WLOCKED)
 /* Read-to-write upgrade attempt and write-ownership query. */
 #define	MRW_LOCK_TRY_UPGRADE()	rw_try_upgrade(&mrouter_mtx)
 #define	MRW_WOWNED()	rw_wowned(&mrouter_mtx)
 /* Lock lifetime management. */
 #define	MRW_LOCK_INIT()						\
 	rw_init(&mrouter_mtx, "IPv4 multicast forwarding")
 #define	MRW_LOCK_DESTROY()	rw_destroy(&mrouter_mtx)
 
 static int ip_mrouter_cnt;	/* # of vnets with active mrouters */
 static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */
 
 VNET_PCPUSTAT_DEFINE_STATIC(struct mrtstat, mrtstat);
 VNET_PCPUSTAT_SYSINIT(mrtstat);
 VNET_PCPUSTAT_SYSUNINIT(mrtstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat,
     mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
     "netinet/ip_mroute.h)");
 
 VNET_DEFINE_STATIC(u_long, mfchash);
 #define	V_mfchash		VNET(mfchash)
 #define	MFCHASH(a, g)							\
 	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^ \
 	  ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash)
 #define	MFCHASHSIZE	256
 
 static u_long mfchashsize;			/* Hash size */
 VNET_DEFINE_STATIC(u_char *, nexpire);		/* 0..mfchashsize-1 */
 #define	V_nexpire		VNET(nexpire)
 VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
 #define	V_mfchashtbl		VNET(mfchashtbl)
 VNET_DEFINE_STATIC(struct taskqueue *, task_queue);
 #define	V_task_queue		VNET(task_queue)
 VNET_DEFINE_STATIC(struct task, task);
 #define	V_task		VNET(task)
 
 VNET_DEFINE_STATIC(vifi_t, numvifs);
 #define	V_numvifs		VNET(numvifs)
 VNET_DEFINE_STATIC(struct vif *, viftable);
 #define	V_viftable		VNET(viftable)
 
 static eventhandler_tag if_detach_event_tag = NULL;
 
 VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
 #define	V_expire_upcalls_ch	VNET(expire_upcalls_ch)
 
 VNET_DEFINE_STATIC(struct mtx, buf_ring_mtx);
 #define	V_buf_ring_mtx	VNET(buf_ring_mtx)
 
 #define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
 #define		UPCALL_EXPIRE	6		/* number of timeouts	*/
 
 /*
  * Bandwidth meter variables and constants
  */
 static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
 
 /*
  * Pending upcalls are stored in a ring which is flushed when
  * full, or periodically
  */
 VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch);
 #define	V_bw_upcalls_ch		VNET(bw_upcalls_ch)
 VNET_DEFINE_STATIC(struct buf_ring *, bw_upcalls_ring);
 #define	V_bw_upcalls_ring    	VNET(bw_upcalls_ring)
 VNET_DEFINE_STATIC(struct mtx, bw_upcalls_ring_mtx);
 #define	V_bw_upcalls_ring_mtx    	VNET(bw_upcalls_ring_mtx)
 
 #define BW_UPCALLS_PERIOD (hz)		/* periodical flush of bw upcalls */
 
 VNET_PCPUSTAT_DEFINE_STATIC(struct pimstat, pimstat);
 VNET_PCPUSTAT_SYSINIT(pimstat);
 VNET_PCPUSTAT_SYSUNINIT(pimstat);
 
 SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "PIM");
 SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat,
     pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)");
 
 static u_long	pim_squelch_wholepkt = 0;
 SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
     &pim_squelch_wholepkt, 0,
     "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");
 
 static const struct encaptab *pim_encap_cookie;
 static int pim_encapcheck(const struct mbuf *, int, int, void *);
 static int pim_input(struct mbuf *, int, int, void *);
 
 extern int in_mcast_loop;
 
 static const struct encap_config ipv4_encap_cfg = {
 	.proto = IPPROTO_PIM,
 	.min_length = sizeof(struct ip) + PIM_MINLEN,
 	.exact_match = 8,
 	.check = pim_encapcheck,
 	.input = pim_input
 };
 
 /*
  * Note: the PIM Register encapsulation adds the following in front of a
  * data packet:
  *
  * struct pim_encap_hdr {
  *    struct ip ip;
  *    struct pim_encap_pimhdr  pim;
  * }
  *
  */
 
 struct pim_encap_pimhdr {
 	struct pim pim;
 	uint32_t   flags;
 };
 #define		PIM_ENCAP_TTL	64
 
 /*
  * Template outer IP header for PIM encapsulation, initialised positionally.
  * The first two initialisers (header length in 32-bit words and IP version)
  * are swapped according to BYTE_ORDER.
  * NOTE(review): presumably copied and completed (length, addresses, checksum)
  * by the PIM Register send path, which is not visible in this chunk.
  */
 static struct ip pim_encap_iphdr = {
 #if BYTE_ORDER == LITTLE_ENDIAN
 	sizeof(struct ip) >> 2,
 	IPVERSION,
 #else
 	IPVERSION,
 	sizeof(struct ip) >> 2,
 #endif
 	0,			/* tos */
 	sizeof(struct ip),	/* total length */
 	0,			/* id */
 	0,			/* frag offset */
 	PIM_ENCAP_TTL,		/* ttl */
 	IPPROTO_PIM,		/* protocol */
 	0,			/* checksum */
 };
 
 static struct pim_encap_pimhdr pim_encap_pimhdr = {
     {
 	PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
 	0,			/* reserved */
 	0,			/* checksum */
     },
     0				/* flags */
 };
 
 VNET_DEFINE_STATIC(vifi_t, reg_vif_num) = VIFI_INVALID;
 #define	V_reg_vif_num		VNET(reg_vif_num)
 VNET_DEFINE_STATIC(struct ifnet *, multicast_register_if);
 #define	V_multicast_register_if	VNET(multicast_register_if)
 
 /*
  * Private variables.
  */
 
 static u_long	X_ip_mcast_src(int);
 static int	X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
 		    struct ip_moptions *);
 static int	X_ip_mrouter_done(void);
 static int	X_ip_mrouter_get(struct socket *, struct sockopt *);
 static int	X_ip_mrouter_set(struct socket *, struct sockopt *);
 static int	X_legal_vif_num(int);
 static int	X_mrt_ioctl(u_long, caddr_t, int);
 
 static int	add_bw_upcall(struct bw_upcall *);
 static int	add_mfc(struct mfcctl2 *);
 static int	add_vif(struct vifctl *);
 static void	bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
 static void	bw_meter_geq_receive_packet(struct bw_meter *, int,
 		    struct timeval *);
 static void	bw_upcalls_send(void);
 static int	del_bw_upcall(struct bw_upcall *);
 static int	del_mfc(struct mfcctl2 *);
 static int	del_vif(vifi_t);
 static int	del_vif_locked(vifi_t, struct ifnet **);
 static void	expire_bw_upcalls_send(void *);
 static void	expire_mfc(struct mfc *);
 static void	expire_upcalls(void *);
 static void	free_bw_list(struct bw_meter *);
 static int	get_sg_cnt(struct sioc_sg_req *);
 static int	get_vif_cnt(struct sioc_vif_req *);
 static void	if_detached_event(void *, struct ifnet *);
 static int	ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
 static int	ip_mrouter_init(struct socket *, int);
 static __inline struct mfc *
 		mfc_find(struct in_addr *, struct in_addr *);
 static void	phyint_send(struct ip *, struct vif *, struct mbuf *);
 static struct mbuf *
 		pim_register_prepare(struct ip *, struct mbuf *);
 static int	pim_register_send(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static int	pim_register_send_rp(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static int	pim_register_send_upcall(struct ip *, struct vif *,
 		    struct mbuf *, struct mfc *);
 static void	send_packet(struct vif *, struct mbuf *);
 static int	set_api_config(uint32_t *);
 static int	set_assert(int);
 static int	socket_send(struct socket *, struct mbuf *,
 		    struct sockaddr_in *);
 
 /*
  * Kernel multicast forwarding API capabilities and setup.
  * If more API capabilities are added to the kernel, they should be
  * recorded in `mrt_api_support'.
  */
 #define MRT_API_VERSION		0x0305
 
 static const int mrt_api_version = MRT_API_VERSION;
 static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
 					 MRT_MFC_FLAGS_BORDER_VIF |
 					 MRT_MFC_RP |
 					 MRT_MFC_BW_UPCALL);
 VNET_DEFINE_STATIC(uint32_t, mrt_api_config);
 #define	V_mrt_api_config	VNET(mrt_api_config)
 VNET_DEFINE_STATIC(int, pim_assert_enabled);
 #define	V_pim_assert_enabled	VNET(pim_assert_enabled)
 static struct timeval pim_assert_interval = { 3, 0 };	/* Rate limit */
 
 /*
  * Look up the forwarding cache entry for origin `o' and multicast group `g'.
  *
  * Only entries whose stall ring is empty are considered a match.  Returns
  * the entry, or NULL when the hash chain holds no such entry.  Statistics
  * must be updated by the caller.
  */
 static __inline struct mfc *
 mfc_find(struct in_addr *o, struct in_addr *g)
 {
 	struct mfc *entry;
 
 	/*
 	 * Either a read or a write hold is acceptable here; picking the
 	 * right mode is the caller's responsibility.
 	 */
 	MRW_LOCK_ASSERT();
 
 	LIST_FOREACH(entry, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
 		if (!in_hosteq(entry->mfc_origin, *o))
 			continue;
 		if (!in_hosteq(entry->mfc_mcastgrp, *g))
 			continue;
 		if (buf_ring_empty(entry->mfc_stall_ring))
 			return (entry);
 	}
 
 	return (NULL);
 }
 
 /*
  * Allocate a zeroed forwarding cache entry together with its stall ring.
  *
  * Both allocations are M_NOWAIT.  Returns NULL if either fails; a partially
  * built entry is released before returning.
  */
 static __inline struct mfc *
 mfc_alloc(void)
 {
 	struct mfc *entry;
 
 	entry = malloc(sizeof(*entry), M_MRTABLE, M_NOWAIT | M_ZERO);
 	if (entry != NULL) {
 		entry->mfc_stall_ring = buf_ring_alloc(MAX_UPQ, M_MRTABLE,
 		    M_NOWAIT, &V_buf_ring_mtx);
 		if (entry->mfc_stall_ring == NULL) {
 			free(entry, M_MRTABLE);
 			entry = NULL;
 		}
 	}
 
 	return (entry);
 }
 
 /*
  * Handle MRT setsockopt commands to modify the multicast forwarding tables.
  */
 static int
 X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
 {
     int	error, optval;
     vifi_t	vifi;
     struct	vifctl vifc;
     struct	mfcctl2 mfc;
     struct	bw_upcall bw_upcall;
     uint32_t	i;
 
     if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
 	return EPERM;
 
     error = 0;
     switch (sopt->sopt_name) {
     case MRT_INIT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	error = ip_mrouter_init(so, optval);
 	break;
 
     case MRT_DONE:
 	error = ip_mrouter_done();
 	break;
 
     case MRT_ADD_VIF:
 	error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
 	if (error)
 	    break;
 	error = add_vif(&vifc);
 	break;
 
     case MRT_DEL_VIF:
 	error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
 	if (error)
 	    break;
 	error = del_vif(vifi);
 	break;
 
     case MRT_ADD_MFC:
     case MRT_DEL_MFC:
 	/*
 	 * select data size depending on API version.
 	 */
 	if (sopt->sopt_name == MRT_ADD_MFC &&
 		V_mrt_api_config & MRT_API_FLAGS_ALL) {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
 				sizeof(struct mfcctl2));
 	} else {
 	    error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
 				sizeof(struct mfcctl));
 	    bzero((caddr_t)&mfc + sizeof(struct mfcctl),
 			sizeof(mfc) - sizeof(struct mfcctl));
 	}
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_MFC)
 	    error = add_mfc(&mfc);
 	else
 	    error = del_mfc(&mfc);
 	break;
 
     case MRT_ASSERT:
 	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
 	if (error)
 	    break;
 	set_assert(optval);
 	break;
 
     case MRT_API_CONFIG:
 	error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
 	if (!error)
 	    error = set_api_config(&i);
 	if (!error)
 	    error = sooptcopyout(sopt, &i, sizeof i);
 	break;
 
     case MRT_ADD_BW_UPCALL:
     case MRT_DEL_BW_UPCALL:
 	error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
 				sizeof bw_upcall);
 	if (error)
 	    break;
 	if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
 	    error = add_bw_upcall(&bw_upcall);
 	else
 	    error = del_bw_upcall(&bw_upcall);
 	break;
 
     default:
 	error = EOPNOTSUPP;
 	break;
     }
     return error;
 }
 
 /*
  * Handle MRT getsockopt commands.
  *
  * Copies the requested value out through `sopt' and returns the result of
  * that copy, or EOPNOTSUPP for an option that is not handled here.
  */
 static int
 X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
 {
 	int rc;
 
 	switch (sopt->sopt_name) {
 	case MRT_VERSION:
 		rc = sooptcopyout(sopt, &mrt_api_version,
 		    sizeof(mrt_api_version));
 		break;
 
 	case MRT_ASSERT:
 		rc = sooptcopyout(sopt, &V_pim_assert_enabled,
 		    sizeof(V_pim_assert_enabled));
 		break;
 
 	case MRT_API_SUPPORT:
 		rc = sooptcopyout(sopt, &mrt_api_support,
 		    sizeof(mrt_api_support));
 		break;
 
 	case MRT_API_CONFIG:
 		rc = sooptcopyout(sopt, &V_mrt_api_config,
 		    sizeof(V_mrt_api_config));
 		break;
 
 	default:
 		rc = EOPNOTSUPP;
 		break;
 	}
 
 	return (rc);
 }
 
 /*
  * Handle ioctl commands that read counters out of the forwarding state.
  *
  * Returns the priv_check() error if the caller lacks PRIV_NETINET_MROUTE,
  * EINVAL for an unknown command, otherwise the result of the counter
  * lookup.
  */
 static int
 X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
 {
 	int rc;
 
 	/*
 	 * Currently the only function calling this ioctl routine is
 	 * rtioctl_fib().  Typically only root can create the raw socket
 	 * needed to get here, but the request might be coming from a prison,
 	 * hence the explicit privilege check.
 	 */
 	rc = priv_check(curthread, PRIV_NETINET_MROUTE);
 	if (rc != 0)
 		return (rc);
 
 	switch (cmd) {
 	case SIOCGETVIFCNT:
 		rc = get_vif_cnt((struct sioc_vif_req *)data);
 		break;
 
 	case SIOCGETSGCNT:
 		rc = get_sg_cnt((struct sioc_sg_req *)data);
 		break;
 
 	default:
 		rc = EINVAL;
 		break;
 	}
 
 	return (rc);
 }
 
 /*
  * Report the packet, byte and wrong-interface counters of the forwarding
  * cache entry for the (source, group) pair given in `req'.
  *
  * When no entry is found all three counters are set to 0xffffffff and
  * EADDRNOTAVAIL is returned; otherwise the counters are filled in and 0 is
  * returned.
  */
 static int
 get_sg_cnt(struct sioc_sg_req *req)
 {
 	struct mfc *entry;
 
 	MRW_RLOCK();
 	entry = mfc_find(&req->src, &req->grp);
 	if (entry != NULL) {
 		req->pktcnt = entry->mfc_pkt_cnt;
 		req->bytecnt = entry->mfc_byte_cnt;
 		req->wrong_if = entry->mfc_wrong_if;
 		MRW_RUNLOCK();
 		return (0);
 	}
 	MRW_RUNLOCK();
 
 	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
 	return (EADDRNOTAVAIL);
 }
 
 /*
  * returns the input and output packet and byte counts on the vif provided
  */
 static int
 get_vif_cnt(struct sioc_vif_req *req)
 {
     vifi_t vifi = req->vifi;
 
     MRW_RLOCK();
     if (vifi >= V_numvifs) {
 	MRW_RUNLOCK();
 	return EINVAL;
     }
 
     mtx_lock_spin(&V_viftable[vifi].v_spin);
     req->icount = V_viftable[vifi].v_pkt_in;
     req->ocount = V_viftable[vifi].v_pkt_out;
     req->ibytes = V_viftable[vifi].v_bytes_in;
     req->obytes = V_viftable[vifi].v_bytes_out;
     mtx_unlock_spin(&V_viftable[vifi].v_spin);
     MRW_RUNLOCK();
 
     return 0;
 }
 
 /*
  * Tear down all multicast forwarding state that refers to `ifp'.
  * NOTE(review): presumably registered as an ifnet detach event handler (see
  * if_detach_event_tag); the registration is not visible in this chunk.
  *
  * Does nothing when no multicast routing socket is active.
  */
 static void
 if_detached_event(void *arg __unused, struct ifnet *ifp)
 {
     vifi_t vifi;
     u_long i, vifi_cnt = 0;	/* vifi_cnt: number of if_free() calls owed */
     struct ifnet *free_ptr;
 
     MRW_WLOCK();
 
     if (V_ip_mrouter == NULL) {
 	MRW_WUNLOCK();
 	return;
     }
 
     /*
      * Tear down multicast forwarder state associated with this ifnet.
      * 1. Walk the vif list, matching vifs against this ifnet.
      * 2. Walk the multicast forwarding cache (mfc) looking for
      *    inner matches with this vif's index.
      * 3. Expire any matching multicast forwarding cache entries.
      * 4. Free vif state. This should disable ALLMULTI on the interface.
      */
     for (vifi = 0; vifi < V_numvifs; vifi++) {
 	if (V_viftable[vifi].v_ifp != ifp)
 		continue;
 	for (i = 0; i < mfchashsize; i++) {
 		struct mfc *rt, *nrt;
 
 		/* _SAFE variant: expire_mfc() unlinks and frees rt. */
 		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 			if (rt->mfc_parent == vifi) {
 				expire_mfc(rt);
 			}
 		}
 	}
 	/* del_vif_locked() may lower V_numvifs, which bounds this loop. */
 	del_vif_locked(vifi, &free_ptr);
 	if (free_ptr != NULL)
 		vifi_cnt++;
     }
 
     MRW_WUNLOCK();
 
     /*
      * Free the ifnet.  free_ptr itself is not needed here because it is
      * the same pointer as ifp.  The free is repeated once for every vif
      * that handed back a pointer, in case the reference count is greater
      * than 1.
      */
     for (i = 0; i < vifi_cnt; i++)
 	    if_free(ifp);
 }
 
 /*
  * Task handler that flushes pending bandwidth upcalls.
  *
  * `arg' is the vnet to run in (ip_mrouter_init() passes curvnet to
  * TASK_INIT); `pending' is unused.  bw_upcalls_send() is called with the
  * mrouter lock write-held.
  */
 static void
 ip_mrouter_upcall_thread(void *arg, int pending __unused)
 {
 	CURVNET_SET((struct vnet *) arg);
 
 	MRW_WLOCK();
 	bw_upcalls_send();
 	MRW_WUNLOCK();
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Enable multicast forwarding.
  */
 static int
 ip_mrouter_init(struct socket *so, int version)
 {
 
     CTR2(KTR_IPMF, "%s: so %p", __func__, so);
 
     if (version != 1)
 	return ENOPROTOOPT;
 
     MRW_WLOCK();
 
     if (ip_mrouter_unloading) {
 	MRW_WUNLOCK();
 	return ENOPROTOOPT;
     }
 
     if (V_ip_mrouter != NULL) {
 	MRW_WUNLOCK();
 	return EADDRINUSE;
     }
 
     V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
 	HASH_NOWAIT);
 
     /* Create upcall ring */
     mtx_init(&V_bw_upcalls_ring_mtx, "mroute upcall buf_ring mtx", NULL, MTX_DEF);
     V_bw_upcalls_ring = buf_ring_alloc(BW_UPCALLS_MAX, M_MRTABLE,
 	M_NOWAIT, &V_bw_upcalls_ring_mtx);
     if (!V_bw_upcalls_ring) {
 	MRW_WUNLOCK();
 	return (ENOMEM);
     }
 
     TASK_INIT(&V_task, 0, ip_mrouter_upcall_thread, curvnet);
     taskqueue_cancel(V_task_queue, &V_task, NULL);
     taskqueue_unblock(V_task_queue);
 
     callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	curvnet);
     callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
 	curvnet);
 
     V_ip_mrouter = so;
     atomic_add_int(&ip_mrouter_cnt, 1);
 
     /* This is a mutex required by buf_ring init, but not used internally */
     mtx_init(&V_buf_ring_mtx, "mroute buf_ring mtx", NULL, MTX_DEF);
 
     MRW_WUNLOCK();
 
     CTR1(KTR_IPMF, "%s: done", __func__);
 
     return 0;
 }
 
 /*
  * Disable multicast forwarding.
  *
  * Unregisters the routing socket, drains the upcall task, releases the
  * upcall ring, the vif table contents and every forwarding cache entry, and
  * finally drops the ALLMULTI reference held on each physical interface.
  *
  * Returns EINVAL when no routing socket is registered, 0 otherwise.
  */
 static int
 X_ip_mrouter_done(void)
 {
 	struct ifnet **ifps;
 	int nifp;
 	u_long i;
 	vifi_t vifi;
 	struct bw_upcall *bu;
 
 	if (V_ip_mrouter == NULL)
 		return (EINVAL);
 
 	/*
 	 * Detach/disable hooks to the rest of the system.
 	 */
 	V_ip_mrouter = NULL;
 	atomic_subtract_int(&ip_mrouter_cnt, 1);
 	V_mrt_api_config = 0;
 
 	/*
 	 * Wait for all epoch sections to complete to ensure
 	 * V_ip_mrouter = NULL is visible to others.
 	 */
 	epoch_wait_preempt(net_epoch_preempt);
 
 	/* Stop and drain task queue */
 	taskqueue_block(V_task_queue);
 	while (taskqueue_cancel(V_task_queue, &V_task, NULL)) {
 		taskqueue_drain(V_task_queue, &V_task);
 	}
 
 	/* Sleeping allocation, so it must happen before the lock is taken. */
 	ifps = malloc(MAXVIFS * sizeof(*ifps), M_TEMP, M_WAITOK);
 
 	MRW_WLOCK();
 	taskqueue_cancel(V_task_queue, &V_task, NULL);
 
 	/* Destroy upcall ring */
 	while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) {
 		free(bu, M_MRTABLE);
 	}
 	buf_ring_free(V_bw_upcalls_ring, M_MRTABLE);
 	mtx_destroy(&V_bw_upcalls_ring_mtx);
 
 	/*
 	 * For each phyint in use, prepare to disable promiscuous reception
 	 * of all IP multicasts.  Defer the actual call until the lock is
 	 * released; just record the list of interfaces while locked.  Some
 	 * interfaces use sx locks in their ioctl routines, which is not
 	 * allowed while holding a non-sleepable lock.
 	 *
 	 * Every in-use slot also owns a spin mutex initialised by add_vif();
 	 * destroy it here, as del_vif_locked() does, before the table is
 	 * wiped below.
 	 */
 	KASSERT(V_numvifs <= MAXVIFS, ("More vifs than possible"));
 	for (vifi = 0, nifp = 0; vifi < V_numvifs; vifi++) {
 		if (in_nullhost(V_viftable[vifi].v_lcl_addr))
 			continue;
 		if (!(V_viftable[vifi].v_flags &
 		    (VIFF_TUNNEL | VIFF_REGISTER)))
 			ifps[nifp++] = V_viftable[vifi].v_ifp;
 		mtx_destroy(&V_viftable[vifi].v_spin);
 	}
 	/*
 	 * NOTE(review): the ifnet of a VIFF_REGISTER vif is neither freed
 	 * here nor is V_multicast_register_if reset, unlike in del_vif() --
 	 * confirm whether that is intended.
 	 */
 	bzero((caddr_t)V_viftable, sizeof(*V_viftable) * MAXVIFS);
 	V_numvifs = 0;
 	V_pim_assert_enabled = 0;
 
 	callout_stop(&V_expire_upcalls_ch);
 	callout_stop(&V_bw_upcalls_ch);
 
 	/*
 	 * Free all multicast forwarding cache entries.
 	 * Do not use hashdestroy(), as we must perform other cleanup.
 	 */
 	for (i = 0; i < mfchashsize; i++) {
 		struct mfc *rt, *nrt;
 
 		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
 			expire_mfc(rt);
 		}
 	}
 	free(V_mfchashtbl, M_MRTABLE);
 	V_mfchashtbl = NULL;
 
 	bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);
 
 	V_reg_vif_num = VIFI_INVALID;
 
 	mtx_destroy(&V_buf_ring_mtx);
 
 	MRW_WUNLOCK();
 
 	/*
 	 * Now drop our claim on promiscuous multicast on the interfaces
 	 * recorded above.  This is safe to do now because ALLMULTI is
 	 * reference counted.
 	 */
 	for (vifi = 0; vifi < nifp; vifi++)
 		if_allmulti(ifps[vifi], 0);
 	free(ifps, M_TEMP);
 
 	CTR1(KTR_IPMF, "%s: done", __func__);
 
 	return (0);
 }
 
 /*
  * Set the per-vnet PIM assert processing switch.
  *
  * Only 0 and 1 are accepted; any other value returns EINVAL and leaves the
  * setting untouched.
  */
 static int
 set_assert(int i)
 {
 
 	if (i != 0 && i != 1)
 		return (EINVAL);
 
 	V_pim_assert_enabled = i;
 	return (0);
 }
 
 /*
  * Configure API capabilities.
  *
  * On entry *apival holds the capabilities requested by the caller.  On
  * success it is replaced by the subset that was accepted (the request
  * masked with mrt_api_support) and 0 is returned.  On failure *apival is
  * set to 0 and EPERM is returned.
  *
  * The definition is `static' to agree with the file's forward declaration.
  */
 static int
 set_api_config(uint32_t *apival)
 {
 	u_long i;
 
 	/*
 	 * We can set the API capabilities only if it is the first operation
 	 * after MRT_INIT. I.e.:
 	 *  - there are no vifs installed
 	 *  - pim_assert is not enabled
 	 *  - the MFC table is empty
 	 *
 	 * V_numvifs and the MFC table are changed under the write lock, so
 	 * all three conditions are evaluated with the read lock held.
 	 */
 	MRW_RLOCK();
 
 	if (V_numvifs > 0 || V_pim_assert_enabled) {
 		MRW_RUNLOCK();
 		*apival = 0;
 		return (EPERM);
 	}
 
 	for (i = 0; i < mfchashsize; i++) {
 		if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) {
 			MRW_RUNLOCK();
 			*apival = 0;
 			return (EPERM);
 		}
 	}
 
 	MRW_RUNLOCK();
 
 	V_mrt_api_config = *apival & mrt_api_support;
 	*apival = V_mrt_api_config;
 
 	return (0);
 }
 
 /*
  * Add a vif to the vif table.
  *
  * Returns:
  *   EINVAL         vif index out of range, or a rate limit was requested
  *   EADDRNOTAVAIL  null local address, or no interface owns the address
  *   EOPNOTSUPP     tunnel vif requested, or interface lacks IFF_MULTICAST
  *   EADDRINUSE     the table slot is already occupied
  *   0              success
  *   (or the error returned by if_allmulti())
  *
  * On the EADDRINUSE path everything acquired by this call is released
  * again: a register ifnet allocated here is freed, and the ALLMULTI
  * reference taken on a physical interface is dropped.
  */
 static int
 add_vif(struct vifctl *vifcp)
 {
 	struct vif *vifp;
 	struct sockaddr_in sin = {sizeof sin, AF_INET};
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	int error;
 	int claimed_reg_vif = 0;	/* V_reg_vif_num was set by this call */
 
 	/* Validate the index before forming a pointer into the table. */
 	if (vifcp->vifc_vifi >= MAXVIFS)
 		return (EINVAL);
 	vifp = &V_viftable[vifcp->vifc_vifi];
 
 	/* rate limiting is no longer supported by this code */
 	if (vifcp->vifc_rate_limit != 0) {
 		log(LOG_ERR, "rate limiting is no longer supported\n");
 		return (EINVAL);
 	}
 
 	if (in_nullhost(vifcp->vifc_lcl_addr))
 		return (EADDRNOTAVAIL);
 
 	/* Find the interface with an address in AF_INET family */
 	if (vifcp->vifc_flags & VIFF_REGISTER) {
 		/*
 		 * XXX: Because VIFF_REGISTER does not really need a valid
 		 * local interface (e.g. it could be 127.0.0.2), we don't
 		 * check its address.
 		 */
 		ifp = NULL;
 	} else {
 		struct epoch_tracker et;
 
 		sin.sin_addr = vifcp->vifc_lcl_addr;
 		NET_EPOCH_ENTER(et);
 		ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
 		if (ifa == NULL) {
 			NET_EPOCH_EXIT(et);
 			return (EADDRNOTAVAIL);
 		}
 		ifp = ifa->ifa_ifp;
 		/* XXX FIXME we need to take a ref on ifp and cleanup properly! */
 		NET_EPOCH_EXIT(et);
 	}
 
 	if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
 		CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__);
 		return (EOPNOTSUPP);
 	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
 		ifp = V_multicast_register_if = if_alloc(IFT_LOOP);
 		CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp);
 		if (V_reg_vif_num == VIFI_INVALID) {
 			if_initname(V_multicast_register_if, "register_vif", 0);
 			V_reg_vif_num = vifcp->vifc_vifi;
 			claimed_reg_vif = 1;
 		}
 	} else {		/* Make sure the interface supports multicast */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Enable promiscuous reception of all IP multicasts from the if */
 		error = if_allmulti(ifp, 1);
 		if (error)
 			return (error);
 	}
 
 	MRW_WLOCK();
 
 	if (!in_nullhost(vifp->v_lcl_addr)) {
 		/*
 		 * Slot already in use: back out whatever this call set up.
 		 * Only the register ifnet was allocated here and may be
 		 * freed; a physical interface is merely borrowed, so it just
 		 * loses the ALLMULTI reference taken above.
 		 */
 		if (vifcp->vifc_flags & VIFF_REGISTER) {
 			V_multicast_register_if = NULL;
 			if (claimed_reg_vif)
 				V_reg_vif_num = VIFI_INVALID;
 		}
 		MRW_WUNLOCK();
 		/* Both calls are made after unlocking; if_allmulti() may sleep. */
 		if (vifcp->vifc_flags & VIFF_REGISTER) {
 			if (ifp != NULL)
 				if_free(ifp);
 		} else {
 			if_allmulti(ifp, 0);
 		}
 		return (EADDRINUSE);
 	}
 
 	vifp->v_flags     = vifcp->vifc_flags;
 	vifp->v_threshold = vifcp->vifc_threshold;
 	vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
 	vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
 	vifp->v_ifp       = ifp;
 	/* initialize per vif pkt counters */
 	vifp->v_pkt_in    = 0;
 	vifp->v_pkt_out   = 0;
 	vifp->v_bytes_in  = 0;
 	vifp->v_bytes_out = 0;
 	/* The name buffer lives in the vif, so it outlives this call. */
 	sprintf(vifp->v_spin_name, "BM[%d] spin", vifcp->vifc_vifi);
 	mtx_init(&vifp->v_spin, vifp->v_spin_name, NULL, MTX_SPIN);
 
 	/* Adjust numvifs up if the vifi is higher than numvifs */
 	if (V_numvifs <= vifcp->vifc_vifi)
 		V_numvifs = vifcp->vifc_vifi + 1;
 
 	MRW_WUNLOCK();
 
 	CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__,
 	    (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr),
 	    (int)vifcp->vifc_threshold);
 
 	return (0);
 }
 
 /*
  * Delete a vif from the vif table.  The mrouter lock must be write-held.
  *
  * *ifp_free is always written: it is NULL unless the vif is a
  * VIFF_REGISTER vif with an ifnet attached, in which case it receives that
  * ifnet so the caller can if_free() it after dropping the lock.
  *
  * Returns EINVAL if `vifi' is not below V_numvifs, EADDRNOTAVAIL if the
  * slot is unused (null local address), 0 on success.
  */
 static int
 del_vif_locked(vifi_t vifi, struct ifnet **ifp_free)
 {
     struct vif *vifp;
 
     *ifp_free = NULL;
 
     MRW_WLOCK_ASSERT();
 
     if (vifi >= V_numvifs) {
 	return EINVAL;
     }
     vifp = &V_viftable[vifi];
     if (in_nullhost(vifp->v_lcl_addr)) {
 	return EADDRNOTAVAIL;
     }
 
     /* Physical interface: drop the ALLMULTI reference taken by add_vif(). */
     if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
 	if_allmulti(vifp->v_ifp, 0);
 
     if (vifp->v_flags & VIFF_REGISTER) {
 	V_reg_vif_num = VIFI_INVALID;
 	if (vifp->v_ifp) {
 	    if (vifp->v_ifp == V_multicast_register_if)
 	        V_multicast_register_if = NULL;
 	    /* Hand the register ifnet back to the caller for freeing. */
 	    *ifp_free = vifp->v_ifp;
 	}
     }
 
     mtx_destroy(&vifp->v_spin);
 
     /* A zeroed slot has a null local address, i.e. it reads as unused. */
     bzero((caddr_t)vifp, sizeof (*vifp));
 
     CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi);
 
     /* Adjust numvifs down */
     for (vifi = V_numvifs; vifi > 0; vifi--)
 	if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr))
 	    break;
     V_numvifs = vifi;
 
     return 0;
 }
 
 /*
  * Delete vif `vifi', taking the mrouter write lock around the table update.
  *
  * Any ifnet handed back by del_vif_locked() is freed after the lock has
  * been released.  Returns the result of del_vif_locked().
  */
 static int
 del_vif(vifi_t vifi)
 {
 	struct ifnet *reg_ifp;
 	int rc;
 
 	MRW_WLOCK();
 	rc = del_vif_locked(vifi, &reg_ifp);
 	MRW_WUNLOCK();
 
 	if (reg_ifp != NULL)
 		if_free(reg_ifp);
 
 	return (rc);
 }
 
 /*
  * Refresh the forwarding parameters of an existing mfc entry from `mfccp'
  * without touching its counters or its (S,G) addresses.
  *
  * Per-vif flags are masked with the configured API capabilities; the RP
  * address is only taken over when MRT_MFC_RP is configured.
  */
 static void
 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
 	int vif;
 
 	rt->mfc_parent = mfccp->mfcc_parent;
 
 	for (vif = 0; vif < V_numvifs; vif++) {
 		rt->mfc_ttls[vif] = mfccp->mfcc_ttls[vif];
 		rt->mfc_flags[vif] = mfccp->mfcc_flags[vif] &
 		    V_mrt_api_config & MRT_MFC_FLAGS_ALL;
 	}
 
 	/* set the RP address */
 	if ((V_mrt_api_config & MRT_MFC_RP) == 0)
 		rt->mfc_rp.s_addr = INADDR_ANY;
 	else
 		rt->mfc_rp = mfccp->mfcc_rp;
 }
 
 /*
  * Fully initialise an mfc entry from `mfccp': (S,G) addresses, forwarding
  * parameters, zeroed per-entry counters and a cleared last-assert time.
  */
 static void
 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
 {
 
 	rt->mfc_origin = mfccp->mfcc_origin;
 	rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;
 
 	update_mfc_params(rt, mfccp);
 
 	/* initialize pkt counters per src-grp */
 	rt->mfc_pkt_cnt = 0;
 	rt->mfc_byte_cnt = 0;
 	rt->mfc_wrong_if = 0;
 	timevalclear(&rt->mfc_last_assert);
 }
 
 static void
 expire_mfc(struct mfc *rt)
 {
 	struct rtdetq *rte;
 
 	MRW_WLOCK_ASSERT();
 
 	free_bw_list(rt->mfc_bw_meter_leq);
 	free_bw_list(rt->mfc_bw_meter_geq);
 
 	while (!buf_ring_empty(rt->mfc_stall_ring)) {
 		rte = buf_ring_dequeue_mc(rt->mfc_stall_ring);
 		if (rte) {
 			m_freem(rte->m);
 			free(rte, M_MRTABLE);
 		}
 	}
 	buf_ring_free(rt->mfc_stall_ring, M_MRTABLE);
 
 	LIST_REMOVE(rt, mfc_hash);
 	free(rt, M_MRTABLE);
 }
 
 /*
  * Add an mfc entry.
  *
  * Three cases are handled, in this order:
  *  1. An entry with an empty stall ring already exists: its forwarding
  *     parameters are updated in place.
  *  2. One or more entries with stalled packets exist (an upcall is
  *     outstanding): each is initialised, taken off the expiry accounting,
  *     and its queued packets are forwarded and then freed.
  *  3. Otherwise an entry is (re)initialised or newly allocated and inserted
  *     at the head of its hash chain.
  *
  * Returns 0 on success, ENOBUFS if a new entry could not be allocated.
  */
 static int
 add_mfc(struct mfcctl2 *mfccp)
 {
 	struct mfc *rt;
 	struct rtdetq *rte;
 	u_long hash = 0;
 	u_short nstl;
 	struct epoch_tracker et;
 
 	MRW_WLOCK();
 	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);
 
 	/* If an entry already exists, just update the fields */
 	if (rt) {
 		CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x",
 		    __func__, ntohl(mfccp->mfcc_origin.s_addr),
 		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 		    mfccp->mfcc_parent);
 		update_mfc_params(rt, mfccp);
 		MRW_WUNLOCK();
 		return (0);
 	}
 
 	/*
 	 * Find the entry for which the upcall was made and update
 	 */
 	nstl = 0;
 	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
 	NET_EPOCH_ENTER(et);
 	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
 		    !buf_ring_empty(rt->mfc_stall_ring)) {
 			CTR5(KTR_IPMF,
 			    "%s: add mfc orig 0x%08x group %lx parent %x qh %p",
 			    __func__, ntohl(mfccp->mfcc_origin.s_addr),
 			    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
 			    mfccp->mfcc_parent,
 			    rt->mfc_stall_ring);
 			if (nstl++)
 				CTR1(KTR_IPMF, "%s: multiple matches", __func__);
 
 			init_mfc_params(rt, mfccp);
 			rt->mfc_expire = 0;	/* Don't clean this guy up */
 			V_nexpire[hash]--;
 
 			/* Free queued packets, but attempt to forward them first. */
 			while (!buf_ring_empty(rt->mfc_stall_ring)) {
 				rte = buf_ring_dequeue_mc(rt->mfc_stall_ring);
 				/*
 				 * The dequeue can come back empty-handed;
 				 * re-check the ring as expire_mfc() does
 				 * instead of dereferencing NULL.
 				 */
 				if (rte == NULL)
 					continue;
 				if (rte->ifp != NULL)
 					ip_mdq(rte->m, rte->ifp, rt, -1);
 				m_freem(rte->m);
 				free(rte, M_MRTABLE);
 			}
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	/*
 	 * It is possible that an entry is being inserted without an upcall
 	 */
 	if (nstl == 0) {
 		CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__);
 		LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
 			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
 			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
 				init_mfc_params(rt, mfccp);
 				/* Only entries pending expiry are counted. */
 				if (rt->mfc_expire)
 					V_nexpire[hash]--;
 				rt->mfc_expire = 0;
 				break; /* XXX */
 			}
 		}
 
 		if (rt == NULL) {	/* no upcall, so make a new entry */
 			rt = mfc_alloc();
 			if (rt == NULL) {
 				MRW_WUNLOCK();
 				return (ENOBUFS);
 			}
 
 			init_mfc_params(rt, mfccp);
 
 			rt->mfc_expire = 0;
 			rt->mfc_bw_meter_leq = NULL;
 			rt->mfc_bw_meter_geq = NULL;
 
 			/* insert new entry at head of hash chain */
 			LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
 		}
 	}
 
 	MRW_WUNLOCK();
 
 	return (0);
 }
 
 /*
  * Delete the mfc entry for (mfcc_origin, mfcc_mcastgrp).
  *
  * Returns 0 on success, EADDRNOTAVAIL if mfc_find() has no such entry.
  *
  * Teardown is delegated to expire_mfc() so that the entry's stall ring is
  * released together with its bw_meter lists; freeing only the entry, as
  * this function used to do, left the ring allocated.
  */
 static int
 del_mfc(struct mfcctl2 *mfccp)
 {
 	struct in_addr origin;
 	struct in_addr mcastgrp;
 	struct mfc *rt;
 
 	origin = mfccp->mfcc_origin;
 	mcastgrp = mfccp->mfcc_mcastgrp;
 
 	CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__,
 	    ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
 
 	MRW_WLOCK();
 
 	rt = mfc_find(&origin, &mcastgrp);
 	if (rt == NULL) {
 		MRW_WUNLOCK();
 		return EADDRNOTAVAIL;
 	}
 
 	/*
 	 * Frees the bw_meter lists, drains and frees the stall ring,
 	 * unlinks the entry from its hash chain and frees it.
 	 */
 	expire_mfc(rt);
 
 	MRW_WUNLOCK();
 
 	return (0);
 }
 
 /*
  * Send a message to the routing daemon on the multicast routing socket.
  *
  * Appends mm, tagged with source address src, to the receive buffer of s.
  * Returns 0 once the message is queued and the reader has been woken up.
  * Returns -1, after freeing mm, when s is NULL or the append fails; in the
  * latter case the overflow is recorded on the socket.
  */
 static int
 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
 {
 	if (s == NULL) {
 		m_freem(mm);
 		return -1;
 	}
 
 	SOCKBUF_LOCK(&s->so_rcv);
 	if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
 	    NULL) == 0) {
 		/* No room for the message. */
 		soroverflow_locked(s);
 		m_freem(mm);
 		return -1;
 	}
 
 	sorwakeup_locked(s);
 	return 0;
 }
 
 /*
  * IP multicast forwarding function. This function assumes that the packet
  * pointed to by "ip" has arrived on (or is about to be sent to) the interface
  * pointed to by "ifp", and the packet is to be relayed to other networks
  * that have members of the packet's destination IP multicast group.
  *
  * The packet is returned unscathed to the caller, unless it is
  * erroneous, in which case a non-zero return value tells the caller to
  * discard it.
  */
 
 #define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
 
 static int
 X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
     struct ip_moptions *imo)
 {
 	struct mfc *rt;
 	int error;
 	vifi_t vifi;
 	struct mbuf *mb0;
 	struct rtdetq *rte;
 	u_long hash;
 	int hlen;
 
 	CTR3(KTR_IPMF, "ip_mforward: delete mfc orig 0x%08x group %lx ifp %p",
 	    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp);
 
 	if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
 	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
 		/*
 		 * Packet arrived via a physical interface or
 		 * an encapsulated tunnel or a register_vif.
 		 */
 	} else {
 		/*
 		 * Packet arrived through a source-route tunnel.
 		 * Source-route tunnels are no longer supported.
 		 */
 		return (1);
 	}
 
 	/*
 	 * BEGIN: MCAST ROUTING HOT PATH
 	 */
 	MRW_RLOCK();
 	if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) {
 		if (ip->ip_ttl < MAXTTL)
 			ip->ip_ttl++; /* compensate for -1 in *_send routines */
 		error = ip_mdq(m, ifp, NULL, vifi);
 		MRW_RUNLOCK();
 		return error;
 	}
 
 	/*
 	 * Don't forward a packet with time-to-live of zero or one,
 	 * or a packet destined to a local-only group.
 	 */
 	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
 		MRW_RUNLOCK();
 		return 0;
 	}
 
 	mfc_find_retry:
 	/*
 	 * Determine forwarding vifs from the forwarding cache table
 	 */
 	MRTSTAT_INC(mrts_mfc_lookups);
 	rt = mfc_find(&ip->ip_src, &ip->ip_dst);
 
 	/* Entry exists, so forward if necessary */
 	if (rt != NULL) {
 		error = ip_mdq(m, ifp, rt, -1);
 		/* Generic unlock here as we might release R or W lock */
 		MRW_UNLOCK();
 		return error;
 	}
 
 	/*
 	 * END: MCAST ROUTING HOT PATH
 	 */
 
 	/* Further processing must be done with WLOCK taken */
 	if ((MRW_WOWNED() == 0) && (MRW_LOCK_TRY_UPGRADE() == 0)) {
 		MRW_RUNLOCK();
 		MRW_WLOCK();
 		goto mfc_find_retry;
 	}
 
 	/*
 	 * If we don't have a route for packet's origin,
 	 * Make a copy of the packet & send message to routing daemon
 	 */
 	hlen = ip->ip_hl << 2;
 
 	MRTSTAT_INC(mrts_mfc_misses);
 	MRTSTAT_INC(mrts_no_route);
 	CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)",
 	    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr));
 
 	/*
 	 * Allocate mbufs early so that we don't do extra work if we are
 	 * just going to fail anyway.  Make sure to pullup the header so
 	 * that other people can't step on it.
 	 */
 	rte = (struct rtdetq*) malloc((sizeof *rte), M_MRTABLE,
 	    M_NOWAIT|M_ZERO);
 	if (rte == NULL) {
 		MRW_WUNLOCK();
 		return ENOBUFS;
 	}
 
 	mb0 = m_copypacket(m, M_NOWAIT);
 	if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen))
 		mb0 = m_pullup(mb0, hlen);
 	if (mb0 == NULL) {
 		free(rte, M_MRTABLE);
 		MRW_WUNLOCK();
 		return ENOBUFS;
 	}
 
 	/* is there an upcall waiting for this flow ? */
 	hash = MFCHASH(ip->ip_src, ip->ip_dst);
 	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash)
 	{
 		if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
 		    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
 		    !buf_ring_empty(rt->mfc_stall_ring))
 			break;
 	}
 
 	if (rt == NULL) {
 		int i;
 		struct igmpmsg *im;
 		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 		struct mbuf *mm;
 
 		/*
 		 * Locate the vifi for the incoming interface for this packet.
 		 * If none found, drop packet.
 		 */
 		for (vifi = 0; vifi < V_numvifs &&
 		    V_viftable[vifi].v_ifp != ifp; vifi++)
 			;
 		if (vifi >= V_numvifs) /* vif not found, drop packet */
 			goto non_fatal;
 
 		/* no upcall, so make a new entry */
 		rt = mfc_alloc();
 		if (rt == NULL)
 			goto fail;
 
 		/* Make a copy of the header to send to the user level process */
 		mm = m_copym(mb0, 0, hlen, M_NOWAIT);
 		if (mm == NULL)
 			goto fail1;
 
 		/*
 		 * Send message to routing daemon to install
 		 * a route into the kernel table
 		 */
 
 		im = mtod(mm, struct igmpmsg*);
 		im->im_msgtype = IGMPMSG_NOCACHE;
 		im->im_mbz = 0;
 		im->im_vif = vifi;
 
 		MRTSTAT_INC(mrts_upcalls);
 
 		k_igmpsrc.sin_addr = ip->ip_src;
 		if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
 			CTR0(KTR_IPMF, "ip_mforward: socket queue full");
 			MRTSTAT_INC(mrts_upq_sockfull);
 			fail1: free(rt, M_MRTABLE);
 			fail: free(rte, M_MRTABLE);
 			m_freem(mb0);
 			MRW_WUNLOCK();
 			return ENOBUFS;
 		}
 
 		/* insert new entry at head of hash chain */
 		rt->mfc_origin.s_addr = ip->ip_src.s_addr;
 		rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
 		rt->mfc_expire = UPCALL_EXPIRE;
 		V_nexpire[hash]++;
 		for (i = 0; i < V_numvifs; i++) {
 			rt->mfc_ttls[i] = 0;
 			rt->mfc_flags[i] = 0;
 		}
 		rt->mfc_parent = -1;
 
 		/* clear the RP address */
 		rt->mfc_rp.s_addr = INADDR_ANY;
 		rt->mfc_bw_meter_leq = NULL;
 		rt->mfc_bw_meter_geq = NULL;
 
 		/* initialize pkt counters per src-grp */
 		rt->mfc_pkt_cnt = 0;
 		rt->mfc_byte_cnt = 0;
 		rt->mfc_wrong_if = 0;
 		timevalclear(&rt->mfc_last_assert);
 
 		buf_ring_enqueue(rt->mfc_stall_ring, rte);
 
 		/* Add RT to hashtable as it didn't exist before */
 		LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
 	} else {
 		/* determine if queue has overflowed */
 		if (buf_ring_full(rt->mfc_stall_ring)) {
 			MRTSTAT_INC(mrts_upq_ovflw);
 			non_fatal: free(rte, M_MRTABLE);
 			m_freem(mb0);
 			MRW_WUNLOCK();
 			return (0);
 		}
 
 		buf_ring_enqueue(rt->mfc_stall_ring, rte);
 	}
 
 	rte->m = mb0;
 	rte->ifp = ifp;
 
 	MRW_WUNLOCK();
 
 	return 0;
 }
 
 /*
  * Clean up the cache entry if upcall is not serviced.
  *
  * Callout handler; arg is the vnet to operate in.  Walks every hash bucket
  * that has entries awaiting expiry, ages each entry that still has stalled
  * packets, expires those whose counter reaches zero, and re-arms itself.
  */
 static void
 expire_upcalls(void *arg)
 {
 	struct mfc *rt, *next;
 	u_long bucket;
 
 	CURVNET_SET((struct vnet *) arg);
 
 	/* This callout is always run with MRW_WLOCK taken. */
 
 	for (bucket = 0; bucket < mfchashsize; bucket++) {
 		if (V_nexpire[bucket] == 0)
 			continue;
 
 		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[bucket], mfc_hash, next) {
 			/* Nothing queued: nothing to expire. */
 			if (buf_ring_empty(rt->mfc_stall_ring))
 				continue;
 
 			/* Expiry disabled for this entry. */
 			if (rt->mfc_expire == 0)
 				continue;
 
 			/* Age the entry; keep it while time remains. */
 			if (--rt->mfc_expire > 0)
 				continue;
 
 			MRTSTAT_INC(mrts_cache_cleanups);
 			CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
 			    (u_long)ntohl(rt->mfc_origin.s_addr),
 			    (u_long)ntohl(rt->mfc_mcastgrp.s_addr));
 
 			expire_mfc(rt);
 		}
 	}
 
 	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
 	    curvnet);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Packet forwarding routine once entry in the cache is made.
  *
  * m        packet to forward; it is only copied, never consumed here
  * ifp      interface the packet arrived on
  * rt       forwarding cache entry; callers in this file pass NULL when
  *          xmt_vif names a valid vif
  * xmt_vif  vif to send on exclusively, or -1 to consult rt
  *
  * Returns 1 after sending on an explicitly requested vif, ENOBUFS when a
  * WRONGVIF upcall cannot be built or delivered, and 0 otherwise.
  * The mrouter lock and the net epoch must be held (asserted).
  */
 static int
 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	vifi_t vifi;
 	int plen = ntohs(ip->ip_len);
 
 	MRW_LOCK_ASSERT();
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * If xmt_vif is not -1, send on only the requested vif.
 	 *
 	 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
 	 */
 	if (xmt_vif < V_numvifs) {
 		if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER)
 			pim_register_send(ip, V_viftable + xmt_vif, m, rt);
 		else
 			phyint_send(ip, V_viftable + xmt_vif, m);
 		return 1;
 	}
 
 	/*
 	 * Don't forward if it didn't arrive from the parent vif for its origin.
 	 */
 	vifi = rt->mfc_parent;
 	if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) {
 		/*
 		 * vifi may be out of range on this path, so only index the
 		 * vif table for the trace when it is valid.
 		 */
 		CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)",
 		    __func__, ifp, (int)vifi,
 		    (vifi < V_numvifs) ? V_viftable[vifi].v_ifp : NULL);
 		MRTSTAT_INC(mrts_wrong_if);
 		++rt->mfc_wrong_if;
 		/*
 		 * If we are doing PIM assert processing, send a message
 		 * to the routing daemon.
 		 *
 		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
 		 * can complete the SPT switch, regardless of the type
 		 * of the iif (broadcast media, GRE tunnel, etc).
 		 */
 		if (V_pim_assert_enabled && (vifi < V_numvifs) &&
 		    V_viftable[vifi].v_ifp) {
 			if (ifp == V_multicast_register_if)
 				PIMSTAT_INC(pims_rcv_registers_wrongiif);
 
 			/* Get vifi for the incoming packet */
 			for (vifi = 0; vifi < V_numvifs &&
 			    V_viftable[vifi].v_ifp != ifp; vifi++)
 				;
 			if (vifi >= V_numvifs)
 				return 0;	/* The iif is not found: ignore the packet. */
 
 			if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
 				return 0;	/* WRONGVIF disabled: ignore the packet */
 
 			/* Rate-limit the upcalls per cache entry. */
 			if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) {
 				struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 				struct igmpmsg *im;
 				int hlen = ip->ip_hl << 2;
 				struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT);
 
 				if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen))
 					mm = m_pullup(mm, hlen);
 				if (mm == NULL)
 					return ENOBUFS;
 
 				im = mtod(mm, struct igmpmsg *);
 				im->im_msgtype	= IGMPMSG_WRONGVIF;
 				im->im_mbz		= 0;
 				im->im_vif		= vifi;
 
 				MRTSTAT_INC(mrts_upcalls);
 
 				k_igmpsrc.sin_addr = im->im_src;
 				if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
 					CTR1(KTR_IPMF, "%s: socket queue full", __func__);
 					MRTSTAT_INC(mrts_upq_sockfull);
 					return ENOBUFS;
 				}
 			}
 		}
 		return 0;
 	}
 
 	/* If I sourced this packet, it counts as output, else it was input. */
 	mtx_lock_spin(&V_viftable[vifi].v_spin);
 	if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) {
 		V_viftable[vifi].v_pkt_out++;
 		V_viftable[vifi].v_bytes_out += plen;
 	} else {
 		V_viftable[vifi].v_pkt_in++;
 		V_viftable[vifi].v_bytes_in += plen;
 	}
 	mtx_unlock_spin(&V_viftable[vifi].v_spin);
 
 	rt->mfc_pkt_cnt++;
 	rt->mfc_byte_cnt += plen;
 
 	/*
 	 * For each vif, decide if a copy of the packet should be forwarded.
 	 * Forward if:
 	 *		- the ttl exceeds the vif's threshold
 	 *		- there are group members downstream on interface
 	 *
 	 * NOTE(review): these output counters are updated without v_spin,
 	 * unlike the ones just above -- confirm whether that is intended.
 	 */
 	for (vifi = 0; vifi < V_numvifs; vifi++)
 		if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
 			V_viftable[vifi].v_pkt_out++;
 			V_viftable[vifi].v_bytes_out += plen;
 			if (V_viftable[vifi].v_flags & VIFF_REGISTER)
 				pim_register_send(ip, V_viftable + vifi, m, rt);
 			else
 				phyint_send(ip, V_viftable + vifi, m);
 		}
 
 	/*
 	 * Perform upcall-related bw measuring.
 	 */
 	if ((rt->mfc_bw_meter_geq != NULL) || (rt->mfc_bw_meter_leq != NULL)) {
 		struct bw_meter *x;
 		struct timeval now;
 
 		microtime(&now);
 		/* Process meters for Greater-or-EQual case */
 		for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next)
 			bw_meter_geq_receive_packet(x, plen, &now);
 
 		/* Process meters for Lower-or-EQual case */
 		for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) {
 			/*
 			 * Record that a packet is received.
 			 * Spin lock has to be taken as callout context
 			 * (expire_bw_meter_leq) might modify these fields
 			 * as well
 			 */
 			mtx_lock_spin(&x->bm_spin);
 			x->bm_measured.b_packets++;
 			x->bm_measured.b_bytes += plen;
 			mtx_unlock_spin(&x->bm_spin);
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Check if a vif number is legal/ok. This is used by in_mcast.c.
  *
  * Returns 1 when 0 <= vif < V_numvifs, 0 otherwise.  The upper bound is
  * read under the mrouter read lock.
  */
 static int
 X_legal_vif_num(int vif)
 {
 	int legal;
 
 	/* Negative numbers are rejected without touching the lock. */
 	if (vif < 0)
 		return (0);
 
 	MRW_RLOCK();
 	legal = (vif < V_numvifs) ? 1 : 0;
 	MRW_RUNLOCK();
 
 	return (legal);
 }
 
 /*
  * Return the local address used by this vif, or INADDR_ANY when vifi is
  * negative or not below V_numvifs.  The vif table is read under the
  * mrouter read lock.
  */
 static u_long
 X_ip_mcast_src(int vifi)
 {
 	in_addr_t lcl;
 
 	if (vifi < 0)
 		return (INADDR_ANY);
 
 	lcl = INADDR_ANY;
 	MRW_RLOCK();
 	if (vifi < V_numvifs)
 		lcl = V_viftable[vifi].v_lcl_addr.s_addr;
 	MRW_RUNLOCK();
 
 	return (lcl);
 }
 
 /*
  * Send a copy of packet m out of vif vifp.  The original mbuf chain is
  * left to the caller; if the copy cannot be made the packet is silently
  * not sent.  The mrouter lock must be held (asserted).
  */
 static void
 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
 {
 	struct mbuf *copy;
 	int hlen;
 
 	MRW_LOCK_ASSERT();
 
 	hlen = ip->ip_hl << 2;
 
 	/*
 	 * Make a new reference to the packet; make sure that
 	 * the IP header is actually copied, not just referenced,
 	 * so that ip_output() only scribbles on the copy.
 	 */
 	copy = m_copypacket(m, M_NOWAIT);
 	if (copy != NULL && (!M_WRITABLE(copy) || copy->m_len < hlen))
 		copy = m_pullup(copy, hlen);
 
 	if (copy != NULL)
 		send_packet(vifp, copy);
 }
 
 /*
  * Hand mbuf m to ip_output() for transmission on the interface of vifp,
  * with the TTL taken from the packet's IP header minus one.  The result
  * of ip_output() is only traced.  The mrouter lock and the net epoch must
  * be held (asserted).
  */
 static void
 send_packet(struct vif *vifp, struct mbuf *m)
 {
 	struct ip_moptions imo;
 	int error __unused;
 
 	MRW_LOCK_ASSERT();
 	NET_EPOCH_ASSERT();
 
 	/* Multicast options describing the outgoing vif. */
 	STAILQ_INIT(&imo.imo_head);
 	imo.imo_multicast_vif = -1;
 	imo.imo_multicast_loop = !!in_mcast_loop;
 	imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
 	imo.imo_multicast_ifp = vifp->v_ifp;
 
 	/*
 	 * Re-entrancy should not be a problem here, because
 	 * the packets that we send out and are looped back at us
 	 * should get rejected because they appear to come from
 	 * the loopback interface, thus preventing looping.
 	 */
 	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
 	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
 	    (ptrdiff_t)(vifp - V_viftable), error);
 }
 
 /*
  * Stubs for old RSVP socket shim implementation.
  */
 
 /*
  * RSVP vif socket-option stub: both arguments are ignored and the request
  * is always refused with EOPNOTSUPP.
  */
 static int
 X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused)
 {
 	/* Nothing is implemented behind this entry point. */
 	return (EOPNOTSUPP);
 }
 
 /*
  * RSVP teardown stub: the socket argument is ignored and nothing is done.
  */
 static void
 X_ip_rsvp_force_done(struct socket *so __unused)
 {
 	/* Intentionally empty. */
 }
 
 /*
  * RSVP input stub.
  *
  * Takes ownership of the mbuf in *mp, clears *mp and reports the packet
  * as fully handled by returning IPPROTO_DONE.  offp and proto are unused.
  *
  * Nothing in this stub passes the packet on, so it is freed regardless of
  * V_rsvp_on.  Freeing it only when RSVP was off, as before, leaked the
  * mbuf whenever RSVP was on.
  */
 static int
 X_rsvp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m;
 
 	m = *mp;
 	*mp = NULL;
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Code for bandwidth monitors
  */
 
 /*
  * Define common interface for timeval-related methods.
  *
  * BW_TIMEVALCMP(tvp, uvp, cmp)	forwards to timevalcmp(); cmp is a
  *				comparison operator such as <, == or >.
  * BW_TIMEVALDECR(vvp, uvp)	forwards to timevalsub().
  * BW_TIMEVALADD(vvp, uvp)	forwards to timevaladd().
  */
 #define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
 #define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
 #define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))
 
 static uint32_t
 compute_bw_meter_flags(struct bw_upcall *req)
 {
     uint32_t flags = 0;
 
     if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
 	flags |= BW_METER_UNIT_PACKETS;
     if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
 	flags |= BW_METER_UNIT_BYTES;
     if (req->bu_flags & BW_UPCALL_GEQ)
 	flags |= BW_METER_GEQ;
     if (req->bu_flags & BW_UPCALL_LEQ)
 	flags |= BW_METER_LEQ;
 
     return flags;
 }
 
 /*
  * Periodic callout for a "<=" bandwidth meter; arg is the bw_meter.
  *
  * If the enabled unit (packets and/or bytes) measured in the elapsed
  * interval is at or below its threshold an upcall is prepared.  The
  * delivery task is then queued, the measurement window is restarted and
  * the callout is re-armed with the meter's threshold interval.
  */
 static void
 expire_bw_meter_leq(void *arg)
 {
 	struct bw_meter *x = arg;
 	struct timeval now;
 	int pkts_low, bytes_low;
 	/*
 	 * INFO:
 	 * callout is always executed with MRW_WLOCK taken
 	 */
 
 	CURVNET_SET((struct vnet *)x->arg);
 
 	microtime(&now);
 
 	/*
 	 * Test if we should deliver an upcall
 	 */
 	pkts_low = (x->bm_flags & BW_METER_UNIT_PACKETS) &&
 	    (x->bm_measured.b_packets <= x->bm_threshold.b_packets);
 	bytes_low = (x->bm_flags & BW_METER_UNIT_BYTES) &&
 	    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes);
 	if (pkts_low || bytes_low) {
 		/* Prepare an upcall for delivery */
 		bw_meter_prepare_upcall(x, &now);
 	}
 
 	/* Send all upcalls that are pending delivery */
 	taskqueue_enqueue(V_task_queue, &V_task);
 
 	/* Start a new measurement window. */
 	x->bm_start_time = now;
 	/*
 	 * Spin lock has to be taken as ip_forward context
 	 * might modify these fields as well
 	 */
 	mtx_lock_spin(&x->bm_spin);
 	x->bm_measured.b_bytes = 0;
 	x->bm_measured.b_packets = 0;
 	mtx_unlock_spin(&x->bm_spin);
 
 	callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time));
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Add a bw_meter entry to the mfc matching (bu_src, bu_dst).
  *
  * Returns:
  *   EOPNOTSUPP     bandwidth upcalls are not enabled in V_mrt_api_config
  *   EINVAL         no unit flag, no or both comparison flags, or a
  *                  threshold interval below the allowed minimum
  *   EADDRNOTAVAIL  no matching mfc entry
  *   ENOBUFS        the meter could not be allocated
  *   0              meter installed, or an identical one already existed
  */
 static int
 add_bw_upcall(struct bw_upcall *req)
 {
 	struct mfc *mfc;
 	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
 	    BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
 	struct timeval now;
 	struct bw_meter *x, **bwm_ptr;
 	uint32_t flags;
 
 	if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 		return EOPNOTSUPP;
 
 	/* Test if the flags are valid */
 	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
 		return EINVAL;
 	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
 		return EINVAL;
 	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
 		return EINVAL;
 
 	/* Test if the threshold time interval is valid */
 	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
 		return EINVAL;
 
 	flags = compute_bw_meter_flags(req);
 
 	/*
 	 * Find if we have already same bw_meter entry
 	 */
 	MRW_WLOCK();
 	mfc = mfc_find(&req->bu_src, &req->bu_dst);
 	if (mfc == NULL) {
 		MRW_WUNLOCK();
 		return EADDRNOTAVAIL;
 	}
 
 	/* Choose an appropriate bw_meter list */
 	if (req->bu_flags & BW_UPCALL_GEQ)
 		bwm_ptr = &mfc->mfc_bw_meter_geq;
 	else
 		bwm_ptr = &mfc->mfc_bw_meter_leq;
 
 	for (x = *bwm_ptr; x != NULL; x = x->bm_mfc_next) {
 		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 		    &req->bu_threshold.b_time, ==))
 		    && (x->bm_threshold.b_packets
 		    == req->bu_threshold.b_packets)
 		    && (x->bm_threshold.b_bytes
 		    == req->bu_threshold.b_bytes)
 		    && (x->bm_flags & BW_METER_USER_FLAGS)
 		    == flags) {
 			MRW_WUNLOCK();
 			return 0; /* XXX Already installed */
 		}
 	}
 
 	/* Allocate the new bw_meter entry */
 	x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER,
 	    M_ZERO | M_NOWAIT);
 	if (x == NULL) {
 		MRW_WUNLOCK();
 		return ENOBUFS;
 	}
 
 	/* Set the new bw_meter entry */
 	x->bm_threshold.b_time = req->bu_threshold.b_time;
 	microtime(&now);
 	x->bm_start_time = now;
 	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
 	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
 	x->bm_measured.b_packets = 0;
 	x->bm_measured.b_bytes = 0;
 	x->bm_flags = flags;
 	x->bm_time_next = NULL;
 	x->bm_mfc = mfc;
 	x->arg = curvnet;
 	/* Bounded formatting: never write past the name buffer. */
 	snprintf(x->bm_spin_name, sizeof(x->bm_spin_name), "BM spin %p", x);
 	mtx_init(&x->bm_spin, x->bm_spin_name, NULL, MTX_SPIN);
 
 	/* For LEQ case create periodic callout */
 	if (req->bu_flags & BW_UPCALL_LEQ) {
 		callout_init_rw(&x->bm_meter_callout, &mrouter_mtx, CALLOUT_SHAREDLOCK);
 		callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time),
 		    expire_bw_meter_leq, x);
 	}
 
 	/* Add the new bw_meter entry to the front of entries for this MFC */
 	x->bm_mfc_next = *bwm_ptr;
 	*bwm_ptr = x;
 
 	MRW_WUNLOCK();
 
 	return 0;
 }
 
 /*
  * Free every bw_meter on a singly linked (bm_mfc_next) list.
  *
  * For "<=" meters the periodic callout is drained first.  The per-meter
  * spin mutex is destroyed for every meter, because add_bw_upcall()
  * initializes it for every meter it creates, not just for "<=" ones.
  *
  * NOTE(review): callout_drain() runs with the mrouter lock held by the
  * caller -- confirm this cannot block against the callout's own lock.
  */
 static void
 free_bw_list(struct bw_meter *list)
 {
 	struct bw_meter *x;
 
 	/* MRW_WLOCK must be held here */
 	while (list != NULL) {
 		x = list;
 		list = x->bm_mfc_next;
 
 		if (x->bm_flags & BW_METER_LEQ)
 			callout_drain(&x->bm_meter_callout);
 		mtx_destroy(&x->bm_spin);
 
 		free(x, M_BWMETER);
 	}
 }
 
 /*
  * Delete one or multiple bw_meter entries
  */
 static int
 del_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
     struct bw_meter *x, **bwm_ptr;
 
     if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
 
     MRW_WLOCK();
 
     /* Find the corresponding MFC entry */
     mfc = mfc_find(&req->bu_src, &req->bu_dst);
     if (mfc == NULL) {
 	MRW_WUNLOCK();
 	return EADDRNOTAVAIL;
     } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
 	/*
 	 * Delete all bw_meter entries for this mfc
 	 */
 	struct bw_meter *list;
 
 	/* Free LEQ list */
 	list = mfc->mfc_bw_meter_leq;
 	mfc->mfc_bw_meter_leq = NULL;
 	free_bw_list(list);
 
 	/* Free GEQ list */
 	list = mfc->mfc_bw_meter_geq;
 	mfc->mfc_bw_meter_geq = NULL;
 	free_bw_list(list);
 	MRW_WUNLOCK();
 	return 0;
     } else {			/* Delete a single bw_meter entry */
 	struct bw_meter *prev;
 	uint32_t flags = 0;
 
 	flags = compute_bw_meter_flags(req);
 
 	/* Choose an appropriate bw_meter list */
 	if (req->bu_flags & BW_UPCALL_GEQ)
 		bwm_ptr = &mfc->mfc_bw_meter_geq;
 	else
 		bwm_ptr = &mfc->mfc_bw_meter_leq;
 
 	/* Find the bw_meter entry to delete */
 	for (prev = NULL, x = *bwm_ptr; x != NULL;
 	     prev = x, x = x->bm_mfc_next) {
 	    if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			       &req->bu_threshold.b_time, ==)) &&
 		(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
 		(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
 		(x->bm_flags & BW_METER_USER_FLAGS) == flags)
 		break;
 	}
 	if (x != NULL) { /* Delete entry from the list for this MFC */
 	    if (prev != NULL)
 		prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
 	    else
 		*bwm_ptr = x->bm_mfc_next;/* new head of list */
 
 	    if (req->bu_flags & BW_UPCALL_LEQ)
 		    callout_stop(&x->bm_meter_callout);
 
 	    MRW_WUNLOCK();
 	    /* Free the bw_meter entry */
 	    free(x, M_BWMETER);
 	    return 0;
 	} else {
 	    MRW_WUNLOCK();
 	    return EINVAL;
 	}
     }
     /* NOTREACHED */
 }
 
 /*
  * Perform bandwidth measurement processing that may result in an upcall
  */
 static void
 bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 {
 	struct timeval delta;
 
 	MRW_LOCK_ASSERT();
 
 	delta = *nowp;
 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
 	/*
 	 * Processing for ">=" type of bw_meter entry.
 	 * bm_spin does not have to be hold here as in GEQ
 	 * case this is the only context accessing bm_measured.
 	 */
 	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
 	    /* Reset the bw_meter entry */
 	    x->bm_start_time = *nowp;
 	    x->bm_measured.b_packets = 0;
 	    x->bm_measured.b_bytes = 0;
 	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
 
 	/* Record that a packet is received */
 	x->bm_measured.b_packets++;
 	x->bm_measured.b_bytes += plen;
 
 	/*
 	 * Test if we should deliver an upcall
 	 */
 	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
 		if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
 		    (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
 		    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
 		    (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
 			/* Prepare an upcall for delivery */
 			bw_meter_prepare_upcall(x, nowp);
 			x->bm_flags |= BW_METER_UPCALL_DELIVERED;
 		}
 	}
 }
 
 /*
  * Prepare a bandwidth-related upcall.
  *
  * Builds a bw_upcall record describing meter x (flow, threshold, measured
  * values, and the time elapsed since bm_start_time as of *nowp) and queues
  * it on V_bw_upcalls_ring.  If the record cannot be allocated or queued a
  * warning is logged and the upcall is dropped.  When the ring is more than
  * half full the delivery task is scheduled.
  * The mrouter lock must be held (asserted).
  */
 static void
 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 {
 	struct timeval delta;
 	struct bw_upcall *u;
 
 	MRW_LOCK_ASSERT();
 
 	/*
 	 * Compute the measured time interval
 	 */
 	delta = *nowp;
 	BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
 	/*
 	 * Set the bw_upcall entry
 	 */
 	u = malloc(sizeof(struct bw_upcall), M_MRTABLE, M_NOWAIT | M_ZERO);
 	if (!u) {
 		log(LOG_WARNING, "bw_meter_prepare_upcall: cannot allocate entry\n");
 		return;
 	}
 	u->bu_src = x->bm_mfc->mfc_origin;
 	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
 	u->bu_threshold.b_time = x->bm_threshold.b_time;
 	u->bu_threshold.b_packets = x->bm_threshold.b_packets;
 	u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
 	u->bu_measured.b_time = delta;
 	u->bu_measured.b_packets = x->bm_measured.b_packets;
 	u->bu_measured.b_bytes = x->bm_measured.b_bytes;
 	u->bu_flags = 0;
 	if (x->bm_flags & BW_METER_UNIT_PACKETS)
 		u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
 	if (x->bm_flags & BW_METER_UNIT_BYTES)
 		u->bu_flags |= BW_UPCALL_UNIT_BYTES;
 	if (x->bm_flags & BW_METER_GEQ)
 		u->bu_flags |= BW_UPCALL_GEQ;
 	if (x->bm_flags & BW_METER_LEQ)
 		u->bu_flags |= BW_UPCALL_LEQ;
 
 	if (buf_ring_enqueue(V_bw_upcalls_ring, u)) {
 		log(LOG_WARNING, "bw_meter_prepare_upcall: cannot enqueue upcall\n");
 		/* The ring did not take ownership, so release the record. */
 		free(u, M_MRTABLE);
 	}
 	if (buf_ring_count(V_bw_upcalls_ring) > (BW_UPCALLS_MAX / 2)) {
 		taskqueue_enqueue(V_task_queue, &V_task);
 	}
 }
 /*
  * Send the pending bandwidth-related upcalls
  */
 static void
 bw_upcalls_send(void)
 {
     struct mbuf *m;
     int len = 0;
     struct bw_upcall *bu;
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
     static struct igmpmsg igmpmsg = { 0,		/* unused1 */
 				      0,		/* unused2 */
 				      IGMPMSG_BW_UPCALL,/* im_msgtype */
 				      0,		/* im_mbz  */
 				      0,		/* im_vif  */
 				      0,		/* unused3 */
 				      { 0 },		/* im_src  */
 				      { 0 } };		/* im_dst  */
 
     MRW_LOCK_ASSERT();
 
     if (buf_ring_empty(V_bw_upcalls_ring))
 	return;
 
     /*
      * Allocate a new mbuf, initialize it with the header and
      * the payload for the pending calls.
      */
     m = m_gethdr(M_NOWAIT, MT_DATA);
     if (m == NULL) {
 	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
 	return;
     }
 
     m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
     len += sizeof(struct igmpmsg);
     while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) {
 	m_copyback(m, len, sizeof(struct bw_upcall), (caddr_t)bu);
 	len += sizeof(struct bw_upcall);
 	free(bu, M_MRTABLE);
     }
 
     /*
      * Send the upcalls
      * XXX do we need to set the address in k_igmpsrc ?
      */
     MRTSTAT_INC(mrts_upcalls);
     if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
 	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
 	MRTSTAT_INC(mrts_upq_sockfull);
     }
 }
 
 /*
  * A periodic function for sending all upcalls that are pending delivery.
  *
  * Callout handler; arg is the vnet to operate in.  Flushes the pending
  * upcalls and re-arms itself BW_UPCALLS_PERIOD ticks from now.
  */
 static void
 expire_bw_upcalls_send(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 
 	/* This callout is run with MRW_RLOCK taken */
 	bw_upcalls_send();
 
 	/* Schedule the next flush. */
 	callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD,
 	    expire_bw_upcalls_send, curvnet);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * End of bandwidth monitoring code
  */
 
 /*
  * Send the packet up to the user daemon, or eventually do kernel
  * encapsulation.
  *
  * ip/m   the data packet (left untouched; a prepared copy is sent)
  * vifp   the register vif
  * rt     forwarding cache entry; may be NULL, because X_ip_mforward()
  *        reaches this function through ip_mdq(m, ifp, NULL, vifi).  A NULL
  *        entry is treated like one whose rendezvous point is unspecified.
  *
  * Returns 0 on success or when the notification is squelched, ENOBUFS if
  * the packet copy cannot be prepared.  Errors from sending individual
  * fragments are not reported.
  */
 static int
 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
     struct mfc *rt)
 {
 	struct mbuf *mb_copy, *mm;
 
 	/*
 	 * Do not send IGMP_WHOLEPKT notifications to userland, if the
 	 * rendezvous point was unspecified, and we were told not to.
 	 */
 	if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) &&
 	    (rt == NULL || in_nullhost(rt->mfc_rp)))
 		return 0;
 
 	mb_copy = pim_register_prepare(ip, m);
 	if (mb_copy == NULL)
 		return ENOBUFS;
 
 	/*
 	 * Send all the fragments. Note that the mbuf for each fragment
 	 * is freed by the sending machinery.
 	 */
 	for (mm = mb_copy; mm; mm = mb_copy) {
 		/* Detach this fragment from the m_nextpkt chain. */
 		mb_copy = mm->m_nextpkt;
 		mm->m_nextpkt = 0;
 		mm = m_pullup(mm, sizeof(struct ip));
 		if (mm != NULL) {
 			ip = mtod(mm, struct ip *);
 			/* Encapsulate towards the RP only when one is known. */
 			if ((V_mrt_api_config & MRT_MFC_RP) && rt != NULL &&
 			    !in_nullhost(rt->mfc_rp)) {
 				pim_register_send_rp(ip, vifp, mm, rt);
 			} else {
 				pim_register_send_upcall(ip, vifp, mm, rt);
 			}
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Return a copy of the data packet that is ready for PIM Register
  * encapsulation.
  * XXX: Note that in the returned copy the IP header is a valid one.
  *
  * The copy has its TTL decremented.  If it still fits once the PIM
  * Register headers are added, its IP checksum is recomputed; otherwise it
  * is fragmented.  Returns NULL if copying or fragmenting fails.
  */
 static struct mbuf *
 pim_register_prepare(struct ip *ip, struct mbuf *m)
 {
 	struct mbuf *copy;
 	int mtu;
 
 	/* Take care of delayed checksums */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 
 	/*
 	 * Copy the old packet & pullup its IP header into the
 	 * new mbuf so we can modify it.
 	 */
 	copy = m_copypacket(m, M_NOWAIT);
 	if (copy != NULL)
 		copy = m_pullup(copy, ip->ip_hl << 2);
 	if (copy == NULL)
 		return NULL;
 
 	/* take care of the TTL */
 	ip = mtod(copy, struct ip *);
 	--ip->ip_ttl;
 
 	/* Compute the MTU after the PIM Register encapsulation */
 	mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);
 
 	if (ntohs(ip->ip_len) > mtu) {
 		/* Fragment the packet */
 		copy->m_pkthdr.csum_flags |= CSUM_IP;
 		if (ip_fragment(ip, &copy, mtu, 0) != 0) {
 			m_freem(copy);
 			return NULL;
 		}
 		return copy;
 	}
 
 	/* Turn the IP header into a valid one */
 	ip->ip_sum = 0;
 	ip->ip_sum = in_cksum(copy, ip->ip_hl << 2);
 	return copy;
 }
 
 /*
  * Send an upcall with the data packet to the user-level process.
  */
 static int
 pim_register_send_upcall(struct ip *ip, struct vif *vifp,
     struct mbuf *mb_copy, struct mfc *rt)
 {
     struct mbuf *mb_first;
     int len = ntohs(ip->ip_len);
     struct igmpmsg *im;
     struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
 
     MRW_LOCK_ASSERT();
 
     /*
      * Add a new mbuf with an upcall header
      */
     mb_first = m_gethdr(M_NOWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     mb_first->m_data += max_linkhdr;
     mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
     mb_first->m_len = sizeof(struct igmpmsg);
     mb_first->m_next = mb_copy;
 
     /* Send message to routing daemon */
     im = mtod(mb_first, struct igmpmsg *);
     im->im_msgtype	= IGMPMSG_WHOLEPKT;
     im->im_mbz		= 0;
     im->im_vif		= vifp - V_viftable;
     im->im_src		= ip->ip_src;
     im->im_dst		= ip->ip_dst;
 
     k_igmpsrc.sin_addr	= ip->ip_src;
 
     MRTSTAT_INC(mrts_upcalls);
 
     if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
 	CTR1(KTR_IPMF, "%s: socket queue full", __func__);
 	MRTSTAT_INC(mrts_upq_sockfull);
 	return ENOBUFS;
     }
 
     /* Keep statistics */
     PIMSTAT_INC(pims_snd_registers_msgs);
     PIMSTAT_ADD(pims_snd_registers_bytes, len);
 
     return 0;
 }
 
 /*
  * Encapsulate the data packet in PIM Register message and send it to the RP.
  *
  * A new header mbuf holding the outer IP header and the PIM Register header
  * (copied from the pim_encap_iphdr / pim_encap_pimhdr templates) is placed
  * in front of mb_copy and the result is passed to send_packet().
  *
  * ip      - inner IP header; supplies length, TOS and the DF bit.
  * vifp    - vif handed to send_packet().
  * mb_copy - inner packet chain.  Freed here on either error return.
  * rt      - forwarding entry; mfc_parent selects the vif whose local address
  *           becomes the outer source, mfc_rp is the outer destination.
  *
  * Returns 0 after calling send_packet(), EADDRNOTAVAIL if the parent vif is
  * out of range or has a null local address, ENOBUFS if no header mbuf could
  * be allocated.
  */
 static int
 pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
     struct mfc *rt)
 {
     struct mbuf *mb_first;
     struct ip *ip_outer;
     struct pim_encap_pimhdr *pimhdr;
     int len = ntohs(ip->ip_len);
     vifi_t vifi = rt->mfc_parent;
 
     MRW_LOCK_ASSERT();
 
     if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) {
 	m_freem(mb_copy);
 	return EADDRNOTAVAIL;		/* The iif vif is invalid */
     }
 
     /*
      * Add a new mbuf with the encapsulating header
      */
     mb_first = m_gethdr(M_NOWAIT, MT_DATA);
     if (mb_first == NULL) {
 	m_freem(mb_copy);
 	return ENOBUFS;
     }
     /* Leave max_linkhdr bytes of headroom in front of the outer header. */
     mb_first->m_data += max_linkhdr;
     mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
     mb_first->m_next = mb_copy;
 
     mb_first->m_pkthdr.len = len + mb_first->m_len;
 
     /*
      * Fill in the encapsulating IP and PIM header
      */
     ip_outer = mtod(mb_first, struct ip *);
     *ip_outer = pim_encap_iphdr;
     ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
 	sizeof(pim_encap_pimhdr));
     ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
     ip_outer->ip_dst = rt->mfc_rp;
     /*
      * Copy the inner header TOS to the outer header, and take care of the
      * IP_DF bit.
      */
     ip_outer->ip_tos = ip->ip_tos;
     if (ip->ip_off & htons(IP_DF))
 	ip_outer->ip_off |= htons(IP_DF);
     ip_fillid(ip_outer);
     /* The PIM header sits directly behind the outer IP header. */
     pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
 					 + sizeof(pim_encap_iphdr));
     *pimhdr = pim_encap_pimhdr;
     /* If the iif crosses a border, set the Border-bit */
     if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config)
 	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);
 
     /*
      * Temporarily advance m_data past the outer IP header so in_cksum()
      * starts at the PIM header; only sizeof(pim_encap_pimhdr) bytes are
      * summed, i.e. the encapsulated packet is not covered.
      */
     mb_first->m_data += sizeof(pim_encap_iphdr);
     pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
     mb_first->m_data -= sizeof(pim_encap_iphdr);
 
     send_packet(vifp, mb_first);
 
     /* Keep statistics */
     PIMSTAT_INC(pims_snd_registers_msgs);
     PIMSTAT_ADD(pims_snd_registers_bytes, len);
 
     return 0;
 }
 
 /*
  * pim_encapcheck() is called by the encap4_input() path at runtime to
  * determine if a packet is for PIM; allowing PIM to be dynamically loaded
  * into the kernel.
  *
  * None of the arguments influence the result: the function always returns
  * 8.  proto is only looked at by the KASSERT below, which is why it is
  * still tagged __unused.
  * NOTE(review): 8 is presumably the match weight reported to the encap
  * framework -- confirm against the ip_encap documentation.
  */
 static int
 pim_encapcheck(const struct mbuf *m __unused, int off __unused,
     int proto __unused, void *arg __unused)
 {
 
     KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
     return (8);		/* claim the datagram. */
 }
 
 /*
  * PIM-SMv2 and PIM-DM messages processing.
  * Receives and verifies the PIM control messages, and passes them
  * up to the listening socket, using rip_input().
  * The only message with special processing is the PIM_REGISTER message
  * (used by PIM-SM): the PIM header is stripped off, and the inner packet
  * is passed to if_simloop().
  *
  * m     - received packet, starting at the outer IP header.
  * off   - length of the outer IP header (used as iphlen).
  * proto - passed through unchanged to rip_input().
  * arg   - unused.
  *
  * Every drop path frees the mbuf (or has lost it to a failed m_pullup())
  * and returns IPPROTO_DONE; otherwise the return value is whatever
  * rip_input() returns.
  */
 static int
 pim_input(struct mbuf *m, int off, int proto, void *arg __unused)
 {
     struct ip *ip = mtod(m, struct ip *);
     struct pim *pim;
     int iphlen = off;
     int minlen;
     /* Bytes following the outer IP header, per the header's own ip_len. */
     int datalen = ntohs(ip->ip_len) - iphlen;
     int ip_tos;
 
     /* Keep statistics */
     PIMSTAT_INC(pims_rcv_total_msgs);
     PIMSTAT_ADD(pims_rcv_total_bytes, datalen);
 
     /*
      * Validate lengths
      */
     if (datalen < PIM_MINLEN) {
 	PIMSTAT_INC(pims_rcv_tooshort);
 	CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x",
 	    __func__, datalen, ntohl(ip->ip_src.s_addr));
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /*
      * If the packet is at least as big as a REGISTER, go ahead
      * and grab the PIM REGISTER header size, to avoid another
      * possible m_pullup() later.
      *
      * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
      * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
      */
     minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
     /*
      * Get the IP and PIM headers in contiguous memory, and
      * possibly the PIM REGISTER header.
      */
     if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) {
 	CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
 	return (IPPROTO_DONE);
     }
 
     /* m_pullup() may have given us a new mbuf so reset ip. */
     ip = mtod(m, struct ip *);
     /* Saved now; compared against the inner header's TOS further down. */
     ip_tos = ip->ip_tos;
 
     /* adjust mbuf to point to the PIM header */
     m->m_data += iphlen;
     m->m_len  -= iphlen;
     pim = mtod(m, struct pim *);
 
     /*
      * Validate checksum. If PIM REGISTER, exclude the data packet.
      *
      * XXX: some older PIMv2 implementations don't make this distinction,
      * so for compatibility reason perform the checksum over part of the
      * message, and if error, then over the whole message.
      */
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
 	/* do nothing, checksum okay */
     } else if (in_cksum(m, datalen)) {
 	PIMSTAT_INC(pims_rcv_badsum);
 	CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /* PIM version check */
     if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
 	PIMSTAT_INC(pims_rcv_badversion);
 	CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
 	    (int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
 	m_freem(m);
 	return (IPPROTO_DONE);
     }
 
     /* restore mbuf back to the outer IP */
     m->m_data -= iphlen;
     m->m_len  += iphlen;
 
     if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
 	/*
 	 * Since this is a REGISTER, we'll make a copy of the register
 	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
 	 * routing daemon.
 	 */
 	struct sockaddr_in dst = { sizeof(dst), AF_INET };
 	struct mbuf *mcp;
 	struct ip *encap_ip;
 	u_int32_t *reghdr;
 	struct ifnet *vifp;
 
 	/* The register vif's ifnet is looked up under the read lock. */
 	MRW_RLOCK();
 	if ((V_reg_vif_num >= V_numvifs) || (V_reg_vif_num == VIFI_INVALID)) {
 	    MRW_RUNLOCK();
 	    CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__,
 		(int)V_reg_vif_num);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 	/* XXX need refcnt? */
 	vifp = V_viftable[V_reg_vif_num].v_ifp;
 	MRW_RUNLOCK();
 
 	/*
 	 * Validate length
 	 */
 	if (datalen < PIM_REG_MINLEN) {
 	    PIMSTAT_INC(pims_rcv_tooshort);
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR1(KTR_IPMF, "%s: register packet size too small", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* Register header follows the PIM header; inner IP follows that. */
 	reghdr = (u_int32_t *)(pim + 1);
 	encap_ip = (struct ip *)(reghdr + 1);
 
 	CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d",
 	    __func__, ntohl(encap_ip->ip_src.s_addr),
 	    ntohs(encap_ip->ip_len));
 
 	/* verify the version number of the inner packet */
 	if (encap_ip->ip_v != IPVERSION) {
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* verify the inner packet is destined to a mcast group */
 	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
 	    PIMSTAT_INC(pims_rcv_badregisters);
 	    CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x", __func__,
 		ntohl(encap_ip->ip_dst.s_addr));
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* If a NULL_REGISTER, pass it to the daemon */
 	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 	    goto pim_input_to_daemon;
 
 	/*
 	 * Copy the TOS from the outer IP header to the inner IP header.
 	 */
 	if (encap_ip->ip_tos != ip_tos) {
 	    /* Outer TOS -> inner TOS */
 	    encap_ip->ip_tos = ip_tos;
 	    /* Recompute the inner header checksum. Sigh... */
 
 	    /* adjust mbuf to point to the inner IP header */
 	    m->m_data += (iphlen + PIM_MINLEN);
 	    m->m_len  -= (iphlen + PIM_MINLEN);
 
 	    encap_ip->ip_sum = 0;
 	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);
 
 	    /* restore mbuf to point back to the outer IP header */
 	    m->m_data -= (iphlen + PIM_MINLEN);
 	    m->m_len  += (iphlen + PIM_MINLEN);
 	}
 
 	/*
 	 * Decapsulate the inner IP packet and loopback to forward it
 	 * as a normal multicast packet. Also, make a copy of the
 	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
 	 * to pass to the daemon later, so it can take the appropriate
 	 * actions (e.g., send back PIM_REGISTER_STOP).
 	 * XXX: here m->m_data points to the outer IP header.
 	 */
 	mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT);
 	if (mcp == NULL) {
 	    CTR1(KTR_IPMF, "%s: m_copym() failed", __func__);
 	    m_freem(m);
 	    return (IPPROTO_DONE);
 	}
 
 	/* Keep statistics */
 	/* XXX: registers_bytes include only the encap. mcast pkt */
 	PIMSTAT_INC(pims_rcv_registers_msgs);
 	PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len));
 
 	/*
 	 * forward the inner ip packet; point m_data at the inner ip.
 	 */
 	m_adj(m, iphlen + PIM_MINLEN);
 
 	/* NOTE(review): V_reg_vif_num is read here without the lock held. */
 	CTR4(KTR_IPMF,
 	    "%s: forward decap'd REGISTER: src %lx dst %lx vif %d",
 	    __func__,
 	    (u_long)ntohl(encap_ip->ip_src.s_addr),
 	    (u_long)ntohl(encap_ip->ip_dst.s_addr),
 	    (int)V_reg_vif_num);
 
 	/* NB: vifp was collected above; can it change on us? */
 	if_simloop(vifp, m, dst.sin_family, 0);
 
 	/* prepare the register head to send to the mrouting daemon */
 	m = mcp;
     }
 
 pim_input_to_daemon:
     /*
      * Pass the PIM message up to the daemon; if it is a Register message,
      * pass the 'head' only up to the daemon. This includes the
      * outer IP header, PIM header, PIM-Register header and the
      * inner IP header.
      * XXX: the outer IP header pkt size of a Register is not adjust to
      * reflect the fact that the inner multicast data is truncated.
      */
     return (rip_input(&m, &off, proto));
 }
 
 static int
 sysctl_mfctable(SYSCTL_HANDLER_ARGS)
 {
 	struct mfc	*rt;
 	int		 error, i;
 
 	if (req->newptr)
 		return (EPERM);
 	if (V_mfchashtbl == NULL)	/* XXX unlocked */
 		return (0);
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 
 	MRW_RLOCK();
 	for (i = 0; i < mfchashsize; i++) {
 		LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) {
 			error = SYSCTL_OUT(req, rt, sizeof(struct mfc));
 			if (error)
 				goto out_locked;
 		}
 	}
 out_locked:
 	MRW_RUNLOCK();
 	return (error);
 }
 
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mfctable,
     "IPv4 Multicast Forwarding Table "
     "(struct *mfc[mfchashsize], netinet/ip_mroute.h)");
 
 static int
 sysctl_viflist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	if (req->newptr)
 		return (EPERM);
 	if (V_viftable == NULL)		/* XXX unlocked */
 		return (0);
 	error = sysctl_wire_old_buffer(req, MROUTE_VIF_SYSCTL_LEN * MAXVIFS);
 	if (error)
 		return (error);
 
 	MRW_RLOCK();
 	/* Copy out user-visible portion of vif entry. */
 	for (i = 0; i < MAXVIFS; i++) {
 		error = SYSCTL_OUT(req, &V_viftable[i], MROUTE_VIF_SYSCTL_LEN);
 		if (error)
 			break;
 	}
 	MRW_RUNLOCK();
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, viftable,
     CTLTYPE_OPAQUE | CTLFLAG_VNET | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_viflist, "S,vif[MAXVIFS]",
     "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
 
 /*
  * Per-vnet initialisation, run via VNET_SYSINIT: allocates the V_nexpire
  * and V_viftable arrays, initialises the two upcall callouts against
  * mrouter_mtx, and creates and starts the ip_mroute task queue.
  *
  * The function cannot report failure, so every allocation sleeps
  * (M_WAITOK) rather than risk returning NULL.
  */
 static void
 vnet_mroute_init(const void *unused __unused)
 {
 
 	V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);
 
 	V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable),
 	    M_MRTABLE, M_WAITOK|M_ZERO);
 
 	callout_init_rw(&V_expire_upcalls_ch, &mrouter_mtx, 0);
 	callout_init_rw(&V_bw_upcalls_ch, &mrouter_mtx, 0);
 
 	/*
 	 * Prepare taskqueue.  Use M_WAITOK like the allocations above: the
 	 * result is handed straight to taskqueue_start_threads() without a
 	 * NULL check, so a failed M_NOWAIT allocation would not be caught.
 	 */
 	V_task_queue = taskqueue_create_fast("ip_mroute_tskq", M_WAITOK,
 		    taskqueue_thread_enqueue, &V_task_queue);
 	taskqueue_start_threads(&V_task_queue, 1, PI_NET, "ip_mroute_tskq task");
 }
 
 VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
 	NULL);
 
 /*
  * Per-vnet teardown, run via VNET_SYSUNINIT: releases the task queue and
  * the two arrays allocated by vnet_mroute_init().  Both array pointers are
  * cleared after being freed so that a later NULL check (such as the one in
  * sysctl_viflist()) does not see a stale address.
  */
 static void
 vnet_mroute_uninit(const void *unused __unused)
 {
 
 	/* Taskqueue should be cancelled and drained before freeing */
 	taskqueue_free(V_task_queue);
 
 	free(V_viftable, M_MRTABLE);
 	V_viftable = NULL;
 	free(V_nexpire, M_MRTABLE);
 	V_nexpire = NULL;
 }
 
 VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE,
 	vnet_mroute_uninit, NULL);
 
 /*
  * Module event handler for ip_mroute.
  *
  * MOD_LOAD initialises the multicast routing lock, registers the
  * ifnet_departure_event handler, fetches the two loader tunables, attaches
  * the PIM encapsulation handler and finally publishes the X_ entry points
  * through the kernel's multicast routing function pointers.  Each failure
  * path undoes what was set up before it.
  *
  * MOD_UNLOAD refuses with EINVAL while ip_mrouter_cnt is non-zero;
  * otherwise it marks the module as unloading, removes the event handler
  * and the encapsulation handler, clears the function pointers and destroys
  * the lock.
  *
  * mod and unused are not referenced.
  *
  * Returns 0 on success, EINVAL on the failures described above, and
  * EOPNOTSUPP for any other event type.
  */
 static int
 ip_mroute_modevent(module_t mod, int type, void *unused)
 {
 
     switch (type) {
     case MOD_LOAD:
 	MRW_LOCK_INIT();
 
 	if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
 	if (if_detach_event_tag == NULL) {
 		printf("ip_mroute: unable to register "
 		    "ifnet_departure_event handler\n");
 		MRW_LOCK_DESTROY();
 		return (EINVAL);
 	}
 
 	/* A tunable that is set but not a power of two is ignored. */
 	mfchashsize = MFCHASHSIZE;
 	if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) &&
 	    !powerof2(mfchashsize)) {
 		printf("WARNING: %s not a power of 2; using default\n",
 		    "net.inet.ip.mfchashsize");
 		mfchashsize = MFCHASHSIZE;
 	}
 
 	pim_squelch_wholepkt = 0;
 	TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt",
 	    &pim_squelch_wholepkt);
 
 	pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK);
 	if (pim_encap_cookie == NULL) {
 		printf("ip_mroute: unable to attach pim encap\n");
 		/*
 		 * The departure handler was registered above; remove it
 		 * before bailing out so it does not stay registered for a
 		 * module that failed to load.
 		 */
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    if_detach_event_tag);
 		MRW_LOCK_DESTROY();
 		return (EINVAL);
 	}
 
 	/* Publish the entry points only once everything above succeeded. */
 	ip_mcast_src = X_ip_mcast_src;
 	ip_mforward = X_ip_mforward;
 	ip_mrouter_done = X_ip_mrouter_done;
 	ip_mrouter_get = X_ip_mrouter_get;
 	ip_mrouter_set = X_ip_mrouter_set;
 
 	ip_rsvp_force_done = X_ip_rsvp_force_done;
 	ip_rsvp_vif = X_ip_rsvp_vif;
 
 	legal_vif_num = X_legal_vif_num;
 	mrt_ioctl = X_mrt_ioctl;
 	rsvp_input_p = X_rsvp_input;
 	break;
 
     case MOD_UNLOAD:
 	/*
 	 * Typically module unload happens after the user-level
 	 * process has shutdown the kernel services (the check
 	 * below ensures someone can't just yank the module out
 	 * from under a running process).  But if the module is
 	 * just loaded and then unloaded w/o starting up a user
 	 * process we still need to cleanup.
 	 */
 	MRW_WLOCK();
 	if (ip_mrouter_cnt != 0) {
 	    MRW_WUNLOCK();
 	    return (EINVAL);
 	}
 	ip_mrouter_unloading = 1;
 	MRW_WUNLOCK();
 
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);
 
 	if (pim_encap_cookie) {
 	    ip_encap_detach(pim_encap_cookie);
 	    pim_encap_cookie = NULL;
 	}
 
 	ip_mcast_src = NULL;
 	ip_mforward = NULL;
 	ip_mrouter_done = NULL;
 	ip_mrouter_get = NULL;
 	ip_mrouter_set = NULL;
 
 	ip_rsvp_force_done = NULL;
 	ip_rsvp_vif = NULL;
 
 	legal_vif_num = NULL;
 	mrt_ioctl = NULL;
 	rsvp_input_p = NULL;
 
 	MRW_LOCK_DESTROY();
 	break;
 
     default:
 	return EOPNOTSUPP;
     }
     return 0;
 }
 
 /*
  * Module description handed to DECLARE_MODULE below: the module name, its
  * event handler, and 0 for the remaining field.
  */
 static moduledata_t ip_mroutemod = {
     "ip_mroute",
     ip_mroute_modevent,
     0
 };
 
 /* Same subsystem and order as the VNET_SYSUNINIT registration above. */
 DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index 375db580296e..739138a6f791 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -1,1595 +1,1596 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/ethernet.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef MBUF_STRESS_TEST
 /* Only built with MBUF_STRESS_TEST; exported read-write via sysctl. */
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 /* Forward declaration; called from ip_output()'s multicast path. */
 static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);
 
 /* Consulted by ip_output() when no multicast options are supplied. */
 extern int in_mcast_loop;
 
 /*
  * Run the outbound packet filter hooks on *mp and react to what they did.
  *
  * mp     - in/out: packet; re-read after the hooks ran since they may have
  *          replaced the mbuf.
  * ifp    - outgoing interface passed to the hooks.
  * flags  - not referenced in this function.
  * inp    - passed through to the hooks.
  * dst    - rewritten when the hooks changed the destination or attached a
  *          forwarding tag.
  * fibnum - in/out: updated when the hooks changed the packet's FIB.
  * error  - out: EACCES on drop, 0 on pass, netisr_queue()'s result when the
  *          packet is queued locally; left untouched for PFIL_CONSUMED.
  *
  * Returns 1 when the caller is finished with the packet (dropped, consumed
  * or queued to NETISR_IP), -1 when the caller should redo its lookup
  * because destination, FIB or next hop changed, and 0 when output should
  * simply continue.
  */
 static inline int
 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
     struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
 {
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *m;
 	struct in_addr odst;
 	struct ip *ip;
 	int pflags = PFIL_OUT;
 
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* Run through list of hooks for output packets. */
 	/* Remember the destination so a rewrite can be detected afterwards. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) {
 	case PFIL_DROPPED:
 		*error = EACCES;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		return 1; /* Finished */
 	case PFIL_PASS:
 		*error = 0;
 	}
 	/* The hooks may have substituted the mbuf; reload both pointers. */
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			/* Mark still-pending checksums as already valid. */
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 				CSUM_IP_CHECKED | CSUM_IP_VALID;
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			*error = netisr_queue(NETISR_IP, m);
 			return 1; /* Finished */
 		}
 
 		/* Rebuild dst from the rewritten header for the new lookup. */
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 
 		return -1; /* Reloop */
 	}
 	/* See if fib was changed by packet filter. */
 	if ((*fibnum) != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		*fibnum = M_GETFIB(m);
 		return -1; /* Reloop for FIB change */
 	}
 
 	/* See if local, if yes, send it to netisr with M_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		*error = netisr_queue(NETISR_IP, m);
 		return 1; /* Finished */
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		/* The new sockaddr_in is stored directly behind the tag. */
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 
 		return -1; /* Reloop for CHANGE of dst */
 	}
 
 	return 0;
 }
 
 /*
  * Pass mbuf chain m to ifp's if_output routine, optionally stamping a send
  * tag on the packet header first.
  *
  * inp       - owning PCB, may be NULL; source of the rate-limit send tag
  *             when RATELIMIT is configured.
  * ifp       - outgoing interface.
  * m         - packet; must not already carry CSUM_SND_TAG.  It is freed
  *             here on the EAGAIN paths below, otherwise handed to
  *             if_output.
  * gw, ro    - passed through to if_output.
  * stamp_tag - whether a send tag may be attached; forced to true when the
  *             chain carries a TLS session (KERN_TLS).
  *
  * Returns if_output's result, or EAGAIN when the packet is dropped because
  * the TLS session has no send tag or the tag belongs to another interface.
  * With KERN_TLS, an EAGAIN for a TLS packet is replaced by the result of
  * ktls_output_eagain().
  */
 static int
 ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *gw, struct route *ro, bool stamp_tag)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	struct m_snd_tag *mst;
 	int error;
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mst = NULL;
 
 #ifdef KERN_TLS
 	/*
 	 * If this is an unencrypted TLS record, save a reference to
 	 * the record.  This local reference is used to call
 	 * ktls_output_eagain after the mbuf has been freed (thus
 	 * dropping the mbuf's reference) in if_output.
 	 */
 	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
 		tls = ktls_hold(m->m_next->m_epg_tls);
 		mst = tls->snd_tag;
 
 		/*
 		 * If a TLS session doesn't have a valid tag, it must
 		 * have had an earlier ifp mismatch, so drop this
 		 * packet.
 		 */
 		if (mst == NULL) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 		/*
 		 * Always stamp tags that include NIC ktls.
 		 */
 		stamp_tag = true;
 	}
 #endif
 #ifdef RATELIMIT
 	/* Only consult the PCB's tag when TLS did not already supply one. */
 	if (inp != NULL && mst == NULL) {
 		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
 		    (inp->inp_snd_tag != NULL &&
 		    inp->inp_snd_tag->ifp != ifp))
 			in_pcboutput_txrtlmt(inp, ifp, m);
 
 		if (inp->inp_snd_tag != NULL)
 			mst = inp->inp_snd_tag;
 	}
 #endif
 	if (stamp_tag && mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		/* A tag bound to a different interface cannot be used. */
 		if (mst->ifp != ifp) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 
 		/* stamp send tag on mbuf */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
 		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	}
 
 	error = (*ifp->if_output)(ifp, m, gw, ro);
 
 done:
 	/* Check for route change invalidating send tags. */
 #ifdef KERN_TLS
 	if (tls != NULL) {
 		if (error == EAGAIN)
 			error = ktls_output_eagain(inp, tls);
 		/* Drop the reference taken by ktls_hold() above. */
 		ktls_free(tls);
 	}
 #endif
 #ifdef RATELIMIT
 	/* NOTE(review): inp is not checked for NULL here -- confirm callee. */
 	if (error == EAGAIN)
 		in_pcboutput_eagain(inp);
 #endif
 	return (error);
 }
 
 /*
  * Mirror the nexthop's reject / blackhole / gateway properties into the
  * corresponding RT_* bits of ro->ro_flags.  The three bits are cleared
  * first, so bits left over from an earlier nexthop do not survive.
  */
 static inline void
 rt_update_ro_flags(struct route *ro, const struct nhop_object *nh)
 {
 	const int nhop_bits = nh->nh_flags;
 
 	ro->ro_flags &= ~(RT_REJECT | RT_BLACKHOLE | RT_HAS_GW);
 
 	if (nhop_bits & NHF_REJECT)
 		ro->ro_flags |= RT_REJECT;
 	if (nhop_bits & NHF_BLACKHOLE)
 		ro->ro_flags |= RT_BLACKHOLE;
 	if (nhop_bits & NHF_GATEWAY)
 		ro->ro_flags |= RT_HAS_GW;
 }
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_nh initialized, route lookup would be
  * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
  * then result of route lookup is stored in ro->ro_nh.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu = 0;
 	int error = 0;
 	int vlan_pcp = -1;
 	struct sockaddr_in *dst;
 	const struct sockaddr *gw;
 	struct in_ifaddr *ia = NULL;
 	struct in_addr src;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	uint32_t fibnum;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	int no_route_but_check_spd = 0;
 #endif
 
 	M_ASSERTPKTHDR(m);
 	NET_EPOCH_ASSERT();
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 		if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
 			vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
 			    INP_2PCP_SHIFT;
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip_fillid(ip);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 	if ((flags & IP_FORWARDING) == 0)
 		IPSTAT_INC(ips_localout);
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
 	}
 	dst = (struct sockaddr_in *)&ro->ro_dst;
 	if (ro->ro_nh == NULL) {
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 	gw = (const struct sockaddr *)dst;
 again:
 	/*
 	 * Validate route against routing table additions;
 	 * a better/more specific route might have been added.
 	 */
 	if (inp != NULL && ro->ro_nh != NULL)
 		NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
 	 * and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing the
 	 * cache with IPv6.
 	 * Also check whether routing cache needs invalidation.
 	 */
 	if (ro->ro_nh != NULL &&
 	    ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET ||
 	    dst->sin_addr.s_addr != ip->ip_dst.s_addr))
 		RO_INVALIDATE_CACHE(ro);
 	ia = NULL;
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
 						      M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
 						M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = ifp->if_flags & IFF_BROADCAST ?
 		    in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		mtu = ifp->if_mtu;
 		IFP_TO_IA(ifp, ia);
 		isbroadcast = 0;	/* fool gcc */
 		/* Interface may have no addresses. */
 		if (ia != NULL)
 			src = IA_SIN(ia)->sin_addr;
 		else
 			src.s_addr = INADDR_ANY;
 	} else if (ro != &iproute) {
 		if (ro->ro_nh == NULL) {
 			/*
 			 * We want to do any cloning requested by the link
 			 * layer, as this is probably required in all cases
 			 * for correct operation (as it is for ARP).
 			 */
 			uint32_t flowid;
 			flowid = m->m_pkthdr.flowid;
 			ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
 			    NHR_REF, flowid);
 
 			if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 				/*
 				 * There is no route for this packet, but it is
 				 * possible that a matching SPD entry exists.
 				 */
 				no_route_but_check_spd = 1;
 				goto sendit;
 #endif
 				IPSTAT_INC(ips_noroute);
 				error = EHOSTUNREACH;
 				goto bad;
 			}
 		}
 		struct nhop_object *nh = ro->ro_nh;
 
 		ia = ifatoia(nh->nh_ifa);
 		ifp = nh->nh_ifp;
 		counter_u64_add(nh->nh_pksent, 1);
 		rt_update_ro_flags(ro, nh);
 		if (nh->nh_flags & NHF_GATEWAY)
 			gw = &nh->gw_sa;
 		if (nh->nh_flags & NHF_HOST)
 			isbroadcast = (nh->nh_flags & NHF_BROADCAST);
 		else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET))
 			isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia);
 		else
 			isbroadcast = 0;
 		mtu = nh->nh_mtu;
 		src = IA_SIN(ia)->sin_addr;
 	} else {
 		struct nhop_object *nh;
 
 		nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE,
 		    m->m_pkthdr.flowid);
 		if (nh == NULL) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ifp = nh->nh_ifp;
 		mtu = nh->nh_mtu;
 		rt_update_ro_flags(ro, nh);
 		if (nh->nh_flags & NHF_GATEWAY)
 			gw = &nh->gw_sa;
 		ia = ifatoia(nh->nh_ifa);
 		src = IA_SIN(ia)->sin_addr;
 		isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
 		    (NHF_HOST | NHF_BROADCAST)) ||
 		    ((ifp->if_flags & IFF_BROADCAST) &&
 		    (gw->sa_family == AF_INET) &&
 		    in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia)));
 	}
 
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
 	    __func__, mtu, ro,
 	    (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = (const struct sockaddr *)dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY)
 			ip->ip_src = src;
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outgoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY)
 		ip->ip_src = src;
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
 		switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
 		    &error)) {
 		case 1: /* Finished */
 			goto done;
 
 		case 0: /* Continue normally */
 			ip = mtod(m, struct ip *);
 			break;
 
 		case -1: /* Need to try again */
 			/* Reset everything for a new round */
 			if (ro != NULL) {
 				RO_NHFREE(ro);
 				ro->ro_prepend = NULL;
 			}
 			gw = (const struct sockaddr *)dst;
 			ip = mtod(m, struct ip *);
 			goto again;
 		}
 	}
 
 	if (vlan_pcp > -1)
 		EVL_APPLY_PRI(m, vlan_pcp);
 
 	/* IN_LOOPBACK must not appear on the wire - RFC1122. */
 	if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	/* Ensure the packet data is mapped if the interface requires it. */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IPSTAT_INC(ips_odropped);
 			error = ENOBUFS;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 * Note that if_vxlan could have requested TSO even though the outer
 	 * frame is UDP.  It is correct to not fragment such datagrams and
 	 * instead just pass them on to the driver.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist &
 	    (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags &
 			    (CSUM_TSO | CSUM_INNER_TSO))
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = ip_output_send(inp, ifp, m, gw, ro,
 		    (flags & IP_NO_SND_TAG_RL) ? false : true);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) ||
 	    (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing upper layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
 			error = ip_output_send(inp, ifp, m, gw, ro, true);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	return (error);
  bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * Parameters:
  *   ip                - IP header of the packet being fragmented; its
  *                       ip_len, ip_off and ip_sum fields are rewritten so
  *                       that it describes the first fragment.
  *   m_frag            - in: the packet; out: head of the fragment chain,
  *                       linked through m_nextpkt.
  *   mtu               - upper bound for the size of each fragment.
  *   if_hwassist_flags - the hw offload capabilities (see
  *                       if_data.ifi_hwassist); a CSUM_IP bit set here
  *                       leaves the IP header checksum to the hardware.
  *
  * Errors:
  *   EMSGSIZE - the packet carries IP_DF, or fewer than 8 payload bytes fit
  *              into a fragment.  *m_frag is left untouched in this case.
  *   ENOBUFS  - an mbuf allocation, packet header duplication or payload
  *              copy failed part way through.
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	/* Rounded down to a multiple of 8: offsets are stored in 8-byte units. */
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	/* Work with host byte order copies of the length and offset fields. */
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/*
 	 * Packet shall not have "Don't Fragment" flag and have at least 8
 	 * bytes of payload.
 	 */
 	if (__predict_false((ip_off & IP_DF) || len < 8)) {
 		IPSTAT_INC(ips_cantfrag);
 		return (EMSGSIZE);
 	}
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 
 		off = MIN(mtu, m0->m_pkthdr.len);
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		/* Payload size for the following fragments: mtu rounded down to a page. */
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			/* Also entered by the goto above, from outside this if body. */
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	/* Payload bytes that stay in the original mbuf (the first fragment). */
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		/* The original header carries options: let ip_optcopy() fill them in. */
 		if (hlen > sizeof (struct ip)) {
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		/* Kept in host byte order here; converted with htons() further down. */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		/* The last fragment takes what is left; all others get IP_MF. */
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		/* Checksum the header in software unless the hardware does CSUM_IP. */
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		/* Append the finished fragment to the m_nextpkt chain. */
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 *
 	 * Whenever the loop above produced a fragment, hlen + firstlen is
 	 * smaller than ip_len, so the count handed to m_adj() is negative;
 	 * per mbuf(9) a negative count trims from the tail of the chain.
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	/* Reached on success and on ENOBUFS; m0 heads whatever chain exists. */
 	*m_frag = m0;
 	return error;
 }
 
 /*
  * Compute a checksum that was deferred (CSUM_DELAY_DATA) and store it in
  * the packet.  The mbuf starts with the IP header; the checksum is written
  * at the IP header length plus m_pkthdr.csum_data bytes into the packet.
  * For UDP (CSUM_UDP) the covered length comes from the UDP header and a
  * computed value of zero is stored as 0xffff; otherwise the covered length
  * is taken from ip_len.
  */
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *iph = mtod(m, struct ip *);
 	uint16_t hdrlen = iph->ip_hl << 2;
 	uint16_t sumlen, sum, sumoff;
 
 	if ((m->m_pkthdr.csum_flags & CSUM_UDP) == 0) {
 		/* Not UDP: checksum everything after the IP header. */
 		sumlen = ntohs(iph->ip_len);
 		sum = in_cksum_skip(m, sumlen, hdrlen);
 	} else {
 		if (hdrlen + sizeof(struct udphdr) <= m->m_len) {
 			/* UDP header sits in the first mbuf: read it in place. */
 			struct udphdr *udp;
 
 			udp = (struct udphdr *)mtodo(m, hdrlen);
 			sumlen = ntohs(udp->uh_ulen);
 		} else {
 			/* Header spills into the chain: copy the length out. */
 			m_copydata(m, hdrlen + offsetof(struct udphdr,
 			    uh_ulen), sizeof(sumlen), (caddr_t)&sumlen);
 			sumlen = ntohs(sumlen);
 		}
 		sum = in_cksum_skip(m, sumlen + hdrlen, hdrlen);
 		if (sum == 0)
 			sum = 0xffff;
 	}
 
 	/* Position of the checksum field within the packet. */
 	sumoff = hdrlen + m->m_pkthdr.csum_data;
 
 	if (sumoff + sizeof(sum) <= m->m_len)
 		*(u_short *)mtodo(m, sumoff) = sum;
 	else
 		m_copyback(m, sumoff, sizeof(sum), (caddr_t)&sum);
 }
 
 /*
  * IP socket option processing.
  *
  * Handles set (SOPT_SET) and get (SOPT_GET) requests described by sopt for
  * the inpcb attached to so.
  *
  * Requests whose level is not IPPROTO_IP are rejected with EINVAL, except
  * for a handful of SOL_SOCKET "set" options whose socket-level state is
  * mirrored into the inpcb here.
  *
  * Returns 0 on success or an errno value: EINVAL for an out-of-range
  * value, ENOPROTOOPT for an option this function does not know, or
  * whatever the copyin/copyout and delegated handlers return.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		/*
 		 * Copy the state of selected socket-level options, as found
 		 * in so_options / so_fibnum, into the inpcb.
 		 */
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			/* The option bytes must fit into a single mbuf. */
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			/*
 			 * NOTE(review): m is not freed here after the call;
 			 * presumably ip_pcbopts() consumes it -- confirm.
 			 */
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			/* Privilege is only checked when a thread is attached. */
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 		case IP_VLAN_PCP:
 			/* All of these take a single int argument. */
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 /* Set or clear "bit" in inp_flags according to optval, under the write lock. */
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 /* Same as OPTSET, but for inp_flags2 and with an explicit value. */
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				OPTSET2(INP_ORIGDSTADDR, optval);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			case IP_VLAN_PCP:
 				/* -1 clears the setting; other values are stored in inp_flags2. */
 				if ((optval >= -1) && (optval <=
 				    (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
 					if (optval == -1) {
 						INP_WLOCK(inp);
 						inp->inp_flags2 &=
 						    ~(INP_2PCP_SET |
 						      INP_2PCP_MASK);
 						INP_WUNLOCK(inp);
 					} else {
 						INP_WLOCK(inp);
 						inp->inp_flags2 |=
 						    INP_2PCP_SET;
 						inp->inp_flags2 &=
 						    ~INP_2PCP_MASK;
 						inp->inp_flags2 |=
 						    optval << INP_2PCP_SHIFT;
 						INP_WUNLOCK(inp);
 					}
 				} else
 					error = EINVAL;
 				break;
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			/* INP_LOWPORT and INP_HIGHPORT are kept mutually exclusive. */
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			/*
 			 * Copy the stored options while holding the read
 			 * lock, then hand the copy to the caller after the
 			 * lock has been dropped.
 			 */
 			INP_RLOCK(inp);
 			if (inp->inp_options) {
 				struct mbuf *options;
 
 				options = m_copym(inp->inp_options, 0,
 				    M_COPYALL, M_NOWAIT);
 				INP_RUNLOCK(inp);
 				if (options != NULL) {
 					error = sooptcopyout(sopt,
 							     mtod(options, char *),
 							     options->m_len);
 					m_freem(options);
 				} else
 					error = ENOMEM;
 			} else {
 				INP_RUNLOCK(inp);
 				sopt->sopt_valsize = 0;
 			}
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 		case IP_VLAN_PCP:
 			/* All of these are reported as a single int. */
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 /* Evaluate to 1 if "bit" is set in inp_flags / inp_flags2, else 0. */
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				optval = OPTBIT2(INP_ORIGDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			case IP_VLAN_PCP:
 				if (OPTBIT2(INP_2PCP_SET)) {
 					optval = (inp->inp_flags2 &
 					    INP_2PCP_MASK) >> INP_2PCP_SHIFT;
 				} else {
 					optval = -1;
 				}
 				break;
 			}
 			/*
 			 * Only copy the value out if the lookup above
 			 * succeeded; otherwise an error set by a case (e.g.
 			 * IP_RSSBUCKETID) would be overwritten and a stale
 			 * optval reported as success.
 			 */
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Loop a copy of an outgoing IP multicast packet back to the input queue
  * of the given interface on behalf of ip_output().  The copy is pushed
  * through the loopback "driver" output path via if_simloop() using the
  * caller's ifp, which may well NOT be a loopback interface -- ugly, but
  * simpler than duplicating that code here.
  */
 static void
 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
 {
 	struct mbuf *mcopy;
 	struct ip *iph;
 
 	/*
 	 * Checksums get written into the packet below, so operate on a
 	 * deep copy instead of the caller's mbuf chain.
 	 */
 	mcopy = m_dup(m, M_NOWAIT);
 	if (mcopy == NULL)
 		return;
 
 	/* Pull the IP header up if it is not writable or not contiguous. */
 	if (!M_WRITABLE(mcopy) || mcopy->m_len < hlen) {
 		mcopy = m_pullup(mcopy, hlen);
 		if (mcopy == NULL)
 			return;
 	}
 
 	/* Finish a deferred data checksum, if any, and mark it as valid. */
 	if ((mcopy->m_pkthdr.csum_flags & CSUM_DELAY_DATA) != 0) {
 		in_delayed_cksum(mcopy);
 		mcopy->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		mcopy->m_pkthdr.csum_flags |= CSUM_DATA_VALID;
 		mcopy->m_pkthdr.csum_flags |= CSUM_PSEUDO_HDR;
 		mcopy->m_pkthdr.csum_data = 0xffff;
 	}
 
 	/*
 	 * No fragmentation is attempted here, even when the IP length
 	 * exceeds the interface's MTU.  Can this possibly matter?
 	 */
 	iph = mtod(mcopy, struct ip *);
 	iph->ip_sum = 0;
 	iph->ip_sum = in_cksum(mcopy, hlen);
 	if_simloop(ifp, mcopy, AF_INET, 0);
 }
diff --git a/sys/netinet/ip_reass.c b/sys/netinet/ip_reass.c
index 06cdbca2961c..5209dfb0d1cb 100644
--- a/sys/netinet/ip_reass.c
+++ b/sys/netinet/ip_reass.c
@@ -1,980 +1,981 @@
 /*-
  * Copyright (c) 2015 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 2015 Adrian Chadd <adrian@FreeBSD.org>
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_input.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/hash.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/rss_config.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_rss.h>
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 SYSCTL_DECL(_net_inet_ip);
 
 /*
  * Reassembly headers are stored in hash buckets.
  *
  * IPREASS_NHASH is the default number of buckets.  IPREASS_HMASK turns a
  * hash value into a bucket index by masking with (table size - 1), which
  * only addresses every bucket when the table size is a power of two.
  */
 #define	IPREASS_NHASH_LOG2	10
 #define	IPREASS_NHASH		(1 << IPREASS_NHASH_LOG2)
 #define	IPREASS_HMASK		(V_ipq_hashsize - 1)
 
 /*
  * One hash bucket:
  *   head   list of reassembly queues (struct ipq) hashed to this bucket
  *   lock   mutex taken by the IPQ_LOCK() family of macros below
  *   timer  callout used to run ipreass_callout() for this bucket
  *   vnet   (VIMAGE only) vnet recorded when the bucket is initialised
  *   count  number of queues currently linked on head
  */
 struct ipqbucket {
 	TAILQ_HEAD(ipqhead, ipq) head;
 	struct mtx		 lock;
 	struct callout		 timer;
 #ifdef VIMAGE
 	struct vnet		 *vnet;
 #endif
 	int			 count;
 };
 
 /* Per-vnet hash table, the seed used for hashing, and the table size. */
 VNET_DEFINE_STATIC(struct ipqbucket *, ipq);
 #define	V_ipq		VNET(ipq)
 VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
 #define	V_ipq_hashseed	VNET(ipq_hashseed)
 VNET_DEFINE_STATIC(uint32_t, ipq_hashsize);
 #define	V_ipq_hashsize	VNET(ipq_hashsize)
 
 /*
  * Bucket locking.  The (i) variants take a bucket index into V_ipq, the
  * (b) variant takes a bucket pointer.
  */
 #define	IPQ_LOCK(i)	mtx_lock(&V_ipq[i].lock)
 #define	IPQ_TRYLOCK(i)	mtx_trylock(&V_ipq[i].lock)
 #define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
 #define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
 #define	IPQ_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->lock, MA_OWNED)
 
 /* Per-vnet cap on the number of reassembly queues in a single bucket. */
 VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
 #define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)
 
 /* Functions with external linkage. */
 void		ipreass_init(void);
 void		ipreass_vnet_init(void);
 #ifdef VIMAGE
 void		ipreass_destroy(void);
 #endif
 /* File-local helpers defined further down. */
 static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
 static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
 static int	sysctl_fragttl(SYSCTL_HANDLER_ARGS);
 static void	ipreass_zone_change(void *);
 static void	ipreass_drain_tomax(void);
 static void	ipq_free(struct ipqbucket *, struct ipq *);
 static struct ipq * ipq_reuse(int);
 static void	ipreass_callout(void *);
 static void	ipreass_reschedule(struct ipqbucket *);
 
 /*
  * Discard a reassembly queue because it expired: charge all of its
  * fragments to the ips_fragtimeout counter, then free the queue.
  * Unlike ipq_drop(), the bucket timer is NOT rescheduled here; callers
  * do that themselves.
  */
 static inline void
 ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
 {
 
 	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 	ipq_free(bucket, fp);
 }
 
 /*
  * Discard a reassembly queue: charge all of its fragments to the
  * ips_fragdropped counter, free the queue and re-arm (or stop) the
  * bucket timer to match the remaining entries.
  */
 static inline void
 ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
 {
 
 	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
 	ipq_free(bucket, fp);
 	ipreass_reschedule(bucket);
 }
 
 /*
  * By default, limit the number of IP fragments across all reassembly
  * queues to 1/32 of the total number of mbuf clusters.
  *
  * Limit the total number of reassembly queues per VNET to the
  * IP fragment limit, but ensure the limit will not allow any bucket
  * to grow above 100 items. (The bucket limit is
  * IP_MAXFRAGPACKETS / (V_ipq_hashsize / 2), so the 50 is the correct
  * multiplier to reach a 100-item limit.)
  * The 100-item limit was chosen as brief testing seems to show that
  * this produces "reasonable" performance on some subset of systems
  * under DoS attack.
  */
 #define	IP_MAXFRAGS		(nmbclusters / 32)
 #define	IP_MAXFRAGPACKETS	(imin(IP_MAXFRAGS, V_ipq_hashsize * 50))
 
 /*
  * maxfrags: global cap on queued fragments, set to IP_MAXFRAGS by
  * ipreass_init(); ip_reass() does not apply the cap when it is negative.
  * nfrags: number of fragments currently queued, updated with atomic
  * operations.  Neither variable is per-vnet.
  */
 static int		maxfrags;
 static u_int __exclusive_cache_line	nfrags;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
     &maxfrags, 0,
     "Maximum number of IPv4 fragments allowed across all reassembly queues");
 SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
     &nfrags, 0,
     "Current number of IPv4 fragments across all reassembly queues");
 
 /* Per-vnet UMA zone from which reassembly queue headers are allocated. */
 VNET_DEFINE_STATIC(uma_zone_t, ipq_zone);
 #define	V_ipq_zone	VNET(ipq_zone)
 
 SYSCTL_UINT(_net_inet_ip, OID_AUTO, reass_hashsize,
     CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(ipq_hashsize), 0,
     "Size of IP fragment reassembly hashtable");
 
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_maxfragpackets, "I",
     "Maximum number of IPv4 fragment reassembly queue entries");
 SYSCTL_UMA_CUR(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_VNET,
     &VNET_NAME(ipq_zone),
     "Current number of IPv4 fragment reassembly queue entries");
 
 /*
  * Set to 1 by sysctl_maxfragpackets() when the limit is written as 0;
  * ip_reass() then drops every fragment.
  */
 VNET_DEFINE_STATIC(int, noreass);
 #define	V_noreass	VNET(noreass)
 
 /* Per-packet fragment cap; ipreass_vnet_init() sets it to 16. */
 VNET_DEFINE_STATIC(int, maxfragsperpacket);
 #define	V_maxfragsperpacket	VNET(maxfragsperpacket)
 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(maxfragsperpacket), 0,
     "Maximum number of IPv4 fragments allowed per packet");
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
     sysctl_maxfragbucketsize, "I",
     "Maximum number of IPv4 fragment reassembly queue entries per bucket");
 
 /*
  * Lifetime of a reassembly queue in seconds (default 30).  Writes go
  * through sysctl_fragttl(), which accepts values from 1 to MAXTTL.
  */
 VNET_DEFINE_STATIC(u_int, ipfragttl) = 30;
 #define	V_ipfragttl	VNET(ipfragttl)
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, fragttl, CTLTYPE_INT | CTLFLAG_RW |
     CTLFLAG_MPSAFE | CTLFLAG_VNET, NULL, 0, sysctl_fragttl, "IU",
     "IP fragment life time on reassembly queue (seconds)");
 
 /*
  * Take incoming datagram fragment and try to reassemble it into
  * whole datagram.  If the argument is the first fragment or one
  * in between the function will return NULL and store the mbuf
  * in the fragment chain.  If the argument is the last fragment
  * the packet will be reassembled and the pointer to the new
  * mbuf returned for further processing.  Only m_tags attached
  * to the first packet/fragment are preserved.
  * The IP header is *NOT* adjusted out of iplen.
  *
  * NULL is also returned when the fragment is dropped; in every NULL
  * case the caller must not touch m again (it was either queued or
  * freed).  When the kernel is built with RSS, a completed datagram is
  * handed to netisr_dispatch() and NULL is returned as well.
  *
  * M_IP_FRAG marks a queued fragment that had the IP_MF ("more
  * fragments") bit set.
  */
 #define	M_IP_FRAG	M_PROTO9
 struct mbuf *
 ip_reass(struct mbuf *m)
 {
 	struct ip *ip;
 	struct mbuf *p, *q, *nq, *t;
 	struct ipq *fp;
 	struct ifnet *srcifp;
 	struct ipqhead *head;
 	int i, hlen, next, tmpmax;
 	u_int8_t ecn, ecn0;
 	uint32_t hash, hashkey[3];
 #ifdef	RSS
 	uint32_t rss_hash, rss_type;
 #endif
 
 	/*
 	 * If no reassembling or maxfragsperpacket are 0,
 	 * never accept fragments.
 	 * Also, drop packet if it would exceed the maximum
 	 * number of fragments.
 	 * (A negative maxfrags disables the global fragment limit.)
 	 */
 	tmpmax = maxfrags;
 	if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
 	    (tmpmax >= 0 && atomic_load_int(&nfrags) >= (u_int)tmpmax)) {
 		IPSTAT_INC(ips_fragments);
 		IPSTAT_INC(ips_fragdropped);
 		m_freem(m);
 		return (NULL);
 	}
 
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 
 	/*
 	 * Adjust ip_len to not reflect header,
 	 * convert offset of this to bytes.
 	 */
 	ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
 	/*
 	 * Make sure that fragments have a data length
 	 * that's a non-zero multiple of 8 bytes, unless
 	 * this is the last fragment.
 	 */
 	if (ip->ip_len == htons(0) ||
 	    ((ip->ip_off & htons(IP_MF)) && (ntohs(ip->ip_len) & 0x7) != 0)) {
 		IPSTAT_INC(ips_toosmall); /* XXX */
 		IPSTAT_INC(ips_fragdropped);
 		m_freem(m);
 		return (NULL);
 	}
 	/*
 	 * Remember the "more fragments" bit in the mbuf flags, because the
 	 * shift below pushes the flag bits out of ip_off.
 	 */
 	if (ip->ip_off & htons(IP_MF))
 		m->m_flags |= M_IP_FRAG;
 	else
 		m->m_flags &= ~M_IP_FRAG;
 	ip->ip_off = htons(ntohs(ip->ip_off) << 3);
 
 	/*
 	 * Make sure the fragment lies within a packet of valid size.
 	 */
 	if (ntohs(ip->ip_len) + ntohs(ip->ip_off) > IP_MAXPACKET) {
 		IPSTAT_INC(ips_toolong);
 		IPSTAT_INC(ips_fragdropped);
 		m_freem(m);
 		return (NULL);
 	}
 
 	/*
 	 * Store receive network interface pointer for later.
 	 */
 	srcifp = m->m_pkthdr.rcvif;
 
 	/*
 	 * Attempt reassembly; if it succeeds, proceed.
 	 * ip_reass() will return a different mbuf.
 	 *
 	 * The header pointer is stashed in the packet header so that
 	 * GETIP() below can find it once m_data has moved past the header.
 	 */
 	IPSTAT_INC(ips_fragments);
 	m->m_pkthdr.PH_loc.ptr = ip;
 
 	/*
 	 * Presence of header sizes in mbufs
 	 * would confuse code below.
 	 */
 	m->m_data += hlen;
 	m->m_len -= hlen;
 
 	/*
 	 * Pick the bucket from a seeded hash over source address,
 	 * destination address, protocol and IP id, then lock it.
 	 */
 	hashkey[0] = ip->ip_src.s_addr;
 	hashkey[1] = ip->ip_dst.s_addr;
 	hashkey[2] = (uint32_t)ip->ip_p << 16;
 	hashkey[2] += ip->ip_id;
 	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
 	hash &= IPREASS_HMASK;
 	head = &V_ipq[hash].head;
 	IPQ_LOCK(hash);
 
 	/*
 	 * Look for queue of fragments
 	 * of this datagram.
 	 */
 	TAILQ_FOREACH(fp, head, ipq_list)
 		if (ip->ip_id == fp->ipq_id &&
 		    ip->ip_src.s_addr == fp->ipq_src.s_addr &&
 		    ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
 #ifdef MAC
 		    mac_ipq_match(m, fp) &&
 #endif
 		    ip->ip_p == fp->ipq_p)
 			break;
 	/*
 	 * If first fragment to arrive, create a reassembly queue.
 	 */
 	if (fp == NULL) {
 		/*
 		 * Allocate a new header only while the bucket is below its
 		 * limit; otherwise, or if allocation fails, try to recycle
 		 * an existing queue.
 		 */
 		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
 			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
 		if (fp == NULL)
 			fp = ipq_reuse(hash);
 		if (fp == NULL)
 			goto dropfrag;
 #ifdef MAC
 		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
 			uma_zfree(V_ipq_zone, fp);
 			fp = NULL;
 			goto dropfrag;
 		}
 		mac_ipq_create(m, fp);
 #endif
 		/* New queues are inserted at the head of the bucket list. */
 		TAILQ_INSERT_HEAD(head, fp, ipq_list);
 		V_ipq[hash].count++;
 		fp->ipq_nfrags = 1;
 		atomic_add_int(&nfrags, 1);
 		fp->ipq_expire = time_uptime + V_ipfragttl;
 		fp->ipq_p = ip->ip_p;
 		fp->ipq_id = ip->ip_id;
 		fp->ipq_src = ip->ip_src;
 		fp->ipq_dst = ip->ip_dst;
 		fp->ipq_frags = m;
 		/* ipq_maxoff stays -1 until the final fragment is seen. */
 		if (m->m_flags & M_IP_FRAG)
 			fp->ipq_maxoff = -1;
 		else
 			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 		m->m_nextpkt = NULL;
 		/*
 		 * If the new queue is also the tail it is the only entry in
 		 * the bucket, so the timer has to be armed here; otherwise
 		 * the timer is expected to be active already.
 		 */
 		if (fp == TAILQ_LAST(head, ipqhead))
 			callout_reset_sbt(&V_ipq[hash].timer,
 			    SBT_1S * V_ipfragttl, SBT_1S, ipreass_callout,
 			    &V_ipq[hash], 0);
 		else
 			MPASS(callout_active(&V_ipq[hash].timer));
 		goto done;
 	} else {
 		/*
 		 * If we already saw the last fragment, make sure
 		 * this fragment's offset looks sane. Otherwise, if
 		 * this is the last fragment, record its endpoint.
 		 */
 		if (fp->ipq_maxoff > 0) {
 			i = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 			if (((m->m_flags & M_IP_FRAG) && i >= fp->ipq_maxoff) ||
 			    ((m->m_flags & M_IP_FRAG) == 0 &&
 			    i != fp->ipq_maxoff)) {
 				/* Not yet counted, so skip the uncount. */
 				fp = NULL;
 				goto dropfrag;
 			}
 		} else if ((m->m_flags & M_IP_FRAG) == 0)
 			fp->ipq_maxoff = ntohs(ip->ip_off) + ntohs(ip->ip_len);
 		/*
 		 * Count the fragment now; the dropfrag path below undoes
 		 * this when fp is non-NULL.
 		 */
 		fp->ipq_nfrags++;
 		atomic_add_int(&nfrags, 1);
 #ifdef MAC
 		mac_ipq_update(m, fp);
 #endif
 	}
 
 /* Fetch the IP header pointer stashed in a fragment's packet header. */
 #define GETIP(m)	((struct ip*)((m)->m_pkthdr.PH_loc.ptr))
 
 	/*
 	 * Handle ECN by comparing this segment with the first one;
 	 * if CE is set, do not lose CE.
 	 * drop if CE and not-ECT are mixed for the same packet.
 	 */
 	ecn = ip->ip_tos & IPTOS_ECN_MASK;
 	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
 	if (ecn == IPTOS_ECN_CE) {
 		if (ecn0 == IPTOS_ECN_NOTECT)
 			goto dropfrag;
 		if (ecn0 != IPTOS_ECN_CE)
 			GETIP(fp->ipq_frags)->ip_tos |= IPTOS_ECN_CE;
 	}
 	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
 		goto dropfrag;
 
 	/*
 	 * Find a segment which begins after this one does.
 	 * On exit p is the last segment starting at or before the new
 	 * one (or NULL) and q the first one starting after it (or NULL).
 	 */
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt)
 		if (ntohs(GETIP(q)->ip_off) > ntohs(ip->ip_off))
 			break;
 
 	/*
 	 * If there is a preceding segment, it may provide some of
 	 * our data already.  If so, drop the data from the incoming
 	 * segment.  If it provides all of our data, drop us, otherwise
 	 * stick new segment in the proper place.
 	 *
 	 * If some of the data is dropped from the preceding
 	 * segment, then its checksum is invalidated.
 	 */
 	if (p) {
 		i = ntohs(GETIP(p)->ip_off) + ntohs(GETIP(p)->ip_len) -
 		    ntohs(ip->ip_off);
 		if (i > 0) {
 			if (i >= ntohs(ip->ip_len))
 				goto dropfrag;
 			m_adj(m, i);
 			m->m_pkthdr.csum_flags = 0;
 			ip->ip_off = htons(ntohs(ip->ip_off) + i);
 			ip->ip_len = htons(ntohs(ip->ip_len) - i);
 		}
 		m->m_nextpkt = p->m_nextpkt;
 		p->m_nextpkt = m;
 	} else {
 		m->m_nextpkt = fp->ipq_frags;
 		fp->ipq_frags = m;
 	}
 
 	/*
 	 * While we overlap succeeding segments trim them or,
 	 * if they are completely covered, dequeue them.
 	 */
 	for (; q != NULL && ntohs(ip->ip_off) + ntohs(ip->ip_len) >
 	    ntohs(GETIP(q)->ip_off); q = nq) {
 		i = (ntohs(ip->ip_off) + ntohs(ip->ip_len)) -
 		    ntohs(GETIP(q)->ip_off);
 		if (i < ntohs(GETIP(q)->ip_len)) {
 			GETIP(q)->ip_len = htons(ntohs(GETIP(q)->ip_len) - i);
 			GETIP(q)->ip_off = htons(ntohs(GETIP(q)->ip_off) + i);
 			m_adj(q, i);
 			q->m_pkthdr.csum_flags = 0;
 			break;
 		}
 		nq = q->m_nextpkt;
 		m->m_nextpkt = nq;
 		IPSTAT_INC(ips_fragdropped);
 		fp->ipq_nfrags--;
 		atomic_subtract_int(&nfrags, 1);
 		m_freem(q);
 	}
 
 	/*
 	 * Check for complete reassembly and perform frag per packet
 	 * limiting.
 	 *
 	 * Frag limiting is performed here so that the nth frag has
 	 * a chance to complete the packet before we drop the packet.
 	 * As a result, n+1 frags are actually allowed per packet, but
 	 * only n will ever be stored. (n = maxfragsperpacket.)
 	 *
 	 */
 	next = 0;
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
 		/* A gap in the byte sequence: not complete yet. */
 		if (ntohs(GETIP(q)->ip_off) != next) {
 			if (fp->ipq_nfrags > V_maxfragsperpacket)
 				ipq_drop(&V_ipq[hash], fp);
 			goto done;
 		}
 		next += ntohs(GETIP(q)->ip_len);
 	}
 	/* Make sure the last packet didn't have the IP_MF flag */
 	if (p->m_flags & M_IP_FRAG) {
 		if (fp->ipq_nfrags > V_maxfragsperpacket)
 			ipq_drop(&V_ipq[hash], fp);
 		goto done;
 	}
 
 	/*
 	 * Reassembly is complete.  Make sure the packet is a sane size.
 	 */
 	q = fp->ipq_frags;
 	ip = GETIP(q);
 	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
 		IPSTAT_INC(ips_toolong);
 		ipq_drop(&V_ipq[hash], fp);
 		goto done;
 	}
 
 	/*
 	 * Concatenate fragments.
 	 * Checksum flags are ANDed and csum_data summed across fragments;
 	 * the carry of that sum is folded after the loop.
 	 */
 	m = q;
 	t = m->m_next;
 	m->m_next = NULL;
 	m_cat(m, t);
 	nq = q->m_nextpkt;
 	q->m_nextpkt = NULL;
 	for (q = nq; q != NULL; q = nq) {
 		nq = q->m_nextpkt;
 		q->m_nextpkt = NULL;
 		m->m_pkthdr.csum_flags &= q->m_pkthdr.csum_flags;
 		m->m_pkthdr.csum_data += q->m_pkthdr.csum_data;
 		m_demote_pkthdr(q);
 		m_cat(m, q);
 	}
 	/*
 	 * In order to do checksumming faster we do 'end-around carry' here
 	 * (and not in for{} loop), though it implies we are not going to
 	 * reassemble more than 64k fragments.
 	 */
 	while (m->m_pkthdr.csum_data & 0xffff0000)
 		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
 		    (m->m_pkthdr.csum_data >> 16);
 	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 #ifdef MAC
 	mac_ipq_reassemble(fp, m);
 	mac_ipq_destroy(fp);
 #endif
 
 	/*
 	 * Create header for new ip packet by modifying header of first
 	 * packet;  dequeue and discard fragment reassembly header.
 	 * Make header visible.
 	 */
 	ip->ip_len = htons((ip->ip_hl << 2) + next);
 	ip->ip_src = fp->ipq_src;
 	ip->ip_dst = fp->ipq_dst;
 	TAILQ_REMOVE(head, fp, ipq_list);
 	V_ipq[hash].count--;
 	uma_zfree(V_ipq_zone, fp);
 	m->m_len += (ip->ip_hl << 2);
 	m->m_data -= (ip->ip_hl << 2);
 	/* some debugging cruft by sklower, below, will go away soon */
 	if (m->m_flags & M_PKTHDR) {	/* XXX this should be done elsewhere */
 		m_fixhdr(m);
 		/* set valid receive interface pointer */
 		m->m_pkthdr.rcvif = srcifp;
 	}
 	IPSTAT_INC(ips_reassembled);
 	ipreass_reschedule(&V_ipq[hash]);
 	IPQ_UNLOCK(hash);
 
 #ifdef	RSS
 	/*
 	 * Query the RSS layer for the flowid / flowtype for the
 	 * mbuf payload.
 	 *
 	 * For now, just assume we have to calculate a new one.
 	 * Later on we should check to see if the assigned flowid matches
 	 * what RSS wants for the given IP protocol and if so, just keep it.
 	 *
 	 * We then queue into the relevant netisr so it can be dispatched
 	 * to the correct CPU.
 	 *
 	 * Note - this may return 1, which means the flowid in the mbuf
 	 * is correct for the configured RSS hash types and can be used.
 	 */
 	if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
 		m->m_pkthdr.flowid = rss_hash;
 		M_HASHTYPE_SET(m, rss_type);
 	}
 
 	/*
 	 * Queue/dispatch for reprocessing.
 	 *
 	 * Note: this is much slower than just handling the frame in the
 	 * current receive context.  It's likely worth investigating
 	 * why this is.
 	 */
 	netisr_dispatch(NETISR_IP_DIRECT, m);
 	return (NULL);
 #endif
 
 	/* Handle in-line */
 	return (m);
 
 	/*
 	 * Drop the incoming fragment.  fp is non-NULL only when the fragment
 	 * was already counted against a queue and must be uncounted.
 	 */
 dropfrag:
 	IPSTAT_INC(ips_fragdropped);
 	if (fp != NULL) {
 		fp->ipq_nfrags--;
 		atomic_subtract_int(&nfrags, 1);
 	}
 	m_freem(m);
 	/* Common exit for "queued" and "dropped": unlock and return NULL. */
 done:
 	IPQ_UNLOCK(hash);
 	return (NULL);
 
 #undef GETIP
 }
 
 /*
  * Timer expired on a bucket.
  * There should be at least one ipq to be timed out.
  */
 static void
 ipreass_callout(void *arg)
 {
 	struct ipqbucket *bucket = arg;
 	struct ipq *fp;
 
 	IPQ_BUCKET_LOCK_ASSERT(bucket);
 	MPASS(atomic_load_int(&nfrags) > 0);
 
 	CURVNET_SET(bucket->vnet);
 	fp = TAILQ_LAST(&bucket->head, ipqhead);
 	KASSERT(fp != NULL && fp->ipq_expire <= time_uptime,
 	    ("%s: stray callout on bucket %p, %ju < %ju", __func__, bucket,
 	    fp ? (uintmax_t)fp->ipq_expire : 0, (uintmax_t)time_uptime));
 
 	while (fp != NULL && fp->ipq_expire <= time_uptime) {
 		ipq_timeout(bucket, fp);
 		fp = TAILQ_LAST(&bucket->head, ipqhead);
 	}
 	ipreass_reschedule(bucket);
 	CURVNET_RESTORE();
 }
 
 /*
  * Re-arm the bucket timer for the queue at the tail of the bucket, or
  * stop the timer when the bucket is empty.  The bucket lock must be held.
  */
 static void
 ipreass_reschedule(struct ipqbucket *bucket)
 {
 	struct ipq *last;
 	time_t remaining;
 
 	IPQ_BUCKET_LOCK_ASSERT(bucket);
 
 	last = TAILQ_LAST(&bucket->head, ipqhead);
 	if (last == NULL) {
 		callout_stop(&bucket->timer);
 		return;
 	}
 
 	/*
 	 * time_uptime may have ticked past the expiry already; never
 	 * schedule less than one second ahead.
 	 */
 	remaining = last->ipq_expire - time_uptime;
 	if (remaining <= 0)
 		remaining = 1;
 	callout_reset_sbt(&bucket->timer, SBT_1S * remaining, SBT_1S,
 	    ipreass_callout, bucket, 0);
 }
 
 static void
 ipreass_drain_vnet(void)
 {
 	u_int dropped = 0;
 
 	for (int i = 0; i < V_ipq_hashsize; i++) {
 		bool resched;
 
 		IPQ_LOCK(i);
 		resched = !TAILQ_EMPTY(&V_ipq[i].head);
 		while(!TAILQ_EMPTY(&V_ipq[i].head)) {
 			struct ipq *fp = TAILQ_FIRST(&V_ipq[i].head);
 
 			dropped += fp->ipq_nfrags;
 			ipq_free(&V_ipq[i], fp);
 		}
 		if (resched)
 			ipreass_reschedule(&V_ipq[i]);
 		KASSERT(V_ipq[i].count == 0,
 		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
 		    V_ipq[i].count, V_ipq));
 		IPQ_UNLOCK(i);
 	}
 	IPSTAT_ADD(ips_fragdropped, dropped);
 }
 
 /*
  * Drain off all datagram fragments in every vnet.
  *
  * The vnet list is walked with the non-sleepable vnet list read lock
  * held, the same way ipreass_zone_change() does it, so the list cannot
  * change underneath VNET_FOREACH().
  */
 static void
 ipreass_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		ipreass_drain_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 
 /*
  * Initialize the IP reassembly structures of the current vnet: the hash
  * table with its per-bucket lock and timer, the hash seed, the ipq zone
  * and the derived per-bucket limit.
  */
 MALLOC_DEFINE(M_IPREASS_HASH, "IP reass", "IP packet reassembly hash headers");
 void
 ipreass_vnet_init(void)
 {
 	int max;
 
 	V_ipq_hashsize = IPREASS_NHASH;
 	TUNABLE_INT_FETCH("net.inet.ip.reass_hashsize", &V_ipq_hashsize);
 	/*
 	 * The table size comes from a loader tunable and must be sane:
 	 * IPREASS_HMASK (size - 1) is only a valid index mask for a power
 	 * of two, a size of 0 would index outside the table, and the
 	 * bucket limit below divides by (size / 2), which is 0 for size 1.
 	 */
 	if (V_ipq_hashsize < 2 || !powerof2(V_ipq_hashsize)) {
 		printf("%s: invalid net.inet.ip.reass_hashsize %u, using %d\n",
 		    __func__, V_ipq_hashsize, IPREASS_NHASH);
 		V_ipq_hashsize = IPREASS_NHASH;
 	}
 	V_ipq = malloc(sizeof(struct ipqbucket) * V_ipq_hashsize,
 	    M_IPREASS_HASH, M_WAITOK);
 
 	for (int i = 0; i < V_ipq_hashsize; i++) {
 		TAILQ_INIT(&V_ipq[i].head);
 		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
 		    MTX_DEF | MTX_DUPOK | MTX_NEW);
 		/* The bucket timer is tied to the bucket lock. */
 		callout_init_mtx(&V_ipq[i].timer, &V_ipq[i].lock, 0);
 		V_ipq[i].count = 0;
 #ifdef VIMAGE
 		V_ipq[i].vnet = curvnet;
 #endif
 	}
 	V_ipq_hashseed = arc4random();
 	V_maxfragsperpacket = 16;
 	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
 	    NULL, UMA_ALIGN_PTR, 0);
 	/* Use the limit the zone actually applied to size the buckets. */
 	max = IP_MAXFRAGPACKETS;
 	max = uma_zone_set_max(V_ipq_zone, max);
 	V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
 }
 
 /*
  * Global (not per-vnet) initialisation: set the default fragment limit
  * and register the event handlers that re-derive the limits when
  * nmbclusters changes and that drain all fragments on the vm_lowmem and
  * mbuf_lowmem events.
  */
 void
 ipreass_init(void)
 {
 
 	maxfrags = IP_MAXFRAGS;
 	EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	EVENTHANDLER_REGISTER(vm_lowmem, ipreass_drain, NULL,
 	    LOWMEM_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(mbuf_lowmem, ipreass_drain, NULL,
 		LOWMEM_PRI_DEFAULT);
 }
 
 /*
  * ifnet_departure_event handler.  Queued fragments are not freed;
  * instead, every fragment that was received on the departing interface
  * has its rcvif pointer cleared so it no longer refers to it.
  */
 static void
 ipreass_cleanup(void *arg __unused, struct ifnet *ifp)
 {
 	struct ipq *fp;
 	struct mbuf *frag;
 	int i;
 
 	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 
 	/*
 	 * Nothing to do if IPv4 reassembly is not initialised in this vnet
 	 * or has been torn down by ipreass_destroy().
 	 */
 	if (V_ipq_zone != NULL) {
 		for (i = 0; i < V_ipq_hashsize; i++) {
 			IPQ_LOCK(i);
 			TAILQ_FOREACH(fp, &V_ipq[i].head, ipq_list) {
 				for (frag = fp->ipq_frags; frag != NULL;
 				    frag = frag->m_nextpkt) {
 					/* Forget the vanishing interface. */
 					if (frag->m_pkthdr.rcvif == ifp)
 						frag->m_pkthdr.rcvif = NULL;
 				}
 			}
 			IPQ_UNLOCK(i);
 		}
 	}
 	CURVNET_RESTORE();
 }
 EVENTHANDLER_DEFINE(ifnet_departure_event, ipreass_cleanup, NULL, 0);
 
 #ifdef VIMAGE
 /*
  * Destroy IP reassembly structures of the current vnet: drain all
  * queues, destroy the ipq zone, then destroy the bucket locks and free
  * the hash table.
  */
 void
 ipreass_destroy(void)
 {
 
 	ipreass_drain_vnet();
 	uma_zdestroy(V_ipq_zone);
 	/* ipreass_cleanup() checks for NULL to detect a torn-down vnet. */
 	V_ipq_zone = NULL;
 	for (int i = 0; i < V_ipq_hashsize; i++)
 		mtx_destroy(&V_ipq[i].lock);
 	free(V_ipq, M_IPREASS_HASH);
 }
 #endif
 
 /*
  * Trim the reassembly queues of the current vnet after one of the limits
  * has changed.  First every bucket is brought down to
  * V_ipreass_maxbucketsize; then, if the ipq zone has an item limit, the
  * oldest queues are dropped until the zone is within that limit.
  */
 static void
 ipreass_drain_tomax(void)
 {
 	struct ipq *fp;
 	int target;
 
 	/*
 	 * Make sure each bucket is under the new limit. If
 	 * necessary, drop enough of the oldest elements from
 	 * each bucket to get under the new limit.
 	 */
 	for (int i = 0; i < V_ipq_hashsize; i++) {
 		IPQ_LOCK(i);
 		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
 		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
 			ipq_timeout(&V_ipq[i], fp);
 		ipreass_reschedule(&V_ipq[i]);
 		IPQ_UNLOCK(i);
 	}
 
 	/*
 	 * A zone maximum of 0 means "no limit" (sysctl_maxfragpackets()
 	 * reports it as -1).  Treating it as a target of zero entries
 	 * would throw away every queue, so there is nothing more to do.
 	 */
 	target = uma_zone_get_max(V_ipq_zone);
 	if (target == 0)
 		return;
 
 	/*
 	 * If we are over the maximum number of fragments,
 	 * drain off enough to get down to the new limit,
 	 * stripping off last elements on queues.  Every
 	 * run we strip the oldest element from each bucket.
 	 */
 	while (uma_zone_get_cur(V_ipq_zone) > target) {
 		for (int i = 0; i < V_ipq_hashsize; i++) {
 			IPQ_LOCK(i);
 			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
 			if (fp != NULL) {
 				ipq_timeout(&V_ipq[i], fp);
 				ipreass_reschedule(&V_ipq[i]);
 			}
 			IPQ_UNLOCK(i);
 		}
 	}
 }
 
 /*
  * nmbclusters_change event handler: re-derive the global fragment limit
  * and, for every vnet, the ipq zone limit and per-bucket limit, then
  * trim the queues to the new limits.
  */
 static void
 ipreass_zone_change(void *tag)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	int max;
 
 	maxfrags = IP_MAXFRAGS;
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		/*
 		 * IP_MAXFRAGPACKETS reads per-vnet state, so evaluate it
 		 * with the vnet selected, and start from the computed
 		 * default for each vnet rather than from the value the
 		 * previous vnet's zone reported back.
 		 */
 		max = IP_MAXFRAGPACKETS;
 		max = uma_zone_set_max(V_ipq_zone, max);
 		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
 		ipreass_drain_tomax();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Handler for net.inet.ip.maxfragpackets.  A dedicated handler is used
  * instead of sysctl_handle_uma_zone_max() because two values are special:
  *    0  disables reassembly and drains the queued fragments,
  *   -1  removes the limit on the ipq zone.
  * Positive values become the new zone limit; anything below -1 is
  * rejected with EINVAL.
  */
 static int
 sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
 {
 	int error, max;
 
 	/* Report 0 while reassembly is off and -1 for an unlimited zone. */
 	if (V_noreass != 0)
 		max = 0;
 	else {
 		max = uma_zone_get_max(V_ipq_zone);
 		if (max == 0)
 			max = -1;
 	}
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (max < -1)
 		return (EINVAL);
 
 	if (max == -1) {
 		V_noreass = 0;
 		uma_zone_set_max(V_ipq_zone, 0);
 		V_ipreass_maxbucketsize = INT_MAX;
 	} else if (max == 0) {
 		V_noreass = 1;
 		/*
 		 * NOTE(review): ipreass_drain() walks every vnet although
 		 * this sysctl is per-vnet -- confirm that is intended.
 		 */
 		ipreass_drain();
 	} else {
 		/*
 		 * XXXRW: no upper bound is enforced on the requested
 		 * value; a sanity check might be a good idea.
 		 */
 		max = uma_zone_set_max(V_ipq_zone, max);
 		V_ipreass_maxbucketsize = imax(max / (V_ipq_hashsize / 2), 1);
 		ipreass_drain_tomax();
 		V_noreass = 0;
 	}
 	return (0);
 }
 
 /*
  * Seek for old fragment queue header that can be reused.  Try to
  * reuse a header from currently locked hash bucket.
  */
 static struct ipq *
 ipq_reuse(int start)
 {
 	struct ipq *fp;
 	int bucket, i;
 
 	IPQ_LOCK_ASSERT(start);
 
 	for (i = 0; i < V_ipq_hashsize; i++) {
 		bucket = (start + i) % V_ipq_hashsize;
 		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
 			continue;
 		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
 		if (fp) {
 			struct mbuf *m;
 
 			IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
 			atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 			while (fp->ipq_frags) {
 				m = fp->ipq_frags;
 				fp->ipq_frags = m->m_nextpkt;
 				m_freem(m);
 			}
 			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
 			V_ipq[bucket].count--;
 			ipreass_reschedule(&V_ipq[bucket]);
 			if (bucket != start)
 				IPQ_UNLOCK(bucket);
 			break;
 		}
 		if (bucket != start)
 			IPQ_UNLOCK(bucket);
 	}
 	IPQ_LOCK_ASSERT(start);
 	return (fp);
 }
 
 /*
  * Free a fragment reassembly header and all associated datagrams.
  */
 static void
 ipq_free(struct ipqbucket *bucket, struct ipq *fp)
 {
 	struct mbuf *q;
 
 	atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 	while (fp->ipq_frags) {
 		q = fp->ipq_frags;
 		fp->ipq_frags = q->m_nextpkt;
 		m_freem(q);
 	}
 	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
 	bucket->count--;
 	uma_zfree(V_ipq_zone, fp);
 }
 
 /*
  * Get or set the maximum number of reassembly queues per bucket.
  */
 static int
 sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
 {
 	int error, max;
 
 	max = V_ipreass_maxbucketsize;
 	error = sysctl_handle_int(oidp, &max, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	if (max <= 0)
 		return (EINVAL);
 	V_ipreass_maxbucketsize = max;
 	ipreass_drain_tomax();
 	return (0);
 }
 
 /*
  * Get or set the IP fragment time to live.
  */
 static int
 sysctl_fragttl(SYSCTL_HANDLER_ARGS)
 {
 	u_int ttl;
 	int error;
 
 	ttl = V_ipfragttl;
 	error = sysctl_handle_int(oidp, &ttl, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	if (ttl < 1 || ttl > MAXTTL)
 		return (EINVAL);
 
 	atomic_store_int(&V_ipfragttl, ttl);
 	return (0);
 }
diff --git a/sys/netinet/netdump/netdump_client.c b/sys/netinet/netdump/netdump_client.c
index 95795c73f8d6..c2f10ba52253 100644
--- a/sys/netinet/netdump/netdump_client.c
+++ b/sys/netinet/netdump/netdump_client.c
@@ -1,756 +1,757 @@
 /*-
  * Copyright (c) 2005-2014 Sandvine Incorporated. All rights reserved.
  * Copyright (c) 2000 Darrell Anderson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * netdump_client.c
  * FreeBSD subsystem supporting netdump network dumps.
  * A dedicated server must be running to accept client dumps.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
 #include <sys/disk.h>
 #include <sys/endian.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/kerneldump.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_lex.h>
 #endif
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/debugnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/netdump/netdump.h>
 
 #include <machine/in_cksum.h>
 #include <machine/pcb.h>
 
 /*
  * Verbose debug print: emits the message, prefixed with the calling
  * function's name, only when the nd_debug knob is greater than 1.
  */
 #define	NETDDEBUGV(f, ...) do {						\
 	if (nd_debug > 1)						\
 		printf(("%s: " f), __func__, ## __VA_ARGS__);		\
 } while (0)
 
 static void	 netdump_cleanup(void);
 static int	 netdump_configure(struct diocskerneldump_arg *,
 		    struct thread *);
 static int	 netdump_dumper(void *priv __unused, void *virtual,
 		    off_t offset, size_t length);
 static bool	 netdump_enabled(void);
 static int	 netdump_enabled_sysctl(SYSCTL_HANDLER_ARGS);
 static int	 netdump_ioctl(struct cdev *dev __unused, u_long cmd,
 		    caddr_t addr, int flags __unused, struct thread *td);
 static int	 netdump_modevent(module_t mod, int type, void *priv);
 static int	 netdump_start(struct dumperinfo *di, void *key,
 		    uint32_t keysize);
 static void	 netdump_unconfigure(void);
 
 /* Must be at least as big as the chunks dumpsys() gives us. */
 static unsigned char nd_buf[MAXDUMPPGS * PAGE_SIZE];
 static int dump_failed;
 
 /*
  * Configuration parameters.
  *
  * The ndc_* members are copied from a struct diocskerneldump_arg by
  * netdump_configure(); the nd_* members are state used while a dump is
  * in progress.  The whole structure is zeroed by netdump_unconfigure().
  */
 static struct {
 	char		 ndc_iface[IFNAMSIZ];
 	union kd_ip	 ndc_server;
 	union kd_ip	 ndc_client;
 	union kd_ip	 ndc_gateway;
 	uint8_t		 ndc_af;
 	/* Runtime State */
 	/* Connection set by netdump_start(), freed by netdump_cleanup(). */
 	struct debugnet_pcb *nd_pcb;
 	/* Core-file offset sent along with the data buffered in nd_buf. */
 	off_t		 nd_tx_off;
 	/* Number of bytes currently buffered in nd_buf. */
 	size_t		 nd_buf_len;
 } nd_conf;
 /* Shorthands for the IPv4 member of each configured address. */
 #define	nd_server	nd_conf.ndc_server.in4
 #define	nd_client	nd_conf.ndc_client.in4
 #define	nd_gateway	nd_conf.ndc_gateway.in4
 
 /* General dynamic settings. */
 static struct sx nd_conf_lk;
 SX_SYSINIT(nd_conf, &nd_conf_lk, "netdump configuration lock");
 #define NETDUMP_WLOCK()			sx_xlock(&nd_conf_lk)
 #define NETDUMP_WUNLOCK()		sx_xunlock(&nd_conf_lk)
 #define NETDUMP_RLOCK()			sx_slock(&nd_conf_lk)
 #define NETDUMP_RUNLOCK()		sx_sunlock(&nd_conf_lk)
 #define NETDUMP_ASSERT_WLOCKED()	sx_assert(&nd_conf_lk, SA_XLOCKED)
 #define NETDUMP_ASSERT_LOCKED()		sx_assert(&nd_conf_lk, SA_LOCKED)
 static struct ifnet *nd_ifp;
 static eventhandler_tag nd_detach_cookie;
 
 FEATURE(netdump, "Netdump client support");
 
 static SYSCTL_NODE(_net, OID_AUTO, netdump, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "netdump parameters");
 
 static int nd_debug;
 SYSCTL_INT(_net_netdump, OID_AUTO, debug, CTLFLAG_RWTUN,
     &nd_debug, 0,
     "Debug message verbosity");
 SYSCTL_PROC(_net_netdump, OID_AUTO, enabled,
     CTLFLAG_RD | CTLTYPE_INT | CTLFLAG_MPSAFE, NULL, 0,
     netdump_enabled_sysctl, "I",
     "netdump configuration status");
 static char nd_path[MAXPATHLEN];
 SYSCTL_STRING(_net_netdump, OID_AUTO, path, CTLFLAG_RW,
     nd_path, sizeof(nd_path),
     "Server path for output files");
 /*
  * The following three variables were moved to debugnet(4), but these knobs
  * were retained as aliases.
  */
 SYSCTL_INT(_net_netdump, OID_AUTO, polls, CTLFLAG_RWTUN,
     &debugnet_npolls, 0,
     "Number of times to poll before assuming packet loss (0.5ms per poll)");
 SYSCTL_INT(_net_netdump, OID_AUTO, retries, CTLFLAG_RWTUN,
     &debugnet_nretries, 0,
     "Number of retransmit attempts before giving up");
 SYSCTL_INT(_net_netdump, OID_AUTO, arp_retries, CTLFLAG_RWTUN,
     &debugnet_arp_nretries, 0,
     "Number of ARP attempts before giving up");
 
 static bool nd_is_enabled;
 /*
  * Report whether netdump is currently configured.  nd_conf_lk must be
  * held by the caller.
  */
 static bool
 netdump_enabled(void)
 {
 	bool configured;
 
 	NETDUMP_ASSERT_LOCKED();
 	configured = nd_is_enabled;
 	return (configured);
 }
 
 /*
  * Record whether netdump is configured.  nd_conf_lk must be held by the
  * caller.
  */
 static void
 netdump_set_enabled(bool status)
 {
 	NETDUMP_ASSERT_LOCKED();
 
 	nd_is_enabled = status;
 }
 
 /*
  * Sysctl handler for net.netdump.enabled: exports the configuration
  * status as an int and rejects any attempt to write it with EPERM.
  */
 static int
 netdump_enabled_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	int rc, state;
 
 	/* Sample the flag under the shared configuration lock. */
 	NETDUMP_RLOCK();
 	state = netdump_enabled();
 	NETDUMP_RUNLOCK();
 
 	rc = SYSCTL_OUT(req, &state, sizeof(state));
 	if (rc != 0)
 		return (rc);
 
 	/* A successful read returns 0; a write request is refused. */
 	return (req->newptr != NULL ? EPERM : 0);
 }
 
 /*-
  * Dumping specific primitives.
  */
 
 /*
  * Flush any buffered vmcore data.
  */
 static int
 netdump_flush_buf(void)
 {
 	int error;
 
 	error = 0;
 	if (nd_conf.nd_buf_len != 0) {
 		struct debugnet_proto_aux auxdata = {
 			.dp_offset_start = nd_conf.nd_tx_off,
 		};
 		error = debugnet_send(nd_conf.nd_pcb, DEBUGNET_DATA, nd_buf,
 		    nd_conf.nd_buf_len, &auxdata);
 		if (error == 0)
 			nd_conf.nd_buf_len = 0;
 	}
 	return (error);
 }
 
 /*
  * Callback from dumpsys() to dump a chunk of memory.
  * Appends the chunk to the static buffer nd_buf, first flushing the
  * buffer to the network when the chunk would not fit or is not contiguous
  * with the data already buffered.
  *
  * A NULL virtual address marks the end of the dump: the buffer is
  * flushed, the outcome is printed, a DEBUGNET_FINISHED message is sent
  * when nothing failed, and the connection is released.
  *
  * Parameters:
  *	priv	 Unused. Optional private pointer.
  *	virtual  Virtual address (where to read the data from)
  *	offset	 Offset from start of core file
  *	length	 Data length
  *
  * Return value:
  *	0 on success (always 0 for the end-of-dump call)
  *	ENOSPC if the chunk is larger than nd_buf
  *	the flush error if buffered data could not be sent
  */
 static int
 netdump_dumper(void *priv __unused, void *virtual, off_t offset, size_t length)
 {
 	int error;
 
 	NETDDEBUGV("netdump_dumper(NULL, %p, NULL, %ju, %zu)\n",
 	    virtual, (uintmax_t)offset, length);
 
 	/* End of dump: push out the tail and close the transaction. */
 	if (virtual == NULL) {
 		error = netdump_flush_buf();
 		if (error != 0)
 			dump_failed = 1;
 
 		if (dump_failed != 0)
 			printf("failed to dump the kernel core\n");
 		else if (
 		    debugnet_sendempty(nd_conf.nd_pcb, DEBUGNET_FINISHED) != 0)
 			printf("failed to close the transaction\n");
 		else
 			printf("\nnetdump finished.\n");
 		netdump_cleanup();
 		return (0);
 	}
 	/* A single chunk can never exceed the staging buffer. */
 	if (length > sizeof(nd_buf)) {
 		netdump_cleanup();
 		return (ENOSPC);
 	}
 
 	/*
 	 * Flush first if the new chunk would overflow the buffer, or if
 	 * data is buffered and the chunk does not start where that data
 	 * ends (nd_tx_off + nd_buf_len).
 	 */
 	if (nd_conf.nd_buf_len + length > sizeof(nd_buf) ||
 	    (nd_conf.nd_buf_len != 0 && nd_conf.nd_tx_off +
 	    nd_conf.nd_buf_len != offset)) {
 		error = netdump_flush_buf();
 		if (error != 0) {
 			dump_failed = 1;
 			netdump_cleanup();
 			return (error);
 		}
 		/* The now-empty buffer restarts at this chunk's offset. */
 		nd_conf.nd_tx_off = offset;
 	}
 
 	memmove(nd_buf + nd_conf.nd_buf_len, virtual, length);
 	nd_conf.nd_buf_len += length;
 
 	return (0);
 }
 
 /*
  * Perform any initialization needed prior to transmitting the kernel core.
  *
  * Refuses to run unless netdump is configured, the kernel has panicked
  * and a server address is set.  Opens a debugnet connection to the
  * configured server (sending nd_path as herald data when it is non-empty),
  * stores the connection in nd_conf.nd_pcb, and sends the encryption key
  * first when one is supplied.
  *
  * Parameters:
  *	di	 Dumper state; its dumpoff is reset to 0.
  *	key	 Encryption key bytes, used only when keysize > 0.
  *	keysize	 Length of key in bytes; must not exceed sizeof(nd_buf).
  *
  * Return value:
  *	0 on success, EINVAL on any failure (debugnet errors are squashed).
  */
 static int
 netdump_start(struct dumperinfo *di, void *key, uint32_t keysize)
 {
 	struct debugnet_conn_params dcp;
 	struct debugnet_pcb *pcb;
 	char buf[INET_ADDRSTRLEN];
 	int error;
 
 	error = 0;
 
 	/* Check if the dumping is allowed to continue. */
 	if (!netdump_enabled())
 		return (EINVAL);
 
 	if (!KERNEL_PANICKED()) {
 		printf(
 		    "netdump_start: netdump may only be used after a panic\n");
 		return (EINVAL);
 	}
 
 	memset(&dcp, 0, sizeof(dcp));
 
 	if (nd_server.s_addr == INADDR_ANY) {
 		printf("netdump_start: can't netdump; no server IP given\n");
 		return (EINVAL);
 	}
 
 	/* We start dumping at offset 0. */
 	di->dumpoff = 0;
 
 	dcp.dc_ifp = nd_ifp;
 
 	dcp.dc_client = nd_client.s_addr;
 	dcp.dc_server = nd_server.s_addr;
 	dcp.dc_gateway = nd_gateway.s_addr;
 
 	dcp.dc_herald_port = NETDUMP_PORT;
 	dcp.dc_client_port = NETDUMP_ACKPORT;
 
 	/* The herald carries the NUL-terminated path, or nothing if unset. */
 	dcp.dc_herald_data = nd_path;
 	dcp.dc_herald_datalen = (nd_path[0] == 0) ? 0 : strlen(nd_path) + 1;
 
 	error = debugnet_connect(&dcp, &pcb);
 	if (error != 0) {
 		printf("failed to contact netdump server\n");
 		/* Squash debugnet to something the dumper code understands. */
 		return (EINVAL);
 	}
 
 	/*
 	 * NOTE(review): "%6D" with ":" is presumably the kernel printf
 	 * extension that prints the 6-byte gateway MAC as colon-separated
 	 * hex -- confirm against printf(9).
 	 */
 	printf("netdumping to %s (%6D)\n", inet_ntoa_r(nd_server, buf),
 	    debugnet_get_gw_mac(pcb), ":");
 	nd_conf.nd_pcb = pcb;
 
 	/* Send the key before the dump so a partial dump is still usable. */
 	if (keysize > 0) {
 		/* The key is staged through nd_buf, so it must fit there. */
 		if (keysize > sizeof(nd_buf)) {
 			printf("crypto key is too large (%u)\n", keysize);
 			error = EINVAL;
 			goto out;
 		}
 		memcpy(nd_buf, key, keysize);
 		error = debugnet_send(pcb, NETDUMP_EKCD_KEY, nd_buf, keysize,
 		    NULL);
 		if (error != 0) {
 			printf("error %d sending crypto key\n", error);
 			goto out;
 		}
 	}
 
 out:
 	if (error != 0) {
 		/* As above, squash errors. */
 		error = EINVAL;
 		/* Release the connection stored in nd_conf.nd_pcb above. */
 		netdump_cleanup();
 	}
 	return (error);
 }
 
 /*
  * Send the kernel dump header to the server as a NETDUMP_KDH message,
  * after flushing any vmcore data that is still buffered.  On any failure
  * the connection is torn down and the error is returned.
  */
 static int
 netdump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh)
 {
 	int rc;
 
 	rc = netdump_flush_buf();
 	if (rc == 0) {
 		/* Stage the header in nd_buf and ship it. */
 		memcpy(nd_buf, kdh, sizeof(*kdh));
 		rc = debugnet_send(nd_conf.nd_pcb, NETDUMP_KDH, nd_buf,
 		    sizeof(*kdh), NULL);
 	}
 
 	if (rc != 0)
 		netdump_cleanup();
 	return (rc);
 }
 
 /*
  * Release the debugnet connection of a finished or failed netdump, if
  * one is open.  Safe to call more than once.
  */
 static void
 netdump_cleanup(void)
 {
 	struct debugnet_pcb *pcb;
 
 	pcb = nd_conf.nd_pcb;
 	if (pcb == NULL)
 		return;
 
 	debugnet_free(pcb);
 	nd_conf.nd_pcb = NULL;
 }
 
 /*-
  * KLD specific code.
  */
 
 static struct cdevsw netdump_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	netdump_ioctl,
 	.d_name =	"netdump",
 };
 
 static struct cdev *netdump_cdev;
 
 static void
 netdump_unconfigure(void)
 {
 	struct diocskerneldump_arg kda;
 
 	NETDUMP_ASSERT_WLOCKED();
 	KASSERT(netdump_enabled(), ("%s: not enabled", __func__));
 
 	bzero(&kda, sizeof(kda));
 	kda.kda_index = KDA_REMOVE_DEV;
 	(void)dumper_remove(nd_conf.ndc_iface, &kda);
 
 	if (nd_ifp != NULL)
 		if_rele(nd_ifp);
 	nd_ifp = NULL;
 	netdump_set_enabled(false);
 
 	log(LOG_WARNING, "netdump: Lost configured interface %s\n",
 	    nd_conf.ndc_iface);
 
 	bzero(&nd_conf, sizeof(nd_conf));
 }
 
 /*
  * ifnet departure handler: if the interface going away is the one netdump
  * is configured on, drop the netdump configuration.
  */
 static void
 netdump_ifdetach(void *arg __unused, struct ifnet *ifp)
 {
 	bool ours;
 
 	NETDUMP_WLOCK();
 	ours = (nd_ifp == ifp);
 	if (ours)
 		netdump_unconfigure();
 	NETDUMP_WUNLOCK();
 }
 
 /*
  * Install a new netdump configuration.
  *
  * When conf names an interface, a reference to it is looked up in vnet0
  * and it must be a NIC that debugnet supports; otherwise no interface is
  * recorded.  Any previously referenced interface is released, netdump is
  * marked enabled, and the interface name, addresses and address family
  * are copied into nd_conf.  The write lock must be held.
  *
  * td of NULL is a sentinel value that indicates a kernel caller (ddb(4) or
  * modload-based tunable parameters).
  *
  * Returns 0 on success, EINVAL when a userland caller is not in the
  * default vnet, or ENODEV when the named interface does not exist or is
  * not supported.
  */
 static int
 netdump_configure(struct diocskerneldump_arg *conf, struct thread *td)
 {
 	struct ifnet *ifp;
 
 	NETDUMP_ASSERT_WLOCKED();
 
 	if (conf->kda_iface[0] != 0) {
 		if (td != NULL && !IS_DEFAULT_VNET(TD_TO_VNET(td)))
 			return (EINVAL);
 		CURVNET_SET(vnet0);
 		ifp = ifunit_ref(conf->kda_iface);
 		CURVNET_RESTORE();
 		/*
 		 * The lookup yields NULL for an unknown interface name;
 		 * bail out before the pointer is dereferenced or released.
 		 */
 		if (ifp == NULL)
 			return (ENODEV);
 		if (!DEBUGNET_SUPPORTED_NIC(ifp)) {
 			if_rele(ifp);
 			return (ENODEV);
 		}
 	} else
 		ifp = NULL;
 
 	/* Swap in the new interface reference, dropping the old one. */
 	if (nd_ifp != NULL)
 		if_rele(nd_ifp);
 	nd_ifp = ifp;
 	netdump_set_enabled(true);
 
 	/* Copy one field, insisting at compile time that the sizes agree. */
 #define COPY_SIZED(elm) do {	\
 	_Static_assert(sizeof(nd_conf.ndc_ ## elm) ==			\
 	    sizeof(conf->kda_ ## elm), "elm " __XSTRING(elm) " mismatch"); \
 	memcpy(&nd_conf.ndc_ ## elm, &conf->kda_ ## elm,		\
 	    sizeof(nd_conf.ndc_ ## elm));				\
 } while (0)
 	COPY_SIZED(iface);
 	COPY_SIZED(server);
 	COPY_SIZED(client);
 	COPY_SIZED(gateway);
 	COPY_SIZED(af);
 #undef COPY_SIZED
 
 	return (0);
 }
 
 /*
  * ioctl(2) handler for the netdump device. This is currently only used to
  * register netdump as a dump device.
  *
  * DIOCGKERNELDUMP copies the current configuration out to the caller
  * (ENXIO if netdump is not configured).  DIOCSKERNELDUMP either removes
  * the configuration (KDA_REMOVE* indices) or installs a new one and
  * registers the netdump dumper.  Any other command yields ENOTTY.  The
  * whole handler runs under the netdump write lock.
  *
  * Parameters:
  *     dev, Unused.
  *     cmd, The ioctl to be handled.
  *     addr, The parameter for the ioctl.
  *     flags, Unused.
  *     td, The thread invoking this ioctl.
  *
  * Returns:
  *     0 on success, and an errno value on failure.
  */
 static int
 netdump_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t addr,
     int flags __unused, struct thread *td)
 {
 	struct diocskerneldump_arg *conf;
 	struct dumperinfo dumper;
 	uint8_t *encryptedkey;
 	int error;
 
 	/*
 	 * conf doubles as a "wipe on exit" marker: whatever it points to
 	 * when the switch is left is zeroed with explicit_bzero() below.
 	 */
 	conf = NULL;
 	error = 0;
 	NETDUMP_WLOCK();
 
 	switch (cmd) {
 	case DIOCGKERNELDUMP:
 		conf = (void *)addr;
 		/*
 		 * For now, index is ignored; netdump doesn't support multiple
 		 * configurations (yet).
 		 */
 		if (!netdump_enabled()) {
 			error = ENXIO;
 			conf = NULL;
 			break;
 		}
 
 		if (nd_ifp != NULL)
 			strlcpy(conf->kda_iface, nd_ifp->if_xname,
 			    sizeof(conf->kda_iface));
 		memcpy(&conf->kda_server, &nd_server, sizeof(nd_server));
 		memcpy(&conf->kda_client, &nd_client, sizeof(nd_client));
 		memcpy(&conf->kda_gateway, &nd_gateway, sizeof(nd_gateway));
 		conf->kda_af = nd_conf.ndc_af;
 		/* Clear conf so the results just written are not wiped. */
 		conf = NULL;
 		break;
 	case DIOCSKERNELDUMP:
 		encryptedkey = NULL;
 		conf = (void *)addr;
 
 		/* Netdump only supports IP4 at this time. */
 		if (conf->kda_af != AF_INET) {
 			error = EPROTONOSUPPORT;
 			break;
 		}
 
 		/* Force NUL termination of the caller-supplied name. */
 		conf->kda_iface[sizeof(conf->kda_iface) - 1] = '\0';
 		if (conf->kda_index == KDA_REMOVE ||
 		    conf->kda_index == KDA_REMOVE_DEV ||
 		    conf->kda_index == KDA_REMOVE_ALL) {
 			if (netdump_enabled())
 				netdump_unconfigure();
 			if (conf->kda_index == KDA_REMOVE_ALL)
 				error = dumper_remove(NULL, conf);
 			break;
 		}
 
 		error = netdump_configure(conf, td);
 		if (error != 0)
 			break;
 
 		/*
 		 * NOTE(review): the two error breaks in the key handling
 		 * below leave the configuration installed by
 		 * netdump_configure() in place (no netdump_unconfigure()
 		 * call), unlike the dumper_insert() failure path -- confirm
 		 * this is intended.
 		 */
 		if (conf->kda_encryption != KERNELDUMP_ENC_NONE) {
 			if (conf->kda_encryptedkeysize <= 0 ||
 			    conf->kda_encryptedkeysize >
 			    KERNELDUMP_ENCKEY_MAX_SIZE) {
 				error = EINVAL;
 				break;
 			}
 			/* Bring the key into a temporary kernel buffer. */
 			encryptedkey = malloc(conf->kda_encryptedkeysize,
 			    M_TEMP, M_WAITOK);
 			error = copyin(conf->kda_encryptedkey, encryptedkey,
 			    conf->kda_encryptedkeysize);
 			if (error != 0) {
 				free(encryptedkey, M_TEMP);
 				break;
 			}
 
 			/* From here on conf refers to the kernel copy. */
 			conf->kda_encryptedkey = encryptedkey;
 		}
 
 		memset(&dumper, 0, sizeof(dumper));
 		dumper.dumper_start = netdump_start;
 		dumper.dumper_hdr = netdump_write_headers;
 		dumper.dumper = netdump_dumper;
 		dumper.priv = NULL;
 		dumper.blocksize = NETDUMP_DATASIZE;
 		dumper.maxiosize = MAXDUMPPGS * PAGE_SIZE;
 		dumper.mediaoffset = 0;
 		dumper.mediasize = 0;
 
 		error = dumper_insert(&dumper, conf->kda_iface, conf);
 		/*
 		 * The temporary key copy is released whether or not the
 		 * insert succeeded (presumably zfree() also zeroes it --
 		 * confirm).
 		 */
 		zfree(encryptedkey, M_TEMP);
 		if (error != 0)
 			netdump_unconfigure();
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 	/* Scrub the argument buffer on every path that left conf set. */
 	if (conf != NULL)
 		explicit_bzero(conf, sizeof(*conf));
 	NETDUMP_WUNLOCK();
 	return (error);
 }
 
 /*
  * Called upon system init or kld load/unload.
  *
  * On load: creates the "netdump" control device, registers for interface
  * departure events and, when the net.dump.iface tunable is set, builds an
  * IPv4 configuration from the net.dump.{iface,server,client,gateway}
  * tunables and installs it.  Tunables that are not set leave the
  * corresponding field zeroed.
  *
  * On unload: drops any active configuration, destroys the control device
  * and deregisters the departure handler.
  *
  * Parameters:
  *	mod, Unused.
  *	what, The module event type.
  *	priv, Unused.
  *
  * Returns:
  *	int, An errno value if an error occurred, 0 otherwise
  *	(EOPNOTSUPP for unhandled event types).
  */
 static int
 netdump_modevent(module_t mod __unused, int what, void *priv __unused)
 {
 	struct diocskerneldump_arg conf;
 	char *arg;
 	int error;
 
 	error = 0;
 	switch (what) {
 	case MOD_LOAD:
 		error = make_dev_p(MAKEDEV_WAITOK, &netdump_cdev,
 		    &netdump_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "netdump");
 		if (error != 0)
 			return (error);
 
 		nd_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event,
 		    netdump_ifdetach, NULL, EVENTHANDLER_PRI_ANY);
 
 		if ((arg = kern_getenv("net.dump.iface")) != NULL) {
 			/*
 			 * Start from an all-zero configuration: conf lives on
 			 * the stack and netdump_configure() copies the whole
 			 * address unions, so any field not filled in below
 			 * would otherwise carry stack garbage into nd_conf.
 			 */
 			memset(&conf, 0, sizeof(conf));
 
 			strlcpy(conf.kda_iface, arg, sizeof(conf.kda_iface));
 			freeenv(arg);
 
 			if ((arg = kern_getenv("net.dump.server")) != NULL) {
 				inet_aton(arg, &conf.kda_server.in4);
 				freeenv(arg);
 			}
 			if ((arg = kern_getenv("net.dump.client")) != NULL) {
 				inet_aton(arg, &conf.kda_client.in4);
 				freeenv(arg);
 			}
 			if ((arg = kern_getenv("net.dump.gateway")) != NULL) {
 				inet_aton(arg, &conf.kda_gateway.in4);
 				freeenv(arg);
 			}
 			conf.kda_af = AF_INET;
 
 			/* Errors are ignored; a bad tunable must not fail the load. */
 			NETDUMP_WLOCK();
 			(void)netdump_configure(&conf, NULL);
 			NETDUMP_WUNLOCK();
 		}
 		break;
 	case MOD_UNLOAD:
 		NETDUMP_WLOCK();
 		if (netdump_enabled()) {
 			printf("netdump: disabling dump device for unload\n");
 			netdump_unconfigure();
 		}
 		NETDUMP_WUNLOCK();
 		destroy_dev(netdump_cdev);
 		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 		    nd_detach_cookie);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t netdump_mod = {
 	"netdump",
 	netdump_modevent,
 	NULL,
 };
 
 MODULE_VERSION(netdump, 1);
 DECLARE_MODULE(netdump, netdump_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 
 #ifdef DDB
 /*
  * Usage: netdump -s <server> [-g <gateway>] -c <localip> -i <interface>
  *
  * Order is not significant.
  *
  * Currently, this command does not support configuring encryption or
  * compression.
  *
  * ddb(4) command: parses the arguments with debugnet_parse_ddb_cmd(),
  * translates them into a netdump configuration, installs it, registers a
  * statically allocated dumper and triggers a dump with doadump().  All
  * large objects are static, so nothing is allocated here.
  */
 DB_COMMAND_FLAGS(netdump, db_netdump_cmd, CS_OWN)
 {
 	static struct diocskerneldump_arg conf;
 	static char blockbuf[NETDUMP_DATASIZE];
 	static union {
 		struct dumperinfo di;
 		/* For valid di_devname. */
 		char di_buf[sizeof(struct dumperinfo) + 1];
 	} u;
 
 	struct debugnet_ddb_config params;
 	int error;
 
 	error = debugnet_parse_ddb_cmd("netdump", &params);
 	if (error != 0) {
 		db_printf("Error configuring netdump: %d\n", error);
 		return;
 	}
 
 	/* Translate to a netdump dumper config. */
 	memset(&conf, 0, sizeof(conf));
 
 	if (params.dd_ifp != NULL)
 		strlcpy(conf.kda_iface, if_name(params.dd_ifp),
 		    sizeof(conf.kda_iface));
 
 	/* Optional addresses that were not given default to INADDR_ANY. */
 	conf.kda_af = AF_INET;
 	conf.kda_server.in4 = (struct in_addr) { params.dd_server };
 	if (params.dd_has_client)
 		conf.kda_client.in4 = (struct in_addr) { params.dd_client };
 	else
 		conf.kda_client.in4 = (struct in_addr) { INADDR_ANY };
 	if (params.dd_has_gateway)
 		conf.kda_gateway.in4 = (struct in_addr) { params.dd_gateway };
 	else
 		conf.kda_gateway.in4 = (struct in_addr) { INADDR_ANY };
 
 	/*
 	 * Set the global netdump config to these options.
 	 * NOTE(review): netdump_configure() asserts the write lock, which
 	 * is not taken here -- presumably the assertion is inert in the
 	 * debugger context; confirm.
 	 */
 	error = netdump_configure(&conf, NULL);
 	if (error != 0) {
 		db_printf("Error enabling netdump: %d\n", error);
 		return;
 	}
 
 	/* Fake the generic dump configuration list entry to avoid malloc. */
 	memset(&u.di_buf, 0, sizeof(u.di_buf));
 	u.di.dumper_start = netdump_start;
 	u.di.dumper_hdr = netdump_write_headers;
 	u.di.dumper = netdump_dumper;
 	u.di.priv = NULL;
 	u.di.blocksize = NETDUMP_DATASIZE;
 	u.di.maxiosize = MAXDUMPPGS * PAGE_SIZE;
 	u.di.mediaoffset = 0;
 	u.di.mediasize = 0;
 	u.di.blockbuf = blockbuf;
 
 	/* The fake dumper is registered only for the duration of the dump. */
 	dumper_ddb_insert(&u.di);
 
 	error = doadump(false);
 
 	dumper_ddb_remove(&u.di);
 	if (error != 0)
 		db_printf("Cannot dump: %d\n", error);
 }
 #endif /* DDB */
diff --git a/sys/netinet/sctp_os_bsd.h b/sys/netinet/sctp_os_bsd.h
index d9b5c140f14c..80c187b3c5ef 100644
--- a/sys/netinet/sctp_os_bsd.h
+++ b/sys/netinet/sctp_os_bsd.h
@@ -1,488 +1,489 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2006-2007, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *   this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *   the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_OS_BSD_H_
 #define _NETINET_SCTP_OS_BSD_H_
 /*
  * includes
  */
 #include "opt_inet6.h"
 #include "opt_inet.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/ktr.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/jail.h>
 #include <sys/sysctl.h>
 #include <sys/resourcevar.h>
 #include <sys/uio.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/kthread.h>
 #include <sys/priv.h>
 #include <sys/random.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <machine/cpu.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 #endif				/* INET6 */
 
 #include <netinet/ip_options.h>
 
 #include <crypto/sha1.h>
 #include <crypto/sha2/sha256.h>
 
 /* Declare all the malloc names for all the various mallocs */
 MALLOC_DECLARE(SCTP_M_MAP);
 MALLOC_DECLARE(SCTP_M_STRMI);
 MALLOC_DECLARE(SCTP_M_STRMO);
 MALLOC_DECLARE(SCTP_M_ASC_ADDR);
 MALLOC_DECLARE(SCTP_M_ASC_IT);
 MALLOC_DECLARE(SCTP_M_AUTH_CL);
 MALLOC_DECLARE(SCTP_M_AUTH_KY);
 MALLOC_DECLARE(SCTP_M_AUTH_HL);
 MALLOC_DECLARE(SCTP_M_AUTH_IF);
 MALLOC_DECLARE(SCTP_M_STRESET);
 MALLOC_DECLARE(SCTP_M_CMSG);
 MALLOC_DECLARE(SCTP_M_COPYAL);
 MALLOC_DECLARE(SCTP_M_VRF);
 MALLOC_DECLARE(SCTP_M_IFA);
 MALLOC_DECLARE(SCTP_M_IFN);
 MALLOC_DECLARE(SCTP_M_TIMW);
 MALLOC_DECLARE(SCTP_M_MVRF);
 MALLOC_DECLARE(SCTP_M_ITER);
 MALLOC_DECLARE(SCTP_M_SOCKOPT);
 MALLOC_DECLARE(SCTP_M_MCORE);
 
 #if defined(SCTP_LOCAL_TRACE_BUF)
 
 #define SCTP_GET_CYCLECOUNT get_cyclecount()
 #define SCTP_CTR6 sctp_log_trace
 
 #else
 #define SCTP_CTR6 CTR6
 #endif
 
 /*
  * Macros to expand out globals defined by various modules
  * to either a real global or a virtualized instance of one,
  * depending on whether VIMAGE is defined.
  */
 /* then define the macro(s) that hook into the vimage macros */
 #define MODULE_GLOBAL(__SYMBOL) V_##__SYMBOL
 
 #define V_system_base_info VNET(system_base_info)
 #define SCTP_BASE_INFO(__m) V_system_base_info.sctppcbinfo.__m
 #define SCTP_BASE_STATS V_system_base_info.sctpstat
 #define SCTP_BASE_STAT(__m) V_system_base_info.sctpstat.__m
 #define SCTP_BASE_SYSCTL(__m) V_system_base_info.sctpsysctl.__m
 #define SCTP_BASE_VAR(__m) V_system_base_info.__m
 
 /* All SCTP console output funnels through the kernel printf(). */
 #define SCTP_PRINTF(params...)	printf(params)
 /*
  * Conditional debug output.  With SCTP_DEBUG defined, SCTPDBG() prints and
  * SCTPDBG_ADDR() prints an address only when one of the bits in 'level'
  * is set in the sctp_debug_on sysctl; otherwise both expand to nothing.
  *
  * Each enabled form expands to a brace-enclosed block, so the semicolon a
  * caller writes after the invocation is an extra empty statement; take
  * care when using them as the body of an unbraced if/else.
  */
 #if defined(SCTP_DEBUG)
 #define SCTPDBG(level, params...)					\
 {									\
 	do {								\
 		if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) {		\
 			SCTP_PRINTF(params);				\
 		}							\
 	} while (0);							\
 }
 #define SCTPDBG_ADDR(level, addr)					\
 {									\
 	do {								\
 		if (SCTP_BASE_SYSCTL(sctp_debug_on) & level ) {		\
 			sctp_print_address(addr);			\
 		}							\
 	} while (0);							\
 }
 #else
 #define SCTPDBG(level, params...)
 #define SCTPDBG_ADDR(level, addr)
 #endif
 
 #ifdef SCTP_LTRACE_CHUNKS
 #define SCTP_LTRACE_CHK(a, b, c, d) if(SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_CHUNK_ENABLE) SCTP_CTR6(KTR_SUBSYS, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_CHUNK_PROC, 0, a, b, c, d)
 #else
 #define SCTP_LTRACE_CHK(a, b, c, d)
 #endif
 
 #ifdef SCTP_LTRACE_ERRORS
 #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err) \
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
 		SCTP_PRINTF("mbuf:%p inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
 		            m, inp, stcb, net, file, __LINE__, err);
 #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err) \
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LTRACE_ERROR_ENABLE) \
 		SCTP_PRINTF("inp:%p stcb:%p net:%p file:%x line:%d error:%d\n", \
 		            inp, stcb, net, file, __LINE__, err);
 #else
 #define SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, file, err)
 #define SCTP_LTRACE_ERR_RET(inp, stcb, net, file, err)
 #endif
 
 /*
  * Local address and interface list handling
  */
 #define SCTP_MAX_VRF_ID		0
 #define SCTP_SIZE_OF_VRF_HASH	3
 #define SCTP_IFNAMSIZ		IFNAMSIZ
 #define SCTP_DEFAULT_VRFID	0
 #define SCTP_VRF_ADDR_HASH_SIZE	16
 #define SCTP_VRF_IFN_HASH_SIZE	3
 #define	SCTP_INIT_VRF_TABLEID(vrf)
 
 #define SCTP_IFN_IS_IFT_LOOP(ifn) ((ifn)->ifn_type == IFT_LOOP)
 #define SCTP_ROUTE_IS_REAL_LOOP(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifa && (ro)->ro_nh->nh_ifa->ifa_ifp && (ro)->ro_nh->nh_ifa->ifa_ifp->if_type == IFT_LOOP)
 
 /*
  * Access to IFN's to help with src-addr-selection
  */
 /* This could return VOID if the index works but for BSD we provide both. */
 #define SCTP_GET_IFN_VOID_FROM_ROUTE(ro) (void *)ro->ro_nh->nh_ifp
 #define SCTP_GET_IF_INDEX_FROM_ROUTE(ro) (ro)->ro_nh->nh_ifp->if_index
 #define SCTP_ROUTE_HAS_VALID_IFN(ro) ((ro)->ro_nh && (ro)->ro_nh->nh_ifp)
 
 /*
  * general memory allocation
  */
 #define SCTP_MALLOC(var, type, size, name) \
 	do { \
 		var = (type)malloc(size, name, M_NOWAIT); \
 	} while (0)
 
 #define SCTP_FREE(var, type)	free(var, type)
 
 #define SCTP_MALLOC_SONAME(var, type, size) \
 	do { \
 		var = (type)malloc(size, M_SONAME, M_WAITOK | M_ZERO); \
 	} while (0)
 
 #define SCTP_FREE_SONAME(var)	free(var, M_SONAME)
 
 #define SCTP_PROCESS_STRUCT struct proc *
 
 /*
  * zone allocation functions
  */
 #include <vm/uma.h>
 
 /*
  * SCTP_ZONE_INIT: initialize the zone.
  * Creates a UMA zone called 'name' for items of 'size' bytes, stores it
  * in 'zone', and caps the zone at 'number' items.  The result of
  * uma_zcreate() is not checked here.
  */
 typedef struct uma_zone *sctp_zone_t;
 #define SCTP_ZONE_INIT(zone, name, size, number) { \
 	zone = uma_zcreate(name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,\
 		0); \
 	uma_zone_set_max(zone, number); \
 }
 
 #define SCTP_ZONE_DESTROY(zone) uma_zdestroy(zone)
 
 /* SCTP_ZONE_GET: allocate element from the zone */
 #define SCTP_ZONE_GET(zone, type) \
 	(type *)uma_zalloc(zone, M_NOWAIT);
 
 /* SCTP_ZONE_FREE: free element from the zone */
 #define SCTP_ZONE_FREE(zone, element) \
 	uma_zfree(zone, element);
 
 #define SCTP_HASH_INIT(size, hashmark) hashinit_flags(size, M_PCB, hashmark, HASH_NOWAIT)
 #define SCTP_HASH_FREE(table, hashmark) hashdestroy(table, M_PCB, hashmark)
 
 #define SCTP_M_COPYM	m_copym
 
 /*
  * timers
  */
 #include <sys/callout.h>
 typedef struct callout sctp_os_timer_t;
 
 #define SCTP_OS_TIMER_INIT(tmr)	callout_init(tmr, 1)
 /*
  * NOTE: The next two shouldn't be called directly outside of sctp_timer_start()
  * and sctp_timer_stop(), since they don't handle incrementing/decrementing
  * relevant reference counts.
  */
 #define SCTP_OS_TIMER_START		callout_reset
 #define SCTP_OS_TIMER_STOP		callout_stop
 #define SCTP_OS_TIMER_STOP_DRAIN	callout_drain
 #define SCTP_OS_TIMER_PENDING		callout_pending
 #define SCTP_OS_TIMER_ACTIVE		callout_active
 #define SCTP_OS_TIMER_DEACTIVATE	callout_deactivate
 
 #define sctp_get_tick_count() (ticks)
 
 #define SCTP_UNUSED __attribute__((unused))
 
 /*
  * Functions
  */
 /* Mbuf manipulation and access macros  */
 #define SCTP_BUF_LEN(m) (m->m_len)
 #define SCTP_BUF_NEXT(m) (m->m_next)
 #define SCTP_BUF_NEXT_PKT(m) (m->m_nextpkt)
 #define SCTP_BUF_RESV_UF(m, size) m->m_data += size
 #define SCTP_BUF_AT(m, size) m->m_data + size
 #define SCTP_BUF_IS_EXTENDED(m) (m->m_flags & M_EXT)
 #define SCTP_BUF_SIZE M_SIZE
 #define SCTP_BUF_TYPE(m) (m->m_type)
 #define SCTP_BUF_RECVIF(m) (m->m_pkthdr.rcvif)
 #define SCTP_BUF_PREPEND	M_PREPEND
 
 #define SCTP_ALIGN_TO_END(m, len) M_ALIGN(m, len)
 
 #define SCTP_SNPRINTF(...) snprintf(__VA_ARGS__)
 
 /* We make it so if you have up to 4 threads
  * writing based on the default size of
  * the packet log 65 k, that would be
  * 4 16k packets before we would hit
  * a problem.
  */
 #define SCTP_PKTLOG_WRITERS_NEED_LOCK 3
 
 /*************************/
 /*      MTU              */
 /*************************/
 #define SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index) ((ifn != NULL) ? ((struct ifnet *)ifn)->if_mtu : 0)
 #define SCTP_GATHER_MTU_FROM_ROUTE(sctp_ifa, sa, nh) ((uint32_t)((nh != NULL) ? nh->nh_mtu : 0))
 
 /*************************/
 /* These are for logging */
 /*************************/
 /* return the base ext data pointer */
 #define SCTP_BUF_EXTEND_BASE(m) (m->m_ext.ext_buf)
  /* return the refcnt of the data pointer */
 #define SCTP_BUF_EXTEND_REFCNT(m) (*m->m_ext.ext_cnt)
 /* return any buffer related flags, this is
  * used beyond logging for apple only.
  */
 #define SCTP_BUF_GET_FLAGS(m) (m->m_flags)
 
 /* For BSD this just accesses the M_PKTHDR length
  * so it operates on an mbuf with hdr flag. Other
  * O/S's may have separate packet header and mbuf
  * chain pointers.. thus the macro.
  */
 #define SCTP_HEADER_TO_CHAIN(m) (m)
 #define SCTP_DETACH_HEADER_FROM_CHAIN(m)
 #define SCTP_HEADER_LEN(m) ((m)->m_pkthdr.len)
 #define SCTP_GET_HEADER_FOR_OUTPUT(o_pak) 0
 #define SCTP_RELEASE_HEADER(m)
 #define SCTP_RELEASE_PKT(m)	sctp_m_freem(m)
 #define SCTP_ENABLE_UDP_CSUM(m) do { \
 					m->m_pkthdr.csum_flags = CSUM_UDP; \
 					m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); \
 				} while (0)
 
 #define SCTP_GET_PKT_VRFID(m, vrf_id)  ((vrf_id = SCTP_DEFAULT_VRFID) != SCTP_DEFAULT_VRFID)
 
 /* Attach the chain of data into the sendable packet. */
 #define SCTP_ATTACH_CHAIN(pak, m, packet_length) do { \
                                                  pak = m; \
                                                  pak->m_pkthdr.len = packet_length; \
                          } while(0)
 
 /* Other m_pkthdr type things */
 #define SCTP_IS_IT_BROADCAST(dst, m) ((m->m_flags & M_PKTHDR) ? in_broadcast(dst, m->m_pkthdr.rcvif) : 0)
 #define SCTP_IS_IT_LOOPBACK(m) ((m->m_flags & M_PKTHDR) && ((m->m_pkthdr.rcvif == NULL) || (m->m_pkthdr.rcvif->if_type == IFT_LOOP)))
 
 /* This converts any input packet header
  * into the chain of data holders; for BSD
  * it's a NOP.
  */
 
 /*
  * Get the v6 hop limit: calls in6_selecthlim() with the endpoint's inpcb and
  * the route's nexthop interface, or NULL when ro or ro->ro_nh is NULL.
  * NOTE(review): the expansion itself ends with ';', so the macro only works
  * in statement position (e.g. the right-hand side of an assignment that is
  * the whole statement) - confirm callers before changing this.
  */
 #define SCTP_GET_HLIM(inp, ro)	in6_selecthlim(&inp->ip_inp.inp, (ro ? (ro->ro_nh ? (ro->ro_nh->nh_ifp) : (NULL)) : (NULL)));
 
 /* is the endpoint v6only? */
 #define SCTP_IPV6_V6ONLY(sctp_inpcb)	((sctp_inpcb)->ip_inp.inp.inp_flags & IN6P_IPV6_V6ONLY)
 /* is the socket non-blocking? */
 #define SCTP_SO_IS_NBIO(so)	((so)->so_state & SS_NBIO)
 #define SCTP_SET_SO_NBIO(so)	((so)->so_state |= SS_NBIO)
 #define SCTP_CLEAR_SO_NBIO(so)	((so)->so_state &= ~SS_NBIO)
 /* get the socket type */
 #define SCTP_SO_TYPE(so)	((so)->so_type)
 /* Use a macro for renaming sb_cc to sb_acc.
  * Initially sb_ccc was used, but this broke select() when used
  * with SCTP sockets.
  */
 #define sb_cc sb_acc
 /* reserve sb space for a socket */
 #define SCTP_SORESERVE(so, send, recv)	soreserve(so, send, recv)
 /* wakeup a socket */
 #define SCTP_SOWAKEUP(so)	wakeup(&(so)->so_timeo)
 /* number of bytes ready to read */
 #define SCTP_SBAVAIL(sb)	sbavail(sb)
 /*
  * clear the socket buffer state
  *
  * Zeroes sb_cc (renamed to sb_acc by the macro above) and sb_mbcnt, and
  * resets sb_mb to NULL.
  * NOTE(review): this expands to three separate statements that are not
  * wrapped in do { } while (0); used as the body of an unbraced if/else only
  * the first assignment would be conditional.
  */
 #define SCTP_SB_CLEAR(sb)	\
 	(sb).sb_cc = 0;		\
 	(sb).sb_mb = NULL;	\
 	(sb).sb_mbcnt = 0;
 
 #define SCTP_SB_LIMIT_RCV(so) (SOLISTENING(so) ? so->sol_sbrcv_hiwat : so->so_rcv.sb_hiwat)
 #define SCTP_SB_LIMIT_SND(so) (SOLISTENING(so) ? so->sol_sbsnd_hiwat : so->so_snd.sb_hiwat)
 
 /*
  * routes, output, etc.
  */
 typedef struct route sctp_route_t;
 
 #define SCTP_RTALLOC(ro, vrf_id, fibnum) \
 { \
 	if ((ro)->ro_nh == NULL) { \
 		(ro)->ro_nh = rib_lookup(fibnum, &(ro)->ro_dst, NHR_REF, 0); \
 	} \
 }
 
 /*
  * SCTP protocol specific mbuf flags.
  */
 #define	M_NOTIFICATION		M_PROTO1	/* SCTP notification */
 
 /*
  * IP output routines
  */
 #define SCTP_IP_OUTPUT(result, o_pak, ro, _inp, vrf_id)                      \
 {                                                                            \
 	struct sctp_inpcb *local_inp = _inp;                                 \
 	int o_flgs = IP_RAWOUTPUT;                                           \
 	                                                                     \
 	m_clrprotoflags(o_pak);                                              \
 	if ((local_inp != NULL) && (local_inp->sctp_socket != NULL)) {       \
 		o_flgs |= local_inp->sctp_socket->so_options & SO_DONTROUTE; \
 	}                                                                    \
 	result = ip_output(o_pak, NULL, ro, o_flgs, 0, NULL);                \
 }
 
 #define SCTP_IP6_OUTPUT(result, o_pak, ro, ifp, _inp, vrf_id)                \
 {                                                                            \
 	struct sctp_inpcb *local_inp = _inp;                                 \
 	                                                                     \
 	m_clrprotoflags(o_pak);                                              \
 	if (local_inp != NULL) {                                             \
 		INP_RLOCK(&local_inp->ip_inp.inp);                           \
 		result = ip6_output(o_pak,                                   \
 		                    local_inp->ip_inp.inp.in6p_outputopts,   \
 		                    (ro), 0, 0, ifp, NULL);                  \
 		INP_RUNLOCK(&local_inp->ip_inp.inp);                         \
 	} else {                                                             \
 		result = ip6_output(o_pak, NULL, (ro), 0, 0, ifp, NULL);     \
 	}                                                                    \
 }
 
 struct mbuf *
 sctp_get_mbuf_for_msg(unsigned int space_needed,
     int want_header, int how, int allonebuf, int type);
 
 /*
  * SCTP AUTH
  */
 #define SCTP_READ_RANDOM(buf, len)	arc4rand(buf, len, 0)
 
 /* map standard crypto API names */
 #define SCTP_SHA1_CTX		SHA1_CTX
 #define SCTP_SHA1_INIT		SHA1Init
 #define SCTP_SHA1_UPDATE	SHA1Update
 #define SCTP_SHA1_FINAL(x,y)	SHA1Final((caddr_t)x, y)
 
 #define SCTP_SHA256_CTX		SHA256_CTX
 #define SCTP_SHA256_INIT	SHA256_Init
 #define SCTP_SHA256_UPDATE	SHA256_Update
 #define SCTP_SHA256_FINAL(x,y)	SHA256_Final((caddr_t)x, y)
 
 /*
  * Atomically subtract one from *addr; evaluates to true when the value
  * fetched by atomic_fetchadd_int() was 1.
  */
 #define SCTP_DECREMENT_AND_CHECK_REFCOUNT(addr) (atomic_fetchadd_int(addr, -1) == 1)
 /*
  * Atomically subtract val from *addr and check the fetched value against
  * val.  With INVARIANTS a fetched value smaller than val panics; without
  * INVARIANTS *addr is reset to 0 with a plain (non-atomic) store.
  * NOTE(review): both variants expand to a bare { } block and evaluate "val"
  * twice.
  */
 #if defined(INVARIANTS)
 #define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \
 { \
 	int32_t oldval; \
 	oldval = atomic_fetchadd_int(addr, -val); \
 	if (oldval < val) { \
 		panic("Counter goes negative"); \
 	} \
 }
 #else
 #define SCTP_SAVE_ATOMIC_DECREMENT(addr, val) \
 { \
 	int32_t oldval; \
 	oldval = atomic_fetchadd_int(addr, -val); \
 	if (oldval < val) { \
 		*addr = 0; \
 	} \
 }
 #endif
 
 #define SCTP_IS_LISTENING(inp) ((inp->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING) != 0)
 
 int sctp_syscalls_init(void);
 int sctp_syscalls_uninit(void);
 
 #endif
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 268be18edb37..f44d1a0848c3 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1,2061 +1,2062 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007, Myricom Inc.
  * Copyright (c) 2008, Intel Corporation.
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2016-2021 Mellanox Technologies.
  * All rights reserved.
  *
  * Portions of this software were developed by Bjoern Zeeb
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockbuf.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/ethernet.h>
 #include <net/bpf.h>
 #include <net/vnet.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/infiniband.h>
 #include <net/if_lagg.h>
 
 #include <netinet/in_systm.h>
 #include <netinet/in.h>
 #include <netinet/ip6.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/udp.h>
 #include <netinet6/ip6_var.h>
 
 #include <machine/in_cksum.h>
 
 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
 
 #define	TCP_LRO_TS_OPTION \
     ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
 	  (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)
 
 static void	tcp_lro_rx_done(struct lro_ctrl *lc);
 static int	tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m,
 		    uint32_t csum, bool use_hash);
 
 #ifdef TCPHPTS
 static bool	do_bpf_strip_and_compress(struct inpcb *, struct lro_ctrl *,
 		struct lro_entry *, struct mbuf **, struct mbuf **, struct mbuf **,
  		bool *, bool, bool, struct ifnet *, bool);
 
 #endif
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro,  CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP LRO");
 
 static long tcplro_stacks_wanting_mbufq;
 counter_u64_t tcp_inp_lro_direct_queue;
 counter_u64_t tcp_inp_lro_wokeup_queue;
 counter_u64_t tcp_inp_lro_compressed;
 counter_u64_t tcp_inp_lro_locks_taken;
 counter_u64_t tcp_extra_mbuf;
 counter_u64_t tcp_would_have_but;
 counter_u64_t tcp_comp_total;
 counter_u64_t tcp_uncomp_total;
 counter_u64_t tcp_bad_csums;
 
 static unsigned	tcp_lro_entries = TCP_LRO_ENTRIES;
 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
     "default number of LRO entries");
 
 static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
     "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' cpu?");
 
 static uint32_t tcp_less_accurate_lro_ts = 0;
 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_less_accurate,
     CTLFLAG_MPSAFE, &tcp_less_accurate_lro_ts, 0,
     "Do we trade off efficency by doing less timestamp operations for time accuracy?");
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
     &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
     &tcp_inp_lro_wokeup_queue, "Number of lro's where we woke up transport via hpts");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, compressed, CTLFLAG_RD,
     &tcp_inp_lro_compressed, "Number of lro's compressed and sent to transport");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lockcnt, CTLFLAG_RD,
     &tcp_inp_lro_locks_taken, "Number of lro's inp_wlocks taken");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, extra_mbuf, CTLFLAG_RD,
     &tcp_extra_mbuf, "Number of times we had an extra compressed ack dropped into the tp");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, would_have_but, CTLFLAG_RD,
     &tcp_would_have_but, "Number of times we would have had an extra compressed, but mget failed");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, with_m_ackcmp, CTLFLAG_RD,
     &tcp_comp_total, "Number of mbufs queued with M_ACKCMP flags set");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, without_m_ackcmp, CTLFLAG_RD,
     &tcp_uncomp_total, "Number of mbufs queued without M_ACKCMP");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, lro_badcsum, CTLFLAG_RD,
     &tcp_bad_csums, "Number of packets that the common code saw with bad csums");
 
 /*
  * Atomically add one to the tcplro_stacks_wanting_mbufq counter.
  */
 void
 tcp_lro_reg_mbufq(void)
 {
 	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, 1);
 }
 
 /*
  * Atomically subtract one from the tcplro_stacks_wanting_mbufq counter.
  */
 void
 tcp_lro_dereg_mbufq(void)
 {
 	atomic_fetchadd_long(&tcplro_stacks_wanting_mbufq, -1);
 }
 
 /*
  * Link "le" at the head of both the control block's active list (via its
  * "next" linkage) and the given hash bucket (via its "hash_next" linkage).
  */
 static __inline void
 tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
     struct lro_entry *le)
 {
 
 	LIST_INSERT_HEAD(&lc->lro_active, le, next);
 	LIST_INSERT_HEAD(bucket, le, hash_next);
 }
 
 /*
  * Unlink "le" from both lists it was put on by tcp_lro_active_insert().
  */
 static __inline void
 tcp_lro_active_remove(struct lro_entry *le)
 {
 
 	LIST_REMOVE(le, next);		/* active list */
 	LIST_REMOVE(le, hash_next);	/* hash bucket */
 }
 
 /*
  * Initialize "lc" with defaults: no ifnet, tcp_lro_entries LRO entries and
  * no mbuf sort array.  Returns whatever tcp_lro_init_args() returns.
  */
 int
 tcp_lro_init(struct lro_ctrl *lc)
 {
 	return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
 }
 
 /*
  * Initialize an LRO control block.
  *
  * Resets the counters and limits in "lc", creates the lookup hash and
  * allocates one zeroed buffer that holds "lro_mbufs" sort slots followed by
  * "lro_entries" LRO entries; every entry is put on the free list.
  *
  * Returns 0 on success.  On allocation failure "lc" is zeroed and ENOMEM is
  * returned.
  */
 int
 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
     unsigned lro_entries, unsigned lro_mbufs)
 {
 	struct lro_entry *entries;
 	size_t nbytes;
 	unsigned idx, nelem;
 
 	/* Start from a clean set of counters and the requested limits. */
 	lc->ifp = ifp;
 	lc->lro_bad_csum = 0;
 	lc->lro_queued = 0;
 	lc->lro_flushed = 0;
 	lc->lro_mbuf_count = 0;
 	lc->lro_mbuf_max = lro_mbufs;
 	lc->lro_cnt = lro_entries;
 	lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
 	lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
 	LIST_INIT(&lc->lro_free);
 	LIST_INIT(&lc->lro_active);
 
 	/* The lookup hash is sized for the larger of the two counts. */
 	nelem = (lro_entries > lro_mbufs) ? lro_entries : lro_mbufs;
 	lc->lro_hash = phashinit_flags(nelem, M_LRO, &lc->lro_hashsz,
 	    HASH_NOWAIT);
 	if (lc->lro_hash == NULL)
 		goto fail;
 
 	/* One allocation: the mbuf sort array first, the entries after it. */
 	nbytes = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
 	    (lro_entries * sizeof(*entries));
 	lc->lro_mbuf_data = (struct lro_mbuf_sort *)
 	    malloc(nbytes, M_LRO, M_NOWAIT | M_ZERO);
 	if (lc->lro_mbuf_data == NULL) {
 		free(lc->lro_hash, M_LRO);
 		goto fail;
 	}
 
 	/* The LRO entries start right behind the last sort slot. */
 	entries = (struct lro_entry *)(lc->lro_mbuf_data + lro_mbufs);
 	for (idx = 0; idx < lro_entries; idx++)
 		LIST_INSERT_HEAD(&lc->lro_free, &entries[idx], next);
 
 	return (0);
 
 fail:
 	/* Leave a fully zeroed control block behind on failure. */
 	memset(lc, 0, sizeof(*lc));
 	return (ENOMEM);
 }
 
 /*
  * VXLAN header as consumed by tcp_lro_low_level_parser(): two 32-bit words.
  * The parser keeps vxlh_vni masked with htonl(0xffffff00) and does not look
  * at vxlh_flags.
  */
 struct vxlan_header {
 	uint32_t	vxlh_flags;
 	uint32_t	vxlh_vni;
 };
 
 /*
  * Parse one layer of headers (optional VXLAN, Ethernet with optional VLAN
  * tag, IPv4 or IPv6, then UDP or TCP) starting at "ptr".
  *
  * ptr         - first byte to parse.
  * parser      - receives pointers to the headers found; when "update_data"
  *               is true it is zeroed first and its data (addresses, ports,
  *               VLAN ID, VNI, lro_type) is filled in, otherwise the
  *               previously stored lro_type is only asserted.
  * is_vxlan    - a VXLAN header precedes the Ethernet header.
  * mlen        - number of valid bytes available starting at "ptr".
  *
  * Returns a pointer just past the UDP/TCP header and sets
  * parser->total_hdr_len to the number of header bytes consumed, or NULL when
  * the packet cannot be parsed (too short, IPv4 options or fragments,
  * unsupported protocol).
  *
  * The VXLAN header is checked against and subtracted from "mlen" before it
  * is read, and the Ethernet type is only read once a full Ethernet header
  * is known to be present, so the later length checks describe the bytes
  * that are really left.
  */
 static inline void *
 tcp_lro_low_level_parser(void *ptr, struct lro_parser *parser, bool update_data, bool is_vxlan, int mlen)
 {
 	const struct ether_vlan_header *eh;
 	void *old;
 	uint16_t eth_type;
 
 	if (update_data)
 		memset(parser, 0, sizeof(*parser));
 
 	old = ptr;
 
 	if (is_vxlan) {
 		const struct vxlan_header *vxh;
 
 		/* The VXLAN header must fit before it is dereferenced. */
 		if (__predict_false(mlen < (int)sizeof(*vxh)))
 			return (NULL);
 		vxh = ptr;
 		ptr = (uint8_t *)ptr + sizeof(*vxh);
 		/* Keep the remaining length in step with the pointer. */
 		mlen -= (int)sizeof(*vxh);
 		if (update_data) {
 			parser->data.vxlan_vni =
 			    vxh->vxlh_vni & htonl(0xffffff00);
 		}
 	}
 
 	/* Do not read the Ethernet type from beyond the valid data. */
 	if (__predict_false(mlen < (int)ETHER_HDR_LEN))
 		return (NULL);
 
 	eh = ptr;
 	if (__predict_false(eh->evl_encap_proto == htons(ETHERTYPE_VLAN))) {
 		eth_type = eh->evl_proto;
 		if (update_data) {
 			/* strip priority and keep VLAN ID only */
 			parser->data.vlan_id = eh->evl_tag & htons(EVL_VLID_MASK);
 		}
 		/* advance to next header */
 		ptr = (uint8_t *)ptr + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 		mlen -= (ETHER_HDR_LEN  + ETHER_VLAN_ENCAP_LEN);
 	} else {
 		eth_type = eh->evl_encap_proto;
 		/* advance to next header */
 		mlen -= ETHER_HDR_LEN;
 		ptr = (uint8_t *)ptr + ETHER_HDR_LEN;
 	}
 	if (__predict_false(mlen <= 0))
 		return (NULL);
 	switch (eth_type) {
 #ifdef INET
 	case htons(ETHERTYPE_IP):
 		parser->ip4 = ptr;
 		if (__predict_false(mlen < sizeof(struct ip)))
 			return (NULL);
 		/* Ensure there are no IPv4 options. */
 		if ((parser->ip4->ip_hl << 2) != sizeof (*parser->ip4))
 			break;
 		/* .. and the packet is not fragmented. */
 		if (parser->ip4->ip_off & htons(IP_MF|IP_OFFMASK))
 			break;
 		ptr = (uint8_t *)ptr + (parser->ip4->ip_hl << 2);
 		mlen -= sizeof(struct ip);
 		if (update_data) {
 			parser->data.s_addr.v4 = parser->ip4->ip_src;
 			parser->data.d_addr.v4 = parser->ip4->ip_dst;
 		}
 		switch (parser->ip4->ip_p) {
 		case IPPROTO_UDP:
 			if (__predict_false(mlen < sizeof(struct udphdr)))
 				return (NULL);
 			parser->udp = ptr;
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV4_UDP;
 				parser->data.s_port = parser->udp->uh_sport;
 				parser->data.d_port = parser->udp->uh_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_UDP);
 			}
 			ptr = ((uint8_t *)ptr + sizeof(*parser->udp));
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		case IPPROTO_TCP:
 			parser->tcp = ptr;
 			if (__predict_false(mlen < sizeof(struct tcphdr)))
 				return (NULL);
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV4_TCP;
 				parser->data.s_port = parser->tcp->th_sport;
 				parser->data.d_port = parser->tcp->th_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV4_TCP);
 			}
 			/* The whole TCP header, options included, must be present. */
 			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
 				return (NULL);
 			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		default:
 			break;
 		}
 		break;
 #endif
 #ifdef INET6
 	case htons(ETHERTYPE_IPV6):
 		parser->ip6 = ptr;
 		if (__predict_false(mlen < sizeof(struct ip6_hdr)))
 			return (NULL);
 		ptr = (uint8_t *)ptr + sizeof(*parser->ip6);
 		if (update_data) {
 			parser->data.s_addr.v6 = parser->ip6->ip6_src;
 			parser->data.d_addr.v6 = parser->ip6->ip6_dst;
 		}
 		mlen -= sizeof(struct ip6_hdr);
 		switch (parser->ip6->ip6_nxt) {
 		case IPPROTO_UDP:
 			if (__predict_false(mlen < sizeof(struct udphdr)))
 				return (NULL);
 			parser->udp = ptr;
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV6_UDP;
 				parser->data.s_port = parser->udp->uh_sport;
 				parser->data.d_port = parser->udp->uh_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_UDP);
 			}
 			ptr = (uint8_t *)ptr + sizeof(*parser->udp);
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		case IPPROTO_TCP:
 			if (__predict_false(mlen < sizeof(struct tcphdr)))
 				return (NULL);
 			parser->tcp = ptr;
 			if (update_data) {
 				parser->data.lro_type = LRO_TYPE_IPV6_TCP;
 				parser->data.s_port = parser->tcp->th_sport;
 				parser->data.d_port = parser->tcp->th_dport;
 			} else {
 				MPASS(parser->data.lro_type == LRO_TYPE_IPV6_TCP);
 			}
 			/* The whole TCP header, options included, must be present. */
 			if (__predict_false(mlen < (parser->tcp->th_off << 2)))
 				return (NULL);
 			ptr = (uint8_t *)ptr + (parser->tcp->th_off << 2);
 			parser->total_hdr_len = (uint8_t *)ptr - (uint8_t *)old;
 			return (ptr);
 		default:
 			break;
 		}
 		break;
 #endif
 	default:
 		break;
 	}
 	/* Invalid packet - cannot parse */
 	return (NULL);
 }
 
 static const int vxlan_csum = CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
     CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID;
 
 /*
  * Parse the headers in the first mbuf of "m".
  *
  * The outer headers are parsed into "po".  If the outer packet is UDP and
  * the mbuf carries all of the vxlan_csum checksum flags, the payload is
  * parsed as VXLAN into "pi".
  *
  * Returns "po" for a plain TCP packet (with "pi" zeroed when update_data is
  * set), "pi" for TCP inside VXLAN, and NULL for anything else or when the
  * parsed headers do not fit within m->m_len.
  */
 static inline struct lro_parser *
 tcp_lro_parser(struct mbuf *m, struct lro_parser *po, struct lro_parser *pi, bool update_data)
 {
 	void *data_ptr;
 
 	/* Try to parse outer headers first. */
 	data_ptr = tcp_lro_low_level_parser(m->m_data, po, update_data, false, m->m_len);
 	if (data_ptr == NULL || po->total_hdr_len > m->m_len)
 		return (NULL);
 
 	if (update_data) {
 		/* Store VLAN ID, if any. */
 		if (__predict_false(m->m_flags & M_VLANTAG)) {
 			/* A tag in the mbuf header replaces any in-frame VLAN ID. */
 			po->data.vlan_id =
 			    htons(m->m_pkthdr.ether_vtag) & htons(EVL_VLID_MASK);
 		}
 		/* Store decrypted flag, if any. */
 		if (__predict_false((m->m_pkthdr.csum_flags &
 		    CSUM_TLS_MASK) == CSUM_TLS_DECRYPTED))
 			po->data.lro_flags |= LRO_FLAG_DECRYPTED;
 	}
 
 	switch (po->data.lro_type) {
 	case LRO_TYPE_IPV4_UDP:
 	case LRO_TYPE_IPV6_UDP:
 		/* Check for VXLAN headers. */
 		if ((m->m_pkthdr.csum_flags & vxlan_csum) != vxlan_csum)
 			break;
 
 		/* Try to parse inner headers. */
 		/* The length passed is what remains in this mbuf after the outer headers. */
 		data_ptr = tcp_lro_low_level_parser(data_ptr, pi, update_data, true,
 						    (m->m_len - ((caddr_t)data_ptr - m->m_data)));
 		if (data_ptr == NULL || (pi->total_hdr_len + po->total_hdr_len) > m->m_len)
 			break;
 
 		/* Verify supported header types. */
 		switch (pi->data.lro_type) {
 		case LRO_TYPE_IPV4_TCP:
 		case LRO_TYPE_IPV6_TCP:
 			return (pi);
 		default:
 			break;
 		}
 		break;
 	case LRO_TYPE_IPV4_TCP:
 	case LRO_TYPE_IPV6_TCP:
 		/* No tunnel: make sure the inner parser reads as "none". */
 		if (update_data)
 			memset(pi, 0, sizeof(*pi));
 		return (po);
 	default:
 		break;
 	}
 	return (NULL);
 }
 
 /*
  * Trim link-layer padding that follows the IP packet described by "po".
  *
  * Returns 0 when the chain length matches the IP packet (after trimming any
  * excess) and TCP_LRO_CANNOT when the type is not TCP over IPv4/IPv6 or the
  * chain is shorter than the IP header claims.
  */
 static inline int
 tcp_lro_trim_mbuf_chain(struct mbuf *m, const struct lro_parser *po)
 {
 	int ip_end;
 
 	/* Offset of the end of the IP packet, counted from the mbuf data. */
 	switch (po->data.lro_type) {
 #ifdef INET
 	case LRO_TYPE_IPV4_TCP:
 		ip_end = ((uint8_t *)po->ip4 - (uint8_t *)m->m_data) +
 		    ntohs(po->ip4->ip_len);
 		break;
 #endif
 #ifdef INET6
 	case LRO_TYPE_IPV6_TCP:
 		ip_end = ((uint8_t *)po->ip6 - (uint8_t *)m->m_data) +
 		    ntohs(po->ip6->ip6_plen) + sizeof(*po->ip6);
 		break;
 #endif
 	default:
 		return (TCP_LRO_CANNOT);
 	}
 
 	/* A chain shorter than the IP packet cannot be handled. */
 	if (m->m_pkthdr.len < ip_end)
 		return (TCP_LRO_CANNOT);
 
 	/* Drop trailing padding; the negative count trims from the tail. */
 	if (m->m_pkthdr.len > ip_end)
 		m_adj(m, ip_end - m->m_pkthdr.len);
 
 	return (0);
 }
 
 /*
  * Return the TCP header of "m", located lro_tcp_h_off bytes into its data.
  */
 static struct tcphdr *
 tcp_lro_get_th(struct mbuf *m)
 {
 	uint8_t *base = (uint8_t *)m->m_data;
 
 	return ((struct tcphdr *)(base + m->m_pkthdr.lro_tcp_h_off));
 }
 
 /*
  * Free every packet on a list linked through m_nextpkt.
  */
 static void
 lro_free_mbuf_chain(struct mbuf *m)
 {
 	struct mbuf *next_pkt;
 
 	for (; m != NULL; m = next_pkt) {
 		/* Detach the packet before freeing it. */
 		next_pkt = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m_freem(m);
 	}
 }
 
 /*
  * Release everything held by an LRO control block: mbufs on active entries,
  * the hash table, queued mbufs in the sort array and the array itself.
  * Pointers and counts are reset as they are released.
  */
 void
 tcp_lro_free(struct lro_ctrl *lc)
 {
 	struct lro_entry *le;
 	unsigned x;
 
 	/* reset LRO free list */
 	LIST_INIT(&lc->lro_free);
 
 	/* free active mbufs, if any */
 	while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
 		tcp_lro_active_remove(le);
 		lro_free_mbuf_chain(le->m_head);
 	}
 
 	/* free hash table */
 	free(lc->lro_hash, M_LRO);
 	lc->lro_hash = NULL;
 	lc->lro_hashsz = 0;
 
 	/* free mbuf array, if any */
 	for (x = 0; x != lc->lro_mbuf_count; x++)
 		m_freem(lc->lro_mbuf_data[x].mb);
 	lc->lro_mbuf_count = 0;
 
 	/* free allocated memory, if any */
 	/* (the LRO entries live in this same allocation, see tcp_lro_init_args) */
 	free(lc->lro_mbuf_data, M_LRO);
 	lc->lro_mbuf_data = NULL;
 }
 
 /*
  * Sum the 16-bit words of a TCP header (th_off 32-bit words long), leaving
  * out the checksum field, and fold the result to 16 bits.
  */
 static uint16_t
 tcp_lro_rx_csum_tcphdr(const struct tcphdr *th)
 {
 	const uint16_t *words = (const uint16_t *)th;
 	uint32_t sum;
 	unsigned i, nwords;
 
 	/* Start at -th_sum so that adding the field below cancels it out. */
 	sum = -th->th_sum;
 
 	/* th_off counts 32-bit words, i.e. two 16-bit words each. */
 	nwords = 2 * th->th_off;
 	for (i = 0; i < nwords; i++)
 		sum += words[i];
 
 	/* Fold the carries back into the low 16 bits. */
 	while (sum > 0xffff)
 		sum = (sum >> 16) + (sum & 0xffff);
 
 	return (sum);
 }
 
 /*
  * Starting from "tcp_csum", add the complement of the pseudo-header
  * checksum and the complement of the TCP header sum, then fold to 16 bits.
  * Presumably this leaves the checksum contribution of the TCP payload only
  * - confirm against the callers.
  */
 static uint16_t
 tcp_lro_rx_csum_data(const struct lro_parser *pa, uint16_t tcp_csum)
 {
 	uint32_t c;
 	uint16_t cs;
 
 	c = tcp_csum;
 
 	switch (pa->data.lro_type) {
 #ifdef INET6
 	case LRO_TYPE_IPV6_TCP:
 		/* Compute full pseudo IPv6 header checksum. */
 		cs = in6_cksum_pseudo(pa->ip6, ntohs(pa->ip6->ip6_plen), pa->ip6->ip6_nxt, 0);
 		break;
 #endif
 #ifdef INET
 	case LRO_TYPE_IPV4_TCP:
 		/* Compute full pseudo IPv4 header checksum. */
 		cs = in_addword(ntohs(pa->ip4->ip_len) - sizeof(*pa->ip4), IPPROTO_TCP);
 		cs = in_pseudo(pa->ip4->ip_src.s_addr, pa->ip4->ip_dst.s_addr, htons(cs));
 		break;
 #endif
 	default:
 		cs = 0;		/* Keep compiler happy. */
 		break;
 	}
 
 	/* Complement checksum. */
 	cs = ~cs;
 	c += cs;
 
 	/* Remove TCP header checksum. */
 	cs = ~tcp_lro_rx_csum_tcphdr(pa->tcp);
 	c += cs;
 
 	/* Compute checksum remainder. */
 	while (c > 0xffff)
 		c = (c >> 16) + (c & 0xffff);
 
 	return (c);
 }
 
 /*
  * Flush every entry that is currently on the active list.
  */
 static void
 tcp_lro_rx_done(struct lro_ctrl *lc)
 {
 	struct lro_entry *le;
 
 	for (;;) {
 		/* Always take the head: flushing unlinks the entry. */
 		le = LIST_FIRST(&lc->lro_active);
 		if (le == NULL)
 			break;
 		tcp_lro_active_remove(le);
 		tcp_lro_flush(lc, le);
 	}
 }
 
 /*
  * Flush every active entry that currently holds packets.
  */
 static void
 tcp_lro_flush_active(struct lro_ctrl *lc)
 {
 	struct lro_entry *le, *le_tmp;
 
 	/*
 	 * Walk through the list of le entries, and
 	 * any one that does have packets flush. This
 	 * is called because we have an inbound packet
 	 * (e.g. SYN) that has to have all others flushed
 	 * in front of it. Note we have to do the remove
 	 * because tcp_lro_flush() assumes that the entry
 	 * is being freed. This is ok it will just get
 	 * reallocated again like it was new.
 	 *
 	 * The entry is unlinked inside the loop, so its "next"
 	 * linkage must not be followed afterwards; the SAFE variant
 	 * fetches the successor before the body runs.
 	 */
 	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
 		if (le->m_head != NULL) {
 			tcp_lro_active_remove(le);
 			tcp_lro_flush(lc, le);
 		}
 	}
 }
 
 /*
  * Flush every active entry whose alloc_time is at least "timeout" in the
  * past.  Must be called inside the network epoch.
  */
 void
 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
 {
 	struct lro_entry *le, *le_tmp;
 	uint64_t now, tov;
 	struct bintime bt;
 
 	NET_EPOCH_ASSERT();
 	if (LIST_EMPTY(&lc->lro_active))
 		return;
 
 	/* get timeout time and current time in ns */
 	binuptime(&bt);
 	now = bintime2ns(&bt);
 	/*
 	 * Widen before multiplying: where time_t is 32 bits wide the
 	 * product tv_sec * 10^9 does not fit for tv_sec > 2.
 	 */
 	tov = ((uint64_t)timeout->tv_sec * 1000000000ULL) +
 	    ((uint64_t)timeout->tv_usec * 1000ULL);
 	LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
 		if (now >= (bintime2ns(&le->alloc_time) + tov)) {
 			tcp_lro_active_remove(le);
 			tcp_lro_flush(lc, le);
 		}
 	}
 }
 
 #ifdef INET
 /*
  * Validate the IPv4 header checksum of "m".
  *
  * Uses the hardware verdict when CSUM_IP_CHECKED is set, otherwise computes
  * the header checksum in software.  A bad checksum bumps lc->lro_bad_csum
  * and returns TCP_LRO_CANNOT; a good one returns 0.
  */
 static int
 tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4)
 {
 	uint16_t csum;
 	bool bad;
 
 	/* Legacy IP has a header checksum that needs to be correct. */
 	if ((m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) != 0) {
 		bad = ((m->m_pkthdr.csum_flags & CSUM_IP_VALID) == 0);
 	} else {
 		csum = in_cksum_hdr(ip4);
 		bad = (csum != 0);
 	}
 
 	if (__predict_false(bad)) {
 		lc->lro_bad_csum++;
 		return (TCP_LRO_CANNOT);
 	}
 	return (0);
 }
 #endif
 
 #ifdef TCPHPTS
 /*
  * Emit a TCP_LOG_LRO event for "tp" describing the LRO entry "le" and,
  * when non-NULL, the mbuf "m".  Nothing is done when logging is off for
  * this connection (t_logstate == TCP_LOG_STATE_OFF).
  *
  * "frm" is stored in flex8 and "tcp_data_len" in flex1; th_seq and th_ack
  * go to inflight and delivered.
  */
 static void
 tcp_lro_log(struct tcpcb *tp, const struct lro_ctrl *lc,
     const struct lro_entry *le, const struct mbuf *m,
     int frm, int32_t tcp_data_len, uint32_t th_seq,
     uint32_t th_ack, uint16_t th_win)
 {
 	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 		union tcp_log_stackspecific log;
 		struct timeval tv, btv;
 		uint32_t cts;
 
 		cts = tcp_get_usecs(&tv);
 		memset(&log, 0, sizeof(union tcp_log_stackspecific));
 		log.u_bbr.flex8 = frm;
 		log.u_bbr.flex1 = tcp_data_len;
 		if (m)
 			log.u_bbr.flex2 = m->m_pkthdr.len;
 		else
 			log.u_bbr.flex2 = 0;
 		/* State of the entry's head mbuf, when it has one. */
 		if (le->m_head) {
 			log.u_bbr.flex3 = le->m_head->m_pkthdr.lro_nsegs;
 			log.u_bbr.flex4 = le->m_head->m_pkthdr.lro_tcp_d_len;
 			log.u_bbr.flex5 = le->m_head->m_pkthdr.len;
 			log.u_bbr.delRate = le->m_head->m_flags;
 			log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
 		}
 		log.u_bbr.inflight = th_seq;
 		log.u_bbr.delivered = th_ack;
 		log.u_bbr.timeStamp = cts;
 		log.u_bbr.epoch = le->next_seq;
 		log.u_bbr.lt_epoch = le->ack_seq;
 		/*
 		 * NOTE(review): pacing_gain is assigned again further down
 		 * (le->uncompressed), so th_win never reaches the log record.
 		 */
 		log.u_bbr.pacing_gain = th_win;
 		log.u_bbr.cwnd_gain = le->window;
 		log.u_bbr.lost = curcpu;
 		/* The mbuf pointers are logged as plain integers. */
 		log.u_bbr.cur_del_rate = (uintptr_t)m;
 		log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
 		bintime2timeval(&lc->lro_last_queue_time, &btv);
 		log.u_bbr.flex6 = tcp_tv_to_usectick(&btv);
 		log.u_bbr.flex7 = le->compressed;
 		log.u_bbr.pacing_gain = le->uncompressed;
 		if (in_epoch(net_epoch_preempt))
 			log.u_bbr.inhpts = 1;
 		else
 			log.u_bbr.inhpts = 0;
 		TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv,
 		    &tptosocket(tp)->so_snd,
 		    TCP_LOG_LRO, 0, 0, &log, false, &tv);
 	}
 }
 #endif
 
 /*
  * Store "value" in *ptr and report through *psum the folded 16-bit
  * one's-complement delta (~old + new) that the store introduces.
  */
 static inline void
 tcp_lro_assign_and_checksum_16(uint16_t *ptr, uint16_t value, uint16_t *psum)
 {
 	/* 0xffff - old is the one's complement of the old field value. */
 	uint32_t delta = 0xffff - *ptr + value;
 
 	/* Fold the carry back into the low 16 bits. */
 	for (; delta > 0xffff; )
 		delta = (delta & 0xffff) + (delta >> 16);
 
 	*ptr = value;
 	*psum = delta;
 }
 
 /*
  * Rewrite the length fields (and, for TCP, the ACK, window and timestamps)
  * of the headers described by "pa" for a merged packet carrying
  * "payload_len" bytes after the L4 header, and patch the affected checksums
  * incrementally.
  *
  * pa          - parsed headers to modify in place.
  * le          - LRO entry supplying ack_seq/window/timestamps; only read on
  *               the TCP paths (tcp_flush_out_entry() passes NULL for the
  *               outer headers of a tunneled packet).
  * payload_len - bytes following the TCP/UDP header.
  * delta_sum   - checksum delta handed in from the previous stage.
  *
  * Returns the folded checksum delta caused by all modifications made here
  * plus "delta_sum", for use by the next (outer) stage; 0 for unknown types.
  *
  * temp[] holds per-field deltas: [0] length field, [1] IPv4 header
  * checksum field (stays 0 for IPv6); on the TCP path [2]/[3] are the TCP
  * header sums before/after the update and [4] the th_sum field delta; on
  * the UDP path [2] is the uh_ulen delta and [3] the uh_sum delta.
  */
 static uint16_t
 tcp_lro_update_checksum(const struct lro_parser *pa, const struct lro_entry *le,
     uint16_t payload_len, uint16_t delta_sum)
 {
 	uint32_t csum;
 	uint16_t tlen;
 	uint16_t temp[5] = {};
 
 	switch (pa->data.lro_type) {
 	case LRO_TYPE_IPV4_TCP:
 		/* Compute new IPv4 length. */
 		tlen = (pa->ip4->ip_hl << 2) + (pa->tcp->th_off << 2) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);
 
 		/* Subtract delta from current IPv4 checksum. */
 		csum = pa->ip4->ip_sum + 0xffff - temp[0];
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
 		goto update_tcp_header;
 
 	case LRO_TYPE_IPV6_TCP:
 		/* Compute new IPv6 length. */
 		tlen = (pa->tcp->th_off << 2) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
 		goto update_tcp_header;
 
 	case LRO_TYPE_IPV4_UDP:
 		/* Compute new IPv4 length. */
 		tlen = (pa->ip4->ip_hl << 2) + sizeof(*pa->udp) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_len, htons(tlen), &temp[0]);
 
 		/* Subtract delta from current IPv4 checksum. */
 		csum = pa->ip4->ip_sum + 0xffff - temp[0];
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 		tcp_lro_assign_and_checksum_16(&pa->ip4->ip_sum, csum, &temp[1]);
 		goto update_udp_header;
 
 	case LRO_TYPE_IPV6_UDP:
 		/* Compute new IPv6 length. */
 		tlen = sizeof(*pa->udp) + payload_len;
 		tcp_lro_assign_and_checksum_16(&pa->ip6->ip6_plen, htons(tlen), &temp[0]);
 		goto update_udp_header;
 
 	default:
 		return (0);
 	}
 
 update_tcp_header:
 	/* Compute current TCP header checksum. */
 	temp[2] = tcp_lro_rx_csum_tcphdr(pa->tcp);
 
 	/* Incorporate the latest ACK into the TCP header. */
 	pa->tcp->th_ack = le->ack_seq;
 	pa->tcp->th_win = le->window;
 
 	/* Incorporate latest timestamp into the TCP header. */
 	if (le->timestamp != 0) {
 		uint32_t *ts_ptr;
 
 		/*
 		 * Words 1 and 2 after the TCP header are overwritten; this
 		 * assumes the timestamp option layout that
 		 * tcp_set_entry_to_mbuf() checks for before setting
 		 * le->timestamp - TODO confirm for every caller.
 		 */
 		ts_ptr = (uint32_t *)(pa->tcp + 1);
 		ts_ptr[1] = htonl(le->tsval);
 		ts_ptr[2] = le->tsecr;
 	}
 
 	/* Compute new TCP header checksum. */
 	temp[3] = tcp_lro_rx_csum_tcphdr(pa->tcp);
 
 	/* Compute new TCP checksum. */
 	csum = pa->tcp->th_sum + 0xffff - delta_sum +
 	    0xffff - temp[0] + 0xffff - temp[3] + temp[2];
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	/* Assign new TCP checksum. */
 	tcp_lro_assign_and_checksum_16(&pa->tcp->th_sum, csum, &temp[4]);
 
 	/* Compute all modifications affecting next checksum. */
 	csum = temp[0] + temp[1] + 0xffff - temp[2] +
 	    temp[3] + temp[4] + delta_sum;
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	/* Return delta checksum to next stage, if any. */
 	return (csum);
 
 update_udp_header:
 	/* tlen is recomputed here: the UDP length excludes the IP header. */
 	tlen = sizeof(*pa->udp) + payload_len;
 	/* Assign new UDP length and compute checksum delta. */
 	tcp_lro_assign_and_checksum_16(&pa->udp->uh_ulen, htons(tlen), &temp[2]);
 
 	/* Check if there is a UDP checksum. */
 	if (__predict_false(pa->udp->uh_sum != 0)) {
 		/* Compute new UDP checksum. */
 		csum = pa->udp->uh_sum + 0xffff - delta_sum +
 		    0xffff - temp[0] + 0xffff - temp[2];
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 		/* Assign new UDP checksum. */
 		tcp_lro_assign_and_checksum_16(&pa->udp->uh_sum, csum, &temp[3]);
 	}
 
 	/* Compute all modifications affecting next checksum. */
 	csum = temp[0] + temp[1] + temp[2] + temp[3] + delta_sum;
 	while (csum > 0xffff)
 		csum = (csum >> 16) + (csum & 0xffff);
 
 	/* Return delta checksum to next stage, if any. */
 	return (csum);
 }
 
 /*
  * Finalize the head mbuf of "le" and pass it to the interface input
  * routine.
  *
  * When the entry needs a merge, the headers are rewritten through
  * tcp_lro_update_checksum(): inner then outer headers when the inner parser
  * holds a TCP type, the outer headers alone when the inner type is
  * LRO_TYPE_NONE.  The mbuf checksum flags are then replaced with "already
  * verified" flags (csum_data 0xffff), and CSUM_TLS_DECRYPTED is kept when
  * the outer parser recorded LRO_FLAG_DECRYPTED.
  */
 static void
 tcp_flush_out_entry(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	/* Check if we need to recompute any checksums. */
 	if (le->needs_merge) {
 		/* Only used to chain the inner delta into the outer update. */
 		uint16_t csum;
 
 		switch (le->inner.data.lro_type) {
 		case LRO_TYPE_IPV4_TCP:
 			csum = tcp_lro_update_checksum(&le->inner, le,
 			    le->m_head->m_pkthdr.lro_tcp_d_len,
 			    le->m_head->m_pkthdr.lro_tcp_d_csum);
 			/* The outer payload also covers the inner headers. */
 			csum = tcp_lro_update_checksum(&le->outer, NULL,
 			    le->m_head->m_pkthdr.lro_tcp_d_len +
 			    le->inner.total_hdr_len, csum);
 			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 			    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
 			le->m_head->m_pkthdr.csum_data = 0xffff;
 			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 			break;
 		case LRO_TYPE_IPV6_TCP:
 			csum = tcp_lro_update_checksum(&le->inner, le,
 			    le->m_head->m_pkthdr.lro_tcp_d_len,
 			    le->m_head->m_pkthdr.lro_tcp_d_csum);
 			/* The outer payload also covers the inner headers. */
 			csum = tcp_lro_update_checksum(&le->outer, NULL,
 			    le->m_head->m_pkthdr.lro_tcp_d_len +
 			    le->inner.total_hdr_len, csum);
 			le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 			    CSUM_PSEUDO_HDR;
 			le->m_head->m_pkthdr.csum_data = 0xffff;
 			if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 				le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 			break;
 		case LRO_TYPE_NONE:
 			/* No inner headers: the outer headers are the TCP packet. */
 			switch (le->outer.data.lro_type) {
 			case LRO_TYPE_IPV4_TCP:
 				csum = tcp_lro_update_checksum(&le->outer, le,
 				    le->m_head->m_pkthdr.lro_tcp_d_len,
 				    le->m_head->m_pkthdr.lro_tcp_d_csum);
 				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 				    CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
 				le->m_head->m_pkthdr.csum_data = 0xffff;
 				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 				break;
 			case LRO_TYPE_IPV6_TCP:
 				csum = tcp_lro_update_checksum(&le->outer, le,
 				    le->m_head->m_pkthdr.lro_tcp_d_len,
 				    le->m_head->m_pkthdr.lro_tcp_d_csum);
 				le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
 				    CSUM_PSEUDO_HDR;
 				le->m_head->m_pkthdr.csum_data = 0xffff;
 				if (__predict_false(le->outer.data.lro_flags & LRO_FLAG_DECRYPTED))
 					le->m_head->m_pkthdr.csum_flags |= CSUM_TLS_DECRYPTED;
 				break;
 			default:
 				break;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * Break any chain, this is not set to NULL on the singleton
 	 * case m_nextpkt points to m_head. Other case set them
 	 * m_nextpkt to NULL in push_and_replace.
 	 */
 	le->m_head->m_nextpkt = NULL;
 	/* Account for the segments handed up, then deliver the packet. */
 	lc->lro_queued += le->m_head->m_pkthdr.lro_nsegs;
 	(*lc->ifp->if_input)(lc->ifp, le->m_head);
 }
 
 /*
  * (Re)initialise the TCP tracking state of "le" from the TCP header "th"
  * of mbuf "m" and make "m" the sole head/tail data of the entry.
  *
  * "lc" is accepted for symmetry with the callers but is not used here.
  */
 static void
 tcp_set_entry_to_mbuf(struct lro_ctrl *lc, struct lro_entry *le,
     struct mbuf *m, struct tcphdr *th)
 {
 	uint32_t *opt_words = (uint32_t *)(th + 1);
 	uint16_t opt_bytes = (th->th_off << 2) - sizeof(*th);
 	uint16_t payload_len = m->m_pkthdr.lro_tcp_d_len;
 
 	/*
 	 * A timestamp is recorded only when the option area is exactly
 	 * TCPOLEN_TSTAMP_APPA bytes long and starts with TCP_LRO_TS_OPTION.
 	 * No options, or any other layout, leaves the entry without one.
 	 */
 	if (opt_bytes != 0 && opt_bytes == TCPOLEN_TSTAMP_APPA &&
 	    *opt_words == TCP_LRO_TS_OPTION) {
 		le->timestamp = 1;
 		le->tsval = ntohl(opt_words[1]);
 		/* The echo value is stored as found in the header. */
 		le->tsecr = opt_words[2];
 	} else {
 		le->timestamp = 0;
 	}
 
 	/*
 	 * Sequence number is kept in host order and already advanced past
 	 * this segment's payload; ACK and window are copied unconverted.
 	 */
 	le->next_seq = ntohl(th->th_seq) + payload_len;
 	le->ack_seq = th->th_ack;
 	le->window = th->th_win;
 	le->flags = tcp_get_flags(th);
 	le->needs_merge = 0;
 
 	/* "m" becomes the entry's data chain. */
 	le->m_head = m;
 	le->m_tail = m_last(m);
 }
 
 /*
  * Flush the current head of "le" up the stack and install "m" as the
  * new head.  Packets that were queued behind the old head (via
  * m_nextpkt) are preserved and re-attached behind "m".
  */
 static void
 tcp_push_and_replace(struct lro_ctrl *lc, struct lro_entry *le, struct mbuf *m)
 {
 	struct lro_parser *parsed;
 	struct mbuf *rest;
 
 	/* Detach whatever is queued behind the current head and keep it. */
 	rest = le->m_head->m_nextpkt;
 	le->m_head->m_nextpkt = NULL;
 
 	/* Deliver the old head. */
 	tcp_flush_out_entry(lc, le);
 
 	/* Re-parse the headers of "m"; this is not expected to fail. */
 	parsed = tcp_lro_parser(m, &le->outer, &le->inner, false);
 	KASSERT(parsed != NULL,
 	    ("tcp_push_and_replace: LRO parser failed on m=%p\n", m));
 
 	/* Reset the entry's TCP state from "m" and make it the new head. */
 	tcp_set_entry_to_mbuf(lc, le, m, parsed->tcp);
 
 	/* Put the saved packets back behind the new head. */
 	m->m_nextpkt = rest;
 }
 
 static void
 tcp_lro_mbuf_append_pkthdr(struct lro_entry *le, const struct mbuf *p)
 {
 	struct mbuf *m;
 	uint32_t csum;
 
 	m = le->m_head;
 	if (m->m_pkthdr.lro_nsegs == 1) {
 		/* Compute relative checksum. */
 		csum = p->m_pkthdr.lro_tcp_d_csum;
 	} else {
 		/* Merge TCP data checksums. */
 		csum = (uint32_t)m->m_pkthdr.lro_tcp_d_csum +
 		    (uint32_t)p->m_pkthdr.lro_tcp_d_csum;
 		while (csum > 0xffff)
 			csum = (csum >> 16) + (csum & 0xffff);
 	}
 
 	/* Update various counters. */
 	m->m_pkthdr.len += p->m_pkthdr.lro_tcp_d_len;
 	m->m_pkthdr.lro_tcp_d_csum = csum;
 	m->m_pkthdr.lro_tcp_d_len += p->m_pkthdr.lro_tcp_d_len;
 	m->m_pkthdr.lro_nsegs += p->m_pkthdr.lro_nsegs;
 	le->needs_merge = 1;
 }
 
 /*
  * Merge the packets queued behind le->m_head (linked via m_nextpkt) into
  * the head wherever possible.
  *
  * A queued packet that cannot be merged is promoted to be the new head by
  * tcp_push_and_replace(), which first flushes the old head, and the scan
  * then restarts at "again".  The function returns once nothing is left
  * queued behind the head; the head itself is left for the caller to flush.
  */
 static void
 tcp_lro_condense(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	/*
 	 * Walk through the mbuf chain we
 	 * have on tap and compress/condense
 	 * as required.
 	 */
 	uint32_t *ts_ptr;
 	struct mbuf *m;
 	struct tcphdr *th;
 	uint32_t tcp_data_len_total;
 	uint32_t tcp_data_seg_total;
 	uint16_t tcp_data_len;
 	uint16_t tcp_opt_len;
 
 	/*
 	 * First we must check the lead (m_head)
 	 * we must make sure that it is *not*
 	 * something that should be sent up
 	 * right away (sack etc).
 	 *
 	 * NOTE(review): the two checks below inspect the packet queued
 	 * directly behind the head (le->m_head->m_nextpkt), not
 	 * le->m_head itself.
 	 */
 again:
 	m = le->m_head->m_nextpkt;
 	if (m == NULL) {
 		/* Just one left. */
 		return;
 	}
 
 	th = tcp_lro_get_th(m);
 	tcp_opt_len = (th->th_off << 2);
 	tcp_opt_len -= sizeof(*th);
 	ts_ptr = (uint32_t *)(th + 1);
 
 	if (tcp_opt_len != 0 && __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
 	    *ts_ptr != TCP_LRO_TS_OPTION)) {
 		/*
 		 * It's not the timestamp. We can't
 		 * use this guy as the head.
 		 *
 		 * Unlink "m" first so that tcp_push_and_replace() saves
 		 * only the packets that follow it.
 		 */
 		le->m_head->m_nextpkt = m->m_nextpkt;
 		tcp_push_and_replace(lc, le, m);
 		goto again;
 	}
 	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
 		/*
 		 * Make sure that previously seen segments/ACKs are delivered
 		 * before this segment, e.g. FIN.
 		 */
 		le->m_head->m_nextpkt = m->m_nextpkt;
 		tcp_push_and_replace(lc, le, m);
 		goto again;
 	}
 	while((m = le->m_head->m_nextpkt) != NULL) {
 		/*
 		 * condense m into le, first
 		 * pull m out of the list.
 		 */
 		le->m_head->m_nextpkt = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		/* Setup my data */
 		tcp_data_len = m->m_pkthdr.lro_tcp_d_len;
 		th = tcp_lro_get_th(m);
 		ts_ptr = (uint32_t *)(th + 1);
 		tcp_opt_len = (th->th_off << 2);
 		tcp_opt_len -= sizeof(*th);
 		/* Totals the head would have after absorbing "m". */
 		tcp_data_len_total = le->m_head->m_pkthdr.lro_tcp_d_len + tcp_data_len;
 		tcp_data_seg_total = le->m_head->m_pkthdr.lro_nsegs + m->m_pkthdr.lro_nsegs;
 
 		if (tcp_data_seg_total >= lc->lro_ackcnt_lim ||
 		    tcp_data_len_total >= lc->lro_length_lim) {
 			/* Flush now if appending will result in overflow. */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		if (tcp_opt_len != 0 &&
 		    __predict_false(tcp_opt_len != TCPOLEN_TSTAMP_APPA ||
 		    *ts_ptr != TCP_LRO_TS_OPTION)) {
 			/*
 			 * Maybe a sack in the new one? We need to
 			 * start all over after flushing the
 			 * current le. We will go up to the beginning
 			 * and flush it (calling the replace again possibly
 			 * or just returning).
 			 */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		/* Anything other than ACK/PUSH is never merged. */
 		if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH)) != 0) {
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		if (tcp_opt_len != 0) {
 			uint32_t tsval = ntohl(*(ts_ptr + 1));
 			/* Make sure timestamp values are increasing. */
 			if (TSTMP_GT(le->tsval, tsval))  {
 				tcp_push_and_replace(lc, le, m);
 				goto again;
 			}
 			le->tsval = tsval;
 			le->tsecr = *(ts_ptr + 2);
 		}
 		/* Try to append the new segment. */
 		if (__predict_false(ntohl(th->th_seq) != le->next_seq ||
 				    ((tcp_get_flags(th) & TH_ACK) !=
 				      (le->flags & TH_ACK)) ||
 				    (tcp_data_len == 0 &&
 				     le->ack_seq == th->th_ack &&
 				     le->window == th->th_win))) {
 			/* Out of order packet, non-ACK + ACK or dup ACK. */
 			tcp_push_and_replace(lc, le, m);
 			goto again;
 		}
 		/*
 		 * Absorb the segment's TCP state: new data or a newer ACK
 		 * updates sequence/ACK/window; an equal ACK only updates
 		 * the window when WIN_GT() says it grew.
 		 */
 		if (tcp_data_len != 0 ||
 		    SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
 			le->next_seq += tcp_data_len;
 			le->ack_seq = th->th_ack;
 			le->window = th->th_win;
 			le->needs_merge = 1;
 		} else if (th->th_ack == le->ack_seq) {
 			if (WIN_GT(th->th_win, le->window)) {
 				le->window = th->th_win;
 				le->needs_merge = 1;
 			}
 		}
 
 		/* A pure ACK carries nothing more once its state is absorbed. */
 		if (tcp_data_len == 0) {
 			m_freem(m);
 			continue;
 		}
 
 		/* Merge TCP data checksum and length to head mbuf. */
 		tcp_lro_mbuf_append_pkthdr(le, m);
 
 		/*
 		 * Adjust the mbuf so that m_data points to the first byte of
 		 * the ULP payload.  Adjust the mbuf to avoid complications and
 		 * append new segment to existing mbuf chain.
 		 */
 		m_adj(m, m->m_pkthdr.len - tcp_data_len);
 		m_demote_pkthdr(m);
 		le->m_tail->m_next = m;
 		le->m_tail = m_last(m);
 	}
 }
 
 #ifdef TCPHPTS
 /*
  * Append the packet chain of "le" (m_head .. m_last_mbuf) to the input
  * queue of "tp" (t_in_pkt / t_tail_pkt) and clear le's references to it.
  * Asserts that the inp write lock is held.
  */
 static void
 tcp_queue_pkts(struct inpcb *inp, struct tcpcb *tp, struct lro_entry *le)
 {
 	INP_WLOCK_ASSERT(inp);
 
 	/* Link behind the current tail, or start the queue if it is empty. */
 	if (tp->t_in_pkt != NULL)
 		tp->t_tail_pkt->m_nextpkt = le->m_head;
 	else
 		tp->t_in_pkt = le->m_head;
 	tp->t_tail_pkt = le->m_last_mbuf;
 
 	/* The entry no longer refers to the queued packets. */
 	le->m_last_mbuf = NULL;
 	le->m_head = NULL;
 }
 
 /*
  * Return an mbuf into which a compressed ACK entry can be written.
  *
  * When "can_append_old_cmp" is set and the last mbuf on tp's input queue
  * is an M_ACKCMP mbuf with room for one more struct tcp_ackent, that mbuf
  * is returned and *new_m is set to 0.  Otherwise a fresh mbuf is
  * allocated (a cluster once INP_MBUF_L_ACKS is set on the inp), flagged
  * M_ACKCMP, and *new_m is set to 1.
  *
  * Returns NULL, leaving *new_m untouched, when the inp has no tcpcb or
  * the allocation fails.
  */
 static struct mbuf *
 tcp_lro_get_last_if_ackcmp(struct lro_ctrl *lc, struct lro_entry *le,
     struct inpcb *inp, int32_t *new_m, bool can_append_old_cmp)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct mbuf *tail, *fresh;
 
 	if (__predict_false(tp == NULL))
 		return (NULL);
 
 	/* Only look at the queue tail when the caller allows reuse. */
 	tail = can_append_old_cmp ? tp->t_tail_pkt : NULL;
 	if (tail != NULL && (tail->m_flags & M_ACKCMP) != 0) {
 		if (M_TRAILINGSPACE(tail) < sizeof(struct tcp_ackent)) {
 			/* Full: remember that larger mbufs are wanted. */
 			inp->inp_flags2 |= INP_MBUF_L_ACKS;
 		} else {
 			tcp_lro_log(tp, lc, le, NULL, 23, 0, 0, 0, 0);
 			*new_m = 0;
 			counter_u64_add(tcp_extra_mbuf, 1);
 			return (tail);
 		}
 	}
 
 	/* Allocate a new one; the size depends on INP_MBUF_L_ACKS. */
 	tcp_lro_log(tp, lc, le, NULL, 21, 0, 0, 0, 0);
 	if ((inp->inp_flags2 & INP_MBUF_L_ACKS) != 0)
 		fresh = m_getcl(M_NOWAIT, MT_DATA, M_ACKCMP | M_PKTHDR);
 	else
 		fresh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (__predict_false(fresh == NULL)) {
 		counter_u64_add(tcp_would_have_but, 1);
 		return (NULL);
 	}
 
 	counter_u64_add(tcp_comp_total, 1);
 	fresh->m_pkthdr.rcvif = lc->ifp;
 	fresh->m_flags |= M_ACKCMP;
 	*new_m = 1;
 	return (fresh);
 }
 
 /*
  * Look up the inpcb matching the addresses and ports recorded in "pa".
  *
  * The lookup is issued with INPLOOKUP_WLOCKPCB (presumably the inp comes
  * back write-locked -- confirm against in_pcblookup()).  Returns NULL for
  * LRO types that are neither IPv4/TCP nor IPv6/TCP, or whose address
  * family is not compiled in.
  */
 static struct inpcb *
 tcp_lro_lookup(struct ifnet *ifp, struct lro_parser *pa)
 {
 	switch (pa->data.lro_type) {
 #ifdef INET
 	case LRO_TYPE_IPV4_TCP:
 		return (in_pcblookup(&V_tcbinfo,
 		    pa->data.s_addr.v4, pa->data.s_port,
 		    pa->data.d_addr.v4, pa->data.d_port,
 		    INPLOOKUP_WLOCKPCB, ifp));
 #endif
 #ifdef INET6
 	case LRO_TYPE_IPV6_TCP:
 		return (in6_pcblookup(&V_tcbinfo,
 		    &pa->data.s_addr.v6, pa->data.s_port,
 		    &pa->data.d_addr.v6, pa->data.d_port,
 		    INPLOOKUP_WLOCKPCB, ifp));
 #endif
 	default:
 		return (NULL);
 	}
 }
 
 /*
  * Decide whether the segment described by "m"/"th" may be ACK-compressed.
  *
  * Returns true only if the header has no options or exactly one
  * timestamp option in the TCP_LRO_TS_OPTION layout, the flags are a
  * subset of ACK/PUSH/ECE/CWR with ACK set, and the segment has no data.
  *
  * Outputs:
  *   *ppts       - start of the option area when the header length equals
  *                 sizeof(*th) + TCPOLEN_TSTAMP_APPA, otherwise NULL.
  *   *other_opts - true when options other than the expected timestamp
  *                 layout are present (callers use it to decide on wakeup).
  */
 static inline bool
 tcp_lro_ack_valid(struct mbuf *m, struct tcphdr *th, uint32_t **ppts, bool *other_opts)
 {
 	bool compressible;
 
 	switch (th->th_off << 2) {
 	case (sizeof(*th)):
 		/* Bare header. */
 		*ppts = NULL;
 		*other_opts = false;
 		compressible = true;
 		break;
 	case (sizeof(*th) + TCPOLEN_TSTAMP_APPA):
 		/* Right size for a lone timestamp; verify the first word. */
 		*ppts = (uint32_t *)(th + 1);
 		*other_opts = (**ppts != TCP_LRO_TS_OPTION);
 		compressible = !*other_opts;
 		break;
 	default:
 		/* Some other option mix. */
 		*ppts = NULL;
 		*other_opts = true;
 		compressible = false;
 		break;
 	}
 
 	/*
 	 * Regardless of options: only ACK, PUSH, ECE and CWR may be set,
 	 * ACK must be set, and the segment must not carry data.
 	 */
 	if ((tcp_get_flags(th) & ~(TH_ACK | TH_PUSH | TH_ECE | TH_CWR)) != 0 ||
 	    m->m_pkthdr.lro_tcp_d_len != 0 ||
 	    (tcp_get_flags(th) & TH_ACK) == 0)
 		compressible = false;
 
 	return (compressible);
 }
 
 /*
  * Try to hand the packets of "le" straight to the input queue of the
  * matching TCP connection instead of merging them.
  *
  * Each queued packet is run through do_bpf_strip_and_compress(); packets
  * that remain afterwards are appended to the connection's queue with
  * tcp_queue_pkts(), and the stack's tfb_do_queued_segments() hook is
  * invoked when a wakeup is wanted.
  *
  * Returns 0 on success, or TCP_LRO_CANNOT when this path does not apply
  * (the caller then falls back to the regular condense-and-flush path).
  */
 static int
 tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct mbuf **pp, *cmp, *mv_to;
 	struct ifnet *lagg_ifp;
  	bool bpf_req, lagg_bpf_req, should_wake, can_append_old_cmp;
 
 	/*
 	 * Bail out when tcplro_stacks_wanting_mbufq is zero, when the
 	 * entry carries a VLAN id, or when an inner header was parsed.
 	 */
 	if ((tcplro_stacks_wanting_mbufq == 0) ||
 	    (le->outer.data.vlan_id != 0) ||
 	    (le->inner.data.lro_type != LRO_TYPE_NONE))
 		return (TCP_LRO_CANNOT);
 
 #ifdef INET6
 	/*
 	 * Be proactive about unspecified IPv6 address in source. As
 	 * we use all-zero to indicate unbounded/unconnected pcb,
 	 * unspecified IPv6 address can be used to confuse us.
 	 *
 	 * Note that packets with unspecified IPv6 destination is
 	 * already dropped in ip6_input.
 	 */
 	if (__predict_false(le->outer.data.lro_type == LRO_TYPE_IPV6_TCP &&
 	    IN6_IS_ADDR_UNSPECIFIED(&le->outer.data.s_addr.v6)))
 		return (TCP_LRO_CANNOT);
 
 	/*
 	 * NOTE(review): the inner type is always LRO_TYPE_NONE here
 	 * because of the early return above, so this test cannot fire.
 	 */
 	if (__predict_false(le->inner.data.lro_type == LRO_TYPE_IPV6_TCP &&
 	    IN6_IS_ADDR_UNSPECIFIED(&le->inner.data.s_addr.v6)))
 		return (TCP_LRO_CANNOT);
 #endif
 	/* Lookup inp, if any. */
 	inp = tcp_lro_lookup(lc->ifp,
 	    (le->inner.data.lro_type == LRO_TYPE_NONE) ? &le->outer : &le->inner);
 	if (inp == NULL)
 		return (TCP_LRO_CANNOT);
 
 	/* From here on every early return must drop the inp write lock. */
 	counter_u64_add(tcp_inp_lro_locks_taken, 1);
 
 	/* Get TCP control structure. */
 	tp = intotcpcb(inp);
 
 	/* Check if the inp is dead, Jim. */
 	if (tp->t_state == TCPS_TIME_WAIT) {
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
 	/* Record the LRO CPU on the inp once, after lc has settled on one. */
 	if ((inp->inp_irq_cpu_set == 0)  && (lc->lro_cpu_is_set == 1)) {
 		inp->inp_irq_cpu = lc->lro_last_cpu;
 		inp->inp_irq_cpu_set = 1;
 	}
 	/* Check if the transport doesn't support the needed optimizations. */
 	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
 
 	/* INP_MBUF_QUEUE_READY suppresses the wakeup by default. */
 	if (inp->inp_flags2 & INP_MBUF_QUEUE_READY)
 		should_wake = false;
 	else
 		should_wake = true;
 	/* Check if packets should be tapped to BPF. */
 	bpf_req = bpf_peers_present(lc->ifp->if_bpf);
 	lagg_bpf_req = false;
 	lagg_ifp = NULL;
 	if (lc->ifp->if_type == IFT_IEEE8023ADLAG ||
 	    lc->ifp->if_type == IFT_INFINIBANDLAG) {
 		struct lagg_port *lp = lc->ifp->if_lagg;
 		struct lagg_softc *sc = lp->lp_softc;
 
 		lagg_ifp = sc->sc_ifp;
 		if (lagg_ifp != NULL)
 			lagg_bpf_req = bpf_peers_present(lagg_ifp->if_bpf);
 	}
 
 	/*
 	 * Strip and compress all the incoming packets.  "pp" is the link
 	 * through which the current packet is reached, so the helper can
 	 * unlink or insert packets in place.
 	 */
  	can_append_old_cmp = true;
 	cmp = NULL;
 	for (pp = &le->m_head; *pp != NULL; ) {
 		mv_to = NULL;
 		if (do_bpf_strip_and_compress(inp, lc, le, pp,
 			&cmp, &mv_to, &should_wake, bpf_req,
  			lagg_bpf_req, lagg_ifp, can_append_old_cmp) == false) {
 			/* Advance to next mbuf. */
 			pp = &(*pp)->m_nextpkt;
  			/*
  			 * Once we have appended we can't look in the pending
  			 * inbound packets for a compressed ack to append to.
  			 */
  			can_append_old_cmp = false;
  			/*
  			 * Once we append we also need to stop adding to any
  			 * compressed ack we were remembering. A new cmp
  			 * ack will be required.
  			 */
  			cmp = NULL;
  			tcp_lro_log(tp, lc, le, NULL, 25, 0, 0, 0, 0);
 		} else if (mv_to != NULL) {
 			/* We are asked to move pp up */
 			pp = &mv_to->m_nextpkt;
  			tcp_lro_log(tp, lc, le, NULL, 24, 0, 0, 0, 0);
 		} else
  			tcp_lro_log(tp, lc, le, NULL, 26, 0, 0, 0, 0);
 	}
 	/*
 	 * Update "m_last_mbuf", if any.  When "pp" moved off the head it
 	 * points at the m_nextpkt field of the last remaining mbuf.
 	 */
 	if (pp == &le->m_head)
 		le->m_last_mbuf = *pp;
 	else
 		le->m_last_mbuf = __containerof(pp, struct mbuf, m_nextpkt);
 
 	/* Check if any data mbufs left. */
 	if (le->m_head != NULL) {
 		counter_u64_add(tcp_inp_lro_direct_queue, 1);
 		tcp_lro_log(tp, lc, le, NULL, 22, 1, inp->inp_flags2, 0, 1);
 		tcp_queue_pkts(inp, tp, le);
 	}
 	if (should_wake) {
 		/* Wakeup */
 		counter_u64_add(tcp_inp_lro_wokeup_queue, 1);
 		/*
 		 * A non-zero return skips the unlock below; presumably the
 		 * hook has already released the inp -- confirm against the
 		 * tfb_do_queued_segments contract.
 		 */
 		if ((*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0))
 			inp = NULL;
 	}
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 	return (0);	/* Success. */
 }
 #endif
 
 /*
  * Flush one LRO entry and return it to the free list.
  *
  * With TCPHPTS compiled in, tcp_lro_flush_tcphpts() is tried first; if it
  * reports an error (or TCPHPTS is not compiled in) the queued packets are
  * condensed and the result is passed up via tcp_flush_out_entry().
  * In all cases the flush counter is bumped, the entry is zeroed and it is
  * put at the head of lc->lro_free.
  */
 void
 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
 {
 #ifdef TCPHPTS
 	int error;
 #endif
 
 	NET_EPOCH_ASSERT();
 #ifdef TCPHPTS
 	/* The direct-queue attempt runs in the interface's vnet. */
 	CURVNET_SET(lc->ifp->if_vnet);
 	error = tcp_lro_flush_tcphpts(lc, le);
 	CURVNET_RESTORE();
 	if (error != 0) {
 #endif
 		/* Regular path: merge what can be merged, then deliver. */
 		tcp_lro_condense(lc, le);
 		tcp_flush_out_entry(lc, le);
 #ifdef TCPHPTS
 	}
 #endif
 	lc->lro_flushed++;
 	/* Recycle the entry. */
 	bzero(le, sizeof(*le));
 	LIST_INSERT_HEAD(&lc->lro_free, le, next);
 }
 
 /*
  * tcp_lro_msb_64(x): isolate the most significant set bit of a 64-bit
  * value.  The portable version returns 0 for x == 0; the flsll() based
  * macro must not be used with x == 0.
  */
 #ifdef HAVE_INLINE_FLSLL
 #define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
 #else
 static inline uint64_t
 tcp_lro_msb_64(uint64_t x)
 {
 	unsigned int shift;
 
 	/* Smear the highest set bit into every position below it ... */
 	for (shift = 1; shift < 64; shift <<= 1)
 		x |= (x >> shift);
 
 	/* ... then cancel everything except that highest bit. */
 	return (x ^ (x >> 1));
 }
 #endif
 
 /*
  * The tcp_lro_sort() routine is comparable to qsort(), except it has
  * a worst case complexity limit of O(MIN(N,64)*N), where N is the
  * number of elements to sort and 64 is the number of sequence bits
  * available. The algorithm is bit-slicing the 64-bit sequence number,
  * sorting one bit at a time from the most significant bit until the
  * least significant one, skipping the constant bits. This is
  * typically called a radix sort.
  */
 static void
 tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
 {
 	struct lro_mbuf_sort temp;
 	uint64_t ones;
 	uint64_t zeros;
 	uint32_t x;
 	uint32_t y;
 
 repeat:
 	/* for small arrays insertion sort is faster */
 	if (size <= 12) {
 		for (x = 1; x < size; x++) {
 			temp = parray[x];
 			for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
 				parray[y] = parray[y - 1];
 			parray[y] = temp;
 		}
 		return;
 	}
 
 	/* compute sequence bits which are constant */
 	ones = 0;
 	zeros = 0;
 	for (x = 0; x != size; x++) {
 		ones |= parray[x].seq;
 		zeros |= ~parray[x].seq;
 	}
 
 	/* compute bits which are not constant into "ones" */
 	ones &= zeros;
 	if (ones == 0)
 		return;
 
 	/* pick the most significant bit which is not constant */
 	ones = tcp_lro_msb_64(ones);
 
 	/*
 	 * Move entries having cleared sequence bits to the beginning
 	 * of the array:
 	 */
 	for (x = y = 0; y != size; y++) {
 		/* skip set bits */
 		if (parray[y].seq & ones)
 			continue;
 		/* swap entries */
 		temp = parray[x];
 		parray[x] = parray[y];
 		parray[y] = temp;
 		x++;
 	}
 
 	KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
 
 	/* sort zeros */
 	tcp_lro_sort(parray, x);
 
 	/* sort ones */
 	parray += x;
 	size -= x;
 	goto repeat;
 }
 
 /*
  * Process every mbuf queued with tcp_lro_queue_mbuf() and flush all
  * active LRO entries.
  *
  * The queued mbufs are sorted by their sequence key so that packets of
  * the same stream become adjacent, then fed one by one into
  * tcp_lro_rx_common().  Packets the LRO engine refuses are passed
  * directly to the interface input routine.  On return the queue is empty.
  */
 void
 tcp_lro_flush_all(struct lro_ctrl *lc)
 {
 	uint64_t seq;
 	uint64_t nseq;
 	unsigned x;
 
 	NET_EPOCH_ASSERT();
 	/* check if no mbufs to flush */
 	if (lc->lro_mbuf_count == 0)
 		goto done;
 	/*
 	 * Until a CPU has been declared, count consecutive calls that ran
 	 * on the same CPU and declare it once the threshold is exceeded.
 	 */
 	if (lc->lro_cpu_is_set == 0) {
 		if (lc->lro_last_cpu == curcpu) {
 			lc->lro_cnt_of_same_cpu++;
 			/* Have we reached the threshold to declare a cpu? */
 			if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
 				lc->lro_cpu_is_set = 1;
 		} else {
 			lc->lro_last_cpu = curcpu;
 			lc->lro_cnt_of_same_cpu = 0;
 		}
 	}
 	CURVNET_SET(lc->ifp->if_vnet);
 
 	/* get current time */
 	binuptime(&lc->lro_last_queue_time);
 
 	/* sort all mbufs according to stream */
 	tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
 
 	/* input data into LRO engine, stream by stream */
 	seq = 0;
 	for (x = 0; x != lc->lro_mbuf_count; x++) {
 		struct mbuf *mb;
 
 		/* get mbuf */
 		mb = lc->lro_mbuf_data[x].mb;
 
 		/* get sequence number, masking away the packet index */
 		nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
 
 		/* check for new stream */
 		if (seq != nseq) {
 			seq = nseq;
 
 			/* flush active streams */
 			tcp_lro_rx_done(lc);
 		}
 
 		/* add packet to LRO engine */
 		if (tcp_lro_rx_common(lc, mb, 0, false) != 0) {
  			/* Flush anything we have accumulated */
  			tcp_lro_flush_active(lc);
 			/* input packet to network layer */
 			(*lc->ifp->if_input)(lc->ifp, mb);
 			lc->lro_queued++;
 			lc->lro_flushed++;
 		}
 	}
 	CURVNET_RESTORE();
 done:
 	/* flush active streams */
 	tcp_lro_rx_done(lc);
 
 #ifdef TCPHPTS
 	tcp_run_hpts();
 #endif
 	/* The queue has been consumed. */
 	lc->lro_mbuf_count = 0;
 }
 
 #ifdef TCPHPTS
 /*
  * Fill the compact ACK record "ae" from TCP header "th" of mbuf "m".
  *
  * "ts_ptr" points at the start of the timestamp option area, or is NULL
  * when the segment has none; "iptos" is stored as the record's codepoint.
  * Sequence, ACK, window and timestamp values are stored in host order.
  */
 static void
 build_ack_entry(struct tcp_ackent *ae, struct tcphdr *th, struct mbuf *m,
     uint32_t *ts_ptr, uint16_t iptos)
 {
 	/* Record which kind of receive timestamp the mbuf carries. */
 	if ((m->m_flags & M_TSTMP_LRO) != 0)
 		ae->flags = TSTMP_LRO;
 	else if ((m->m_flags & M_TSTMP) != 0)
 		ae->flags = TSTMP_HDWR;
 	else
 		ae->flags = 0;
 	ae->timestamp = m->m_pkthdr.rcv_tstmp;
 
 	/* Summarise the TCP header. */
 	ae->flags |= tcp_get_flags(th);
 	ae->seq = ntohl(th->th_seq);
 	ae->ack = ntohl(th->th_ack);
 	ae->win = ntohs(th->th_win);
 
 	/* Timestamp option values, when present. */
 	if (ts_ptr != NULL) {
 		ae->flags |= HAS_TSTMP;
 		ae->ts_value = ntohl(ts_ptr[1]);
 		ae->ts_echo = ntohl(ts_ptr[2]);
 	}
 
 	ae->codepoint = iptos;
 }
 
 /*
  * Do BPF tap for either ACK_CMP packets or MBUF QUEUE type packets
  * and strip all, but the IPv4/IPv6 header.
  */
 /*
  * Process the packet reached through "*pp" for the direct-queue path.
  *
  * The packet is tapped to BPF when requested, trimmed so that it starts
  * at the IP header, and then either
  *   - summarised into a struct tcp_ackent inside an M_ACKCMP mbuf and
  *     freed (returns true; "*pp" then refers to the following packet), or
  *   - left in the chain untouched apart from the trimming (returns false;
  *     the caller advances "pp" itself).
  *
  * Parameters:
  *   cmp         - in/out: the compressed-ACK mbuf currently being filled,
  *                 or NULL if none yet.
  *   mv_to       - out: set to a newly inserted compressed-ACK mbuf so the
  *                 caller can move "pp" past it.
  *   should_wake - out: set to true when other TCP options are present and
  *                 the inp has not set INP_DONT_SACK_QUEUE.
  *   can_append_old_cmp - forwarded to tcp_lro_get_last_if_ackcmp().
  */
 static bool
 do_bpf_strip_and_compress(struct inpcb *inp, struct lro_ctrl *lc,
     struct lro_entry *le, struct mbuf **pp, struct mbuf **cmp, struct mbuf **mv_to,
     bool *should_wake, bool bpf_req, bool lagg_bpf_req, struct ifnet *lagg_ifp, bool can_append_old_cmp)
 {
 	union {
 		void *ptr;
 		struct ip *ip4;
 		struct ip6_hdr *ip6;
 	} l3;
 	struct mbuf *m;
 	struct mbuf *nm;
 	struct tcphdr *th;
 	struct tcp_ackent *ack_ent;
 	uint32_t *ts_ptr;
 	int32_t n_mbuf;
 	bool other_opts, can_compress;
 	uint8_t lro_type;
 	uint16_t iptos;
 	int tcp_hdr_offset;
 	int idx;
 
 	/* Get current mbuf. */
 	m = *pp;
 
 	/* Let the BPF see the packet */
 	if (__predict_false(bpf_req))
 		ETHER_BPF_MTAP(lc->ifp, m);
 
 	if (__predict_false(lagg_bpf_req))
 		ETHER_BPF_MTAP(lagg_ifp, m);
 
 	/*
 	 * Work out how many leading bytes to strip: everything in front of
 	 * the TCP header except one IP header of the detected family.
 	 * An unknown type drops the packet via "compressed".
 	 */
 	tcp_hdr_offset = m->m_pkthdr.lro_tcp_h_off;
 	lro_type = le->inner.data.lro_type;
 	switch (lro_type) {
 	case LRO_TYPE_NONE:
 		lro_type = le->outer.data.lro_type;
 		switch (lro_type) {
 		case LRO_TYPE_IPV4_TCP:
 			tcp_hdr_offset -= sizeof(*le->outer.ip4);
 			m->m_pkthdr.lro_etype = ETHERTYPE_IP;
 			break;
 		case LRO_TYPE_IPV6_TCP:
 			tcp_hdr_offset -= sizeof(*le->outer.ip6);
 			m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
 			break;
 		default:
 			goto compressed;
 		}
 		break;
 	case LRO_TYPE_IPV4_TCP:
 		tcp_hdr_offset -= sizeof(*le->outer.ip4);
 		m->m_pkthdr.lro_etype = ETHERTYPE_IP;
 		break;
 	case LRO_TYPE_IPV6_TCP:
 		tcp_hdr_offset -= sizeof(*le->outer.ip6);
 		m->m_pkthdr.lro_etype = ETHERTYPE_IPV6;
 		break;
 	default:
 		goto compressed;
 	}
 
 	MPASS(tcp_hdr_offset >= 0);
 
 	/* Strip, and keep the recorded TCP header offset consistent. */
 	m_adj(m, tcp_hdr_offset);
 	m->m_flags |= M_LRO_EHDRSTRP;
 	m->m_flags &= ~M_ACKCMP;
 	m->m_pkthdr.lro_tcp_h_off -= tcp_hdr_offset;
 
 	th = tcp_lro_get_th(m);
 
 	th->th_sum = 0;		/* TCP checksum is valid. */
 
 	/* Check if ACK can be compressed */
 	can_compress = tcp_lro_ack_valid(m, th, &ts_ptr, &other_opts);
 
 	/* Now lets look at the should wake states */
 	if ((other_opts == true) &&
 	    ((inp->inp_flags2 & INP_DONT_SACK_QUEUE) == 0)) {
 		/*
 		 * If there are other options (SACK?) and the
 		 * tcp endpoint has not expressly told us it does
 		 * not care about SACKS, then we should wake up.
 		 */
 		*should_wake = true;
 	}
 	/* Is the ack compressible? */
 	if (can_compress == false)
 		goto done;
 	/* Does the TCP endpoint support ACK compression? */
 	if ((inp->inp_flags2 & INP_MBUF_ACKCMP) == 0)
 		goto done;
 
 	/* Lets get the TOS/traffic class field */
 	l3.ptr = mtod(m, void *);
 	switch (lro_type) {
 	case LRO_TYPE_IPV4_TCP:
 		iptos = l3.ip4->ip_tos;
 		break;
 	case LRO_TYPE_IPV6_TCP:
 		iptos = IPV6_TRAFFIC_CLASS(l3.ip6);
 		break;
 	default:
 		iptos = 0;	/* Keep compiler happy. */
 		break;
 	}
 	/* Now lets get space if we don't have some already */
 	if (*cmp == NULL) {
 new_one:
 		nm = tcp_lro_get_last_if_ackcmp(lc, le, inp, &n_mbuf, can_append_old_cmp);
 		if (__predict_false(nm == NULL))
 			goto done;
 		*cmp = nm;
 		if (n_mbuf) {
 			/*
 			 *  Link in the new cmp ack to our in-order place,
 			 * first set our cmp ack's next to where we are.
 			 */
 			nm->m_nextpkt = m;
 			(*pp) = nm;
 			/*
 			 * Set it up so mv_to is advanced to our
 			 * compressed ack. This way the caller can
 			 * advance pp to the right place.
 			 */
 			*mv_to = nm;
 			/*
 			 * Advance it here locally as well.
 			 */
 			pp = &nm->m_nextpkt;
 		}
 	} else {
 		/* We have one already we are working on */
 		nm = *cmp;
 		if (M_TRAILINGSPACE(nm) < sizeof(struct tcp_ackent)) {
 			/* We ran out of space */
 			inp->inp_flags2 |= INP_MBUF_L_ACKS;
 			goto new_one;
 		}
 	}
 	MPASS(M_TRAILINGSPACE(nm) >= sizeof(struct tcp_ackent));
 	counter_u64_add(tcp_inp_lro_compressed, 1);
 	le->compressed++;
 	/* We can add in to the one on the tail */
 	ack_ent = mtod(nm, struct tcp_ackent *);
 	/* Next free slot: m_len counts the bytes of entries already stored. */
 	idx = (nm->m_len / sizeof(struct tcp_ackent));
 	build_ack_entry(&ack_ent[idx], th, m, ts_ptr, iptos);
 
 	/* Bump the size of both pkt-hdr and len */
 	nm->m_len += sizeof(struct tcp_ackent);
 	nm->m_pkthdr.len += sizeof(struct tcp_ackent);
 compressed:
 	/* Advance to next mbuf before freeing. */
 	*pp = m->m_nextpkt;
 	m->m_nextpkt = NULL;
 	m_freem(m);
 	return (true);
 done:
 	/* The packet stays in the chain for regular queueing. */
 	counter_u64_add(tcp_uncomp_total, 1);
 	le->uncompressed++;
 	return (false);
 }
 #endif
 
 /*
  * Select the hash bucket of "lc" for mbuf "m".
  *
  * The mbuf's flowid is used when M_HASHTYPE_ISHASH() holds; otherwise the
  * hash is the sum of the raw address words recorded by the parser.
  */
 static struct lro_head *
 tcp_lro_rx_get_bucket(struct lro_ctrl *lc, struct mbuf *m, struct lro_parser *parser)
 {
 	u_long hash;
 	unsigned i;
 
 	if (!M_HASHTYPE_ISHASH(m)) {
 		/* No usable hash on the mbuf: derive one from the addresses. */
 		hash = 0;
 		for (i = 0; i != LRO_RAW_ADDRESS_MAX; i++)
 			hash += parser->data.raw[i];
 	} else {
 		hash = m->m_pkthdr.flowid;
 	}
 
 	return (&lc->lro_hash[hash % lc->lro_hashsz]);
 }
 
 /*
  * Feed one received packet into the LRO engine.
  *
  * Parameters:
  *   csum     - when non-zero it is used in place of th->th_sum to derive
  *              the checksum of the TCP payload.
  *   use_hash - when false every stream is kept in bucket 0; otherwise the
  *              bucket is chosen by tcp_lro_rx_get_bucket().
  *
  * Returns 0 when the mbuf was taken over: queued on an existing or new
  * entry, or freed as an out-of-date pure ACK.  Any non-zero return
  * (TCP_LRO_CANNOT, TCP_LRO_NOT_SUPPORTED, TCP_LRO_NO_ENTRIES or an error
  * from a helper) means the mbuf was not consumed here.
  */
 static int
 tcp_lro_rx_common(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, bool use_hash)
 {
 	struct lro_parser pi;	/* inner address data */
 	struct lro_parser po;	/* outer address data */
 	struct lro_parser *pa;	/* current parser for TCP stream */
 	struct lro_entry *le;
 	struct lro_head *bucket;
 	struct tcphdr *th;
 	int tcp_data_len;
 	int tcp_opt_len;
 	int error;
 	uint16_t tcp_data_sum;
 
 #ifdef INET
 	/* Quickly decide if packet cannot be LRO'ed */
 	if (__predict_false(V_ipforwarding != 0))
 		return (TCP_LRO_CANNOT);
 #endif
 #ifdef INET6
 	/* Quickly decide if packet cannot be LRO'ed */
 	if (__predict_false(V_ip6_forwarding != 0))
 		return (TCP_LRO_CANNOT);
 #endif
 	if (((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) !=
 	     ((CSUM_DATA_VALID | CSUM_PSEUDO_HDR))) || 
 	    (m->m_pkthdr.csum_data != 0xffff)) {
 		/* 
 		 * The checksum either did not have hardware offload
 		 * or it was a bad checksum. We can't LRO such
 		 * a packet.
 		 */
 		counter_u64_add(tcp_bad_csums, 1);
 		return (TCP_LRO_CANNOT);
 	}
 	/* We expect a contiguous header [eh, ip, tcp]. */
 	pa = tcp_lro_parser(m, &po, &pi, true);
 	if (__predict_false(pa == NULL))
 		return (TCP_LRO_NOT_SUPPORTED);
 
 	/* We don't expect any padding. */
 	error = tcp_lro_trim_mbuf_chain(m, pa);
 	if (__predict_false(error != 0))
 		return (error);
 
 #ifdef INET
 	/* IPv4 gets an additional header check. */
 	switch (pa->data.lro_type) {
 	case LRO_TYPE_IPV4_TCP:
 		error = tcp_lro_rx_ipv4(lc, m, pa->ip4);
 		if (__predict_false(error != 0))
 			return (error);
 		break;
 	default:
 		break;
 	}
 #endif
 	/* If no hardware or arrival stamp on the packet add timestamp */
 	if ((m->m_flags & (M_TSTMP_LRO | M_TSTMP)) == 0) {
 		m->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); 
 		m->m_flags |= M_TSTMP_LRO;
 	}
 
 	/* Get pointer to TCP header. */
 	th = pa->tcp;
 
 	/* Don't process SYN packets. */
 	if (__predict_false(tcp_get_flags(th) & TH_SYN))
 		return (TCP_LRO_CANNOT);
 
 	/* Get total TCP header length and compute payload length. */
 	tcp_opt_len = (th->th_off << 2);
 	tcp_data_len = m->m_pkthdr.len - ((uint8_t *)th -
 	    (uint8_t *)m->m_data) - tcp_opt_len;
 	tcp_opt_len -= sizeof(*th);
 
 	/* Don't process invalid TCP headers. */
 	if (__predict_false(tcp_opt_len < 0 || tcp_data_len < 0))
 		return (TCP_LRO_CANNOT);
 
 	/* Compute TCP data only checksum. */
 	if (tcp_data_len == 0)
 		tcp_data_sum = 0;	/* no data, no checksum */
 	else if (__predict_false(csum != 0))
 		tcp_data_sum = tcp_lro_rx_csum_data(pa, ~csum);
 	else
 		tcp_data_sum = tcp_lro_rx_csum_data(pa, ~th->th_sum);
 
 	/* Save TCP info in mbuf. */
 	m->m_nextpkt = NULL;
 	m->m_pkthdr.rcvif = lc->ifp;
 	m->m_pkthdr.lro_tcp_d_csum = tcp_data_sum;
 	m->m_pkthdr.lro_tcp_d_len = tcp_data_len;
 	m->m_pkthdr.lro_tcp_h_off = ((uint8_t *)th - (uint8_t *)m->m_data);
 	m->m_pkthdr.lro_nsegs = 1;
 
 	/* Get hash bucket. */
 	if (!use_hash) {
 		bucket = &lc->lro_hash[0];
 	} else {
 		bucket = tcp_lro_rx_get_bucket(lc, m, pa);
 	}
 
 	/* Try to find a matching previous segment. */
 	LIST_FOREACH(le, bucket, hash_next) {
 		/* Compare addresses and ports. */
 		if (lro_address_compare(&po.data, &le->outer.data) == false ||
 		    lro_address_compare(&pi.data, &le->inner.data) == false)
 			continue;
 
 		/* Check if no data and old ACK. */
 		if (tcp_data_len == 0 &&
 		    SEQ_LT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
 			/* Consumed: an ACK older than one already seen. */
 			m_freem(m);
 			return (0);
 		}
 
 		/* Mark "m" in the last spot. */
 		le->m_last_mbuf->m_nextpkt = m;
 		/* Now set the tail to "m". */
 		le->m_last_mbuf = m;
 		return (0);
 	}
 
 	/* Try to find an empty slot. */
 	if (LIST_EMPTY(&lc->lro_free))
 		return (TCP_LRO_NO_ENTRIES);
 
 	/* Start a new segment chain. */
 	le = LIST_FIRST(&lc->lro_free);
 	LIST_REMOVE(le, next);
 	tcp_lro_active_insert(lc, bucket, le);
 
 	/* Make sure the headers are set. */
 	le->inner = pi;
 	le->outer = po;
 
 	/* Store time this entry was allocated. */
 	le->alloc_time = lc->lro_last_queue_time;
 
 	tcp_set_entry_to_mbuf(lc, le, m, th);
 
 	/* Now set the tail to "m". */
 	le->m_last_mbuf = m;
 
 	return (0);
 }
 
 /*
  * Offer a single received packet to the LRO engine.
  *
  * Returns 0 when the packet was accepted.  Returns TCP_LRO_CANNOT without
  * touching any LRO state when the mbuf is not marked as having a verified
  * checksum; any other non-zero value is the error from
  * tcp_lro_rx_common(), in which case the active entries have been
  * flushed so that packet order is preserved.
  */
 int
 tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
 {
 	bool csum_verified;
 	int error;
 
 	/*
 	 * Both CSUM_DATA_VALID and CSUM_PSEUDO_HDR must be set and the
 	 * checksum data must read 0xffff; otherwise the checksum was not
 	 * offloaded or was bad, and such a packet cannot be LRO'ed.
 	 */
 	csum_verified =
 	    (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PSEUDO_HDR)) ==
 	    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR) &&
 	    m->m_pkthdr.csum_data == 0xffff;
 	if (!csum_verified) {
 		counter_u64_add(tcp_bad_csums, 1);
 		return (TCP_LRO_CANNOT);
 	}
 
 	/* Stamp the arrival time used by the engine. */
 	binuptime(&lc->lro_last_queue_time);
 
 	CURVNET_SET(lc->ifp->if_vnet);
 	error = tcp_lro_rx_common(lc, m, csum, true);
 	/*
 	 * On failure push out everything accumulated ahead of this
 	 * packet, since it cannot be LRO'd. This preserves order.
 	 */
 	if (__predict_false(error != 0))
 		tcp_lro_flush_active(lc);
 	CURVNET_RESTORE();
 
 	return (error);
 }
 
 void
 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
 {
 	NET_EPOCH_ASSERT();
 	/* sanity checks */
 	if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
 	    lc->lro_mbuf_max == 0)) {
 		/* packet drop */
 		m_freem(mb);
 		return;
 	}
 
 	/* check if packet is not LRO capable */
 	if (__predict_false((lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
 		/* input packet to network layer */
 		(*lc->ifp->if_input) (lc->ifp, mb);
 		return;
 	}
 
  	/* If no hardware or arrival stamp on the packet add timestamp */
  	if ((tcplro_stacks_wanting_mbufq > 0) &&
  	    (tcp_less_accurate_lro_ts == 0) &&
  	    ((mb->m_flags & M_TSTMP) == 0)) {
  		/* Add in an LRO time since no hardware */
  		binuptime(&lc->lro_last_queue_time);
  		mb->m_pkthdr.rcv_tstmp = bintime2ns(&lc->lro_last_queue_time); 
  		mb->m_flags |= M_TSTMP_LRO;
  	}
 
 	/* create sequence number */
 	lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
 	    (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
 	    (((uint64_t)mb->m_pkthdr.flowid) << 24) |
 	    ((uint64_t)lc->lro_mbuf_count);
 
 	/* enter mbuf */
 	lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;
 
 	/* flush if array is full */
 	if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
 		tcp_lro_flush_all(lc);
 }
 
 /* end */
diff --git a/sys/netinet/tcp_offload.c b/sys/netinet/tcp_offload.c
index b93d6027a01e..102a4935631c 100644
--- a/sys/netinet/tcp_offload.c
+++ b/sys/netinet/tcp_offload.c
@@ -1,232 +1,233 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockopt.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_fib.h>
 #include <netinet6/in6_fib.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_offload.h>
 #define	TCPOUTFLAGS
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 
 int registered_toedevs;
 
 /*
  * Offer an outgoing connection to a TOE driver.
  *
  * Looks up the next hop for the destination in FIB 0 and, if the egress
  * interface has the matching TOE capability enabled and a TOE device
  * attached, calls that device's tod_connect method.
  *
  * Returns EOPNOTSUPP when no TOE device is registered, the capability is
  * not enabled or the interface has no TOE device; EHOSTUNREACH when no
  * next hop is found; otherwise whatever tod_connect returns.
  */
 int
 tcp_offload_connect(struct socket *so, struct sockaddr *nam)
 {
 	struct epoch_tracker et;
 	struct nhop_object *nh = NULL;
 	struct toedev *tod;
 	struct ifnet *ifp;
 	int needed_cap = 0;
 	int rc = EOPNOTSUPP;
 
 	INP_WLOCK_ASSERT(sotoinpcb(so));
 	KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6,
 	    ("%s: called with sa_family %d", __func__, nam->sa_family));
 
 	if (registered_toedevs == 0)
 		return (rc);
 
 	NET_EPOCH_ENTER(et);
 
 	/* Route lookup for whichever address families are compiled in. */
 	switch (nam->sa_family) {
 #ifdef INET
 	case AF_INET:
 		nh = fib4_lookup(0, ((struct sockaddr_in *)nam)->sin_addr,
 		    NHR_NONE, 0, 0);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		nh = fib6_lookup(0, &((struct sockaddr_in6 *)nam)->sin6_addr,
 		    NHR_NONE, 0, 0);
 		break;
 #endif
 	default:
 		break;
 	}
 	if (nh == NULL) {
 		rc = EHOSTUNREACH;
 		goto done;
 	}
 
 	ifp = nh->nh_ifp;
 
 	/* The interface must have TOE enabled for this address family. */
 	if (nam->sa_family == AF_INET)
 		needed_cap = IFCAP_TOE4;
 	else if (nam->sa_family == AF_INET6)
 		needed_cap = IFCAP_TOE6;
 	if (needed_cap != 0 && (ifp->if_capenable & needed_cap) == 0)
 		goto done;
 
 	tod = TOEDEV(ifp);
 	if (tod != NULL)
 		rc = tod->tod_connect(tod, so, nh, nam);
 done:
 	NET_EPOCH_EXIT(et);
 	return (rc);
 }
 
 /*
  * Invoke the handlers registered for the tcp_offload_listen_start event,
  * passing them the tcpcb.  The inpcb write lock must be held.
  */
 void
 tcp_offload_listen_start(struct tcpcb *tp)
 {
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	EVENTHANDLER_INVOKE(tcp_offload_listen_start, tp);
 }
 
 /*
  * Invoke the handlers registered for the tcp_offload_listen_stop event,
  * passing them the tcpcb.  The inpcb write lock must be held.
  */
 void
 tcp_offload_listen_stop(struct tcpcb *tp)
 {
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	EVENTHANDLER_INVOKE(tcp_offload_listen_stop, tp);
 }
 
 /*
  * Hand an mbuf to the TOE device attached to this connection through its
  * tod_input method.  tp->tod must be set and the inpcb write lock held.
  */
 void
 tcp_offload_input(struct tcpcb *tp, struct mbuf *m)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tod->tod_input(tod, tp, m);
 }
 
 int
 tcp_offload_output(struct tcpcb *tp)
 {
 	struct toedev *tod = tp->tod;
 	int error, flags;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	flags = tcp_outflags[tp->t_state];
 
 	if (flags & TH_RST) {
 		/* XXX: avoid repeated calls like we do for FIN */
 		error = tod->tod_send_rst(tod, tp);
 	} else if ((flags & TH_FIN || tp->t_flags & TF_NEEDFIN) &&
 	    (tp->t_flags & TF_SENTFIN) == 0) {
 		error = tod->tod_send_fin(tod, tp);
 		if (error == 0)
 			tp->t_flags |= TF_SENTFIN;
 	} else
 		error = tod->tod_output(tod, tp);
 
 	return (error);
 }
 
 /*
  * Forward to the TOE device's tod_rcvd method for this connection.
  * tp->tod must be set and the inpcb write lock held.
  */
 void
 tcp_offload_rcvd(struct tcpcb *tp)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tod->tod_rcvd(tod, tp);
 }
 
 /*
  * Pass a socket option direction and name to the TOE device's
  * tod_ctloutput method.  tp->tod must be set and the inpcb write lock
  * held.
  */
 void
 tcp_offload_ctloutput(struct tcpcb *tp, int sopt_dir, int sopt_name)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tod->tod_ctloutput(tod, tp, sopt_dir, sopt_name);
 }
 
 /*
  * Call the TOE device's tod_tcp_info method with the caller's tcp_info
  * structure.  tp->tod must be set and the inpcb write lock held.
  */
 void
 tcp_offload_tcp_info(struct tcpcb *tp, struct tcp_info *ti)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tod->tod_tcp_info(tod, tp, ti);
 }
 
 /*
  * Call the TOE device's tod_alloc_tls_session method for this connection
  * and return its result.  tp->tod must be set and the inpcb write lock
  * held.
  */
 int
 tcp_offload_alloc_tls_session(struct tcpcb *tp, struct ktls_session *tls,
     int direction)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	return (tod->tod_alloc_tls_session(tod, tp, tls, direction));
 }
 
 /*
  * Call the TOE device's tod_pcb_detach method for this connection.
  * tp->tod must be set and the inpcb write lock held.
  */
 void
 tcp_offload_detach(struct tcpcb *tp)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tod->tod_pcb_detach(tod, tp);
 }
 
 /*
  * Pass a sequence number and MTU to the TOE device's tod_pmtu_update
  * method.  tp->tod must be set and the inpcb write lock held.
  */
 void
 tcp_offload_pmtu_update(struct tcpcb *tp, tcp_seq seq, int mtu)
 {
 	struct toedev *tod = tp->tod;
 
 	KASSERT(tod != NULL, ("%s: tp->tod is NULL, tp %p", __func__, tp));
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	tod->tod_pmtu_update(tod, tp, seq, mtu);
 }
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
index ca619e66c07d..82aea5bdf0de 100644
--- a/sys/netinet/tcp_ratelimit.c
+++ b/sys/netinet/tcp_ratelimit.c
@@ -1,1792 +1,1793 @@
 /*-
  *
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2018-2020
  *	Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /**
  * Author: Randall Stewart <rrs@netflix.com>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_ratelimit.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/eventhandler.h>
 #include <sys/mutex.h>
 #include <sys/ck.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #define TCPSTATES		/* for logging */
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_ratelimit.h>
 #ifndef USECS_IN_SECOND
 #define USECS_IN_SECOND 1000000
 #endif
 /*
  * For the purposes of each send, what is the size
  * of an ethernet frame.
  */
 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
 #ifdef RATELIMIT
 
 /*
  * The following preferred table will seem weird to
  * the casual viewer. Why do we not have any rates below
  * 1Mbps? Why do we have a rate at 1.44Mbps called common?
  * Why do the rates cluster in the 1-100Mbps range more
  * than others? Why does the table jump around at the beginning
  * and then rise more consistently?
  *
  * Let me try to answer those questions. A lot of
  * this is dependent on the hardware. We have three basic
  * supporters of rate limiting:
  *
  * Chelsio - Supporting 16 configurable rates.
  * Mlx  - c4 supporting 13 fixed rates.
  * Mlx  - c5 & c6 supporting 127 configurable rates.
  *
  * The c4 is why we have a common rate that is available
  * in all rate tables. This is a selected rate from the
  * c4 table and we assure its available in all ratelimit
  * tables. This way the tcp_ratelimit code has an assured
  * rate it should always be able to get. This answers a
  * couple of the questions above.
  *
  * So what about the rest, well the table is built to
  * try to get the most out of a joint hardware/software
  * pacing system.  The software pacer will always pick
  * a rate higher than the b/w that it is estimating
  * on the path. This is done for two reasons.
  * a) So we can discover more b/w
  * and
  * b) So we can send a block of MSS's down and then
  *    have the software timer go off after the previous
  *    send is completely out of the hardware.
  *
  * But when we do <b> we don't want to have the delay
  * between the last packet sent by the hardware be
  * excessively long (to reach our desired rate).
  *
  * So let me give an example for clarity.
  *
  * Lets assume that the tcp stack sees that 29,110,000 bps is
  * what the bw of the path is. The stack would select the
  * rate 31Mbps. 31Mbps means that each send that is done
  * by the hardware will cause a 390 micro-second gap between
  * the packets sent at that rate. For 29,110,000 bps we
  * would need 416 micro-seconds gap between each send.
  *
  * Note that we are calculating a complete time for pacing
  * which includes the ethernet, IP and TCP overhead. So
  * a full 1514 bytes is used for the above calculations.
  * My testing has shown that both cards are also using this
  * as their basis i.e. full payload size of the ethernet frame.
  * The TCP stack caller needs to be aware of this and make the
  * appropriate overhead calculations be included in its choices.
  *
  * Now, continuing our example, we pick a MSS size based on the
  * delta between the two rates (416 - 390) divided into the rate
  * we really wish to send at rounded up.  That results in a MSS
  * send of 17 mss's at once. The hardware then will
  * run out of data in a single 17MSS send in 6,630 micro-seconds.
  *
  * On the other hand the software pacer will send more data
  * in 7,072 micro-seconds. This means that we will refill
  * the hardware 52 microseconds after it would have sent
  * next if it had not ran out of data. This is a win since we are
  * only sending every 7ms or so and yet all the packets are spaced on
  * the wire with 94% of what they should be and only
  * the last packet is delayed extra to make up for the
  * difference.
  *
  * Note that the above formula has two important caveats.
  * If we are over 100Mbps (b/w wise) we double the result
  * of the MSS calculation. The second caveat is if we are 500Mbps
  * or more we just send the maximum MSS at once i.e. 45MSS. At
  * the higher b/w's even the cards have limits to what times (timer granularity)
  * they can insert between packets and start to send more than one
  * packet at a time on the wire.
  *
  */
 /* The rate (entry 2 below) that is kept available in every rate table. */
 #define COMMON_RATE 180500
 /*
  * Canned rate table.  The first 16 entries form one ascending group and
  * the remaining entries form a second ascending group; the "rate N" labels
  * are the 1-based position of each entry in this array.  The values are
  * the numbers handed to the hardware as the rate; the labels give the
  * approximate bit rate they correspond to.
  */
 const uint64_t desired_rates[] = {
 	122500,			/* 1Mbps  - rate 1 */
 	180500,			/* 1.44Mbps - rate 2  common rate */
 	375000,			/* 3Mbps    - rate 3 */
 	625000,			/* 5Mbps    - rate 4 */
 	1250000,		/* 10Mbps   - rate 5 */
 	1875000,		/* 15Mbps   - rate 6 */
 	2500000,		/* 20Mbps   - rate 7 */
 	3125000,	       	/* 25Mbps   - rate 8 */
 	3750000,		/* 30Mbps   - rate 9 */
 	4375000,		/* 35Mbps   - rate 10 */
 	5000000,		/* 40Mbps   - rate 11 */
 	6250000,		/* 50Mbps   - rate 12 */
 	12500000,		/* 100Mbps  - rate 13 */
 	25000000,		/* 200Mbps  - rate 14 */
 	50000000,		/* 400Mbps  - rate 15 */
 	100000000,		/* 800Mbps  - rate 16 */
 	5625000,		/* 45Mbps   - rate 17 */
 	6875000,		/* 55Mbps   - rate 18 */
 	7500000,		/* 60Mbps   - rate 19 */
 	8125000,		/* 65Mbps   - rate 20 */
 	8750000,		/* 70Mbps   - rate 21 */
 	9375000,		/* 75Mbps   - rate 22 */
 	10000000,		/* 80Mbps   - rate 23 */
 	10625000,		/* 85Mbps   - rate 24 */
 	11250000,		/* 90Mbps   - rate 25 */
 	11875000,		/* 95Mbps   - rate 26 */
 	12500000,		/* 100Mbps  - rate 27 */
 	13750000,		/* 110Mbps  - rate 28 */
 	15000000,		/* 120Mbps  - rate 29 */
 	16250000,		/* 130Mbps  - rate 30 */
 	17500000,		/* 140Mbps  - rate 31 */
 	18750000,		/* 150Mbps  - rate 32 */
 	20000000,		/* 160Mbps  - rate 33 */
 	21250000,		/* 170Mbps  - rate 34 */
 	22500000,		/* 180Mbps  - rate 35 */
 	23750000,		/* 190Mbps  - rate 36 */
 	26250000,		/* 210Mbps  - rate 37 */
 	27500000,		/* 220Mbps  - rate 38 */
 	28750000,		/* 230Mbps  - rate 39 */
 	30000000,	       	/* 240Mbps  - rate 40 */
 	31250000,		/* 250Mbps  - rate 41 */
 	34375000,		/* 275Mbps  - rate 42 */
 	37500000,		/* 300Mbps  - rate 43 */
 	40625000,		/* 325Mbps  - rate 44 */
 	43750000,		/* 350Mbps  - rate 45 */
 	46875000,		/* 375Mbps  - rate 46 */
 	53125000,		/* 425Mbps  - rate 47 */
 	56250000,		/* 450Mbps  - rate 48 */
 	59375000,		/* 475Mbps  - rate 49 */
 	62500000,		/* 500Mbps  - rate 50 */
 	68750000,		/* 550Mbps  - rate 51 */
 	75000000,		/* 600Mbps  - rate 52 */
 	81250000,		/* 650Mbps  - rate 53 */
 	87500000,		/* 700Mbps  - rate 54 */
 	93750000,		/* 750Mbps  - rate 55 */
 	106250000,		/* 850Mbps  - rate 56 */
 	112500000,		/* 900Mbps  - rate 57 */
 	125000000,		/* 1Gbps    - rate 58 */
 	156250000,		/* 1.25Gbps - rate 59 */
 	187500000,		/* 1.5Gbps  - rate 60 */
 	218750000,		/* 1.75Gbps - rate 61 */
 	250000000,		/* 2Gbps    - rate 62 */
 	281250000,		/* 2.25Gbps - rate 63 */
 	312500000,		/* 2.5Gbps  - rate 64 */
 	343750000,		/* 2.75Gbps - rate 65 */
 	375000000,		/* 3Gbps    - rate 66 */
 	500000000,		/* 4Gbps    - rate 67 */
 	625000000,		/* 5Gbps    - rate 68 */
 	750000000,		/* 6Gbps    - rate 69 */
 	875000000,		/* 7Gbps    - rate 70 */
 	1000000000,		/* 8Gbps    - rate 71 */
 	1125000000,		/* 9Gbps    - rate 72 */
 	1250000000,		/* 10Gbps   - rate 73 */
 	1875000000,		/* 15Gbps   - rate 74 */
 	2500000000		/* 20Gbps   - rate 75 */
 };
 
 /* Number of entries in desired_rates[]. */
 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
 #define RS_ORDERED_COUNT 16	/*
 				 * Number of entries that are in
 				 * ascending order at the beginning
 				 * of the table; using more than
 				 * this requires a merge.
 				 */
 #define RS_NEXT_ORDER_GROUP 16	/*
 				 * Index in the table at which the
 				 * second ascending group of rates
 				 * begins.
 				 */
 #define ALL_HARDWARE_RATES 1004 /*
 				 * 1Meg - 1Gig in 1 Meg steps
 				 * plus 100, 200k  and 500k and
 				 * 10Gig
 				 */
 
 /* Bit-rate thresholds, in bits per second. */
 #define RS_ONE_MEGABIT_PERSEC 1000000
 #define RS_ONE_GIGABIT_PERSEC 1000000000
 #define RS_TEN_GIGABIT_PERSEC 10000000000
 
 /* List of per-interface rate sets; insertions are done under rs_mtx. */
 static struct head_tcp_rate_set int_rs;
 static struct mtx rs_mtx;
 /* Counts of rate sets that are live / awaiting deferred destruction. */
 uint32_t rs_number_alive;
 uint32_t rs_number_dead;
 /* Tunables exported below under net.inet.tcp.rl. */
 static uint32_t rs_floor_mss = 0;
 static uint32_t wait_time_floor = 8000;	/* 8 ms */
 static uint32_t rs_hw_floor_mss = 16;
 static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */
 
 static uint32_t mss_divisor = RL_DEFAULT_DIVISOR;
 static uint32_t even_num_segs = 1;
 static uint32_t even_threshold = 4;
 
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Ratelimit stats");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
     &rs_number_alive, 0,
     "Number of interfaces initialized for ratelimiting");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
     &rs_number_dead, 0,
     "Number of interfaces departing from ratelimiting");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
     &rs_floor_mss, 0,
     "Number of MSS that will override the normal minimums (0 means don't enforce)");
 /*
  * NOTE(review): the value argument here (2000) differs from the
  * initializer of wait_time_floor (8000) -- confirm which is intended.
  */
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
     &wait_time_floor, 2000,
     "Has b/w increases what is the wait floor we are willing to wait at the end?");
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
     &num_of_waits_allowed, 1,
     "How many time blocks on the end should software pacing be willing to wait?");
 
 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
     &rs_hw_floor_mss, 16,
     "Number of mss that are a minum for hardware pacing?");
 
 /*
  * NOTE(review): the three knobs below are declared uint32_t but exported
  * with SYSCTL_INT, unlike the SYSCTL_UINT entries above.
  */
 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW,
     &mss_divisor, RL_DEFAULT_DIVISOR,
     "The value divided into bytes per second to help establish mss size");
 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW,
     &even_num_segs, 1,
     "Do we round mss size up to an even number of segments for delayed ack");
 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW,
     &even_threshold, 4,
     "At what number of mss do we start rounding up to an even number of mss?");
 
 /*
  * Create the per-interface sysctl leaves for a rate set beneath the given
  * root node.  All OIDs are registered in rs->sysctl_ctx.
  *
  * "disable" is read-only when the set is flagged RS_INTF_NO_SUP and
  * read-write otherwise.  When DETAILED_RATELIMIT_SYSCTL is defined and the
  * set has a rate table, a "rate" subtree with one numbered node per table
  * entry is added as well.
  */
 static void
 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
 {
 	/*
 	 * Add sysctl entries for this interface.
 	 */
 	if (rs->rs_flags & RS_INTF_NO_SUP) {
 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
 		   SYSCTL_CHILDREN(rl_sysctl_root),
 		   OID_AUTO, "disable", CTLFLAG_RD,
 		   &rs->rs_disable, 0,
 		   "Disable this interface from new hdwr limiting?");
 	} else {
 		SYSCTL_ADD_S32(&rs->sysctl_ctx,
 		   SYSCTL_CHILDREN(rl_sysctl_root),
 		   OID_AUTO, "disable", CTLFLAG_RW,
 		   &rs->rs_disable, 0,
 		   "Disable this interface from new hdwr limiting?");
 	}
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "minseg", CTLFLAG_RW,
 	    &rs->rs_min_seg, 0,
 	    "What is the minimum we need to send on this interface?");
 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flow_limit", CTLFLAG_RW,
 	    &rs->rs_flow_limit, 0,
 	    "What is the limit for number of flows (0=unlimited)?");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "highest", CTLFLAG_RD,
 	    &rs->rs_highest_valid, 0,
 	    "Highest valid rate");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "lowest", CTLFLAG_RD,
 	    &rs->rs_lowest_valid, 0,
 	    "Lowest valid rate");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flags", CTLFLAG_RD,
 	    &rs->rs_flags, 0,
 	    "What lags are on the entry?");
 	SYSCTL_ADD_S32(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "numrates", CTLFLAG_RD,
 	    &rs->rs_rate_cnt, 0,
 	    "How many rates re there?");
 	SYSCTL_ADD_U64(&rs->sysctl_ctx,
 	    SYSCTL_CHILDREN(rl_sysctl_root),
 	    OID_AUTO, "flows_using", CTLFLAG_RD,
 	    &rs->rs_flows_using, 0,
 	    "How many flows are using this interface now?");
 #ifdef DETAILED_RATELIMIT_SYSCTL
 	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
 		/* Expose every entry of the rate table. */
 		int i;
 		struct sysctl_oid *rl_rates;
 		struct sysctl_oid *rl_rate_num;
 		char rate_num[16];
 		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 					    SYSCTL_CHILDREN(rl_sysctl_root),
 					    OID_AUTO,
 					    "rate",
 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 					    "Ratelist");
 		for( i = 0; i < rs->rs_rate_cnt; i++) {
 			/* The node name is the decimal table index. */
 			sprintf(rate_num, "%d", i);
 			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 					    SYSCTL_CHILDREN(rl_rates),
 					    OID_AUTO,
 					    rate_num,
 					    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 					    "Individual Rate");
 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "flags", CTLFLAG_RD,
 				       &rs->rs_rlt[i].flags, 0,
 				       "Flags on this rate");
 			SYSCTL_ADD_U32(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "pacetime", CTLFLAG_RD,
 				       &rs->rs_rlt[i].time_between, 0,
 				       "Time hardware inserts between 1500 byte sends");
 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "rate", CTLFLAG_RD,
 				       &rs->rs_rlt[i].rate,
 				       "Rate in bytes per second");
 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "using", CTLFLAG_RD,
 				       &rs->rs_rlt[i].using,
 				       "Number of flows using");
 			SYSCTL_ADD_LONG(&rs->sysctl_ctx,
 				       SYSCTL_CHILDREN(rl_rate_num),
 				       OID_AUTO, "enobufs", CTLFLAG_RD,
 				       &rs->rs_rlt[i].rs_num_enobufs,
 				       "Number of enobufs logged on this rate");
 
 		}
 	}
 #endif
 }
 
 /*
  * Epoch callback that finishes the deferred destruction of a rate set.
  *
  * Under rs_mtx the pending-funeral flag is cleared and the dead counter
  * is decremented.  The set, its rate table and its sysctl context are
  * released only if no flow is using the set at that point.
  */
 static void
 rs_destroy(epoch_context_t ctx)
 {
 	struct tcp_rate_set *rs;
 	bool still_in_use;
 
 	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
 
 	mtx_lock(&rs_mtx);
 	rs->rs_flags &= ~RS_FUNERAL_SCHD;
 	/*
 	 * A lookup may have found the interface and bumped rs_flows_using
 	 * between the moment the set was judged idle and the moment this
 	 * callback runs: users hold a plain pointer and only the counter
 	 * is updated atomically.  Re-check the counter here so a set that
 	 * has regained users is not freed underneath them.
 	 */
 	still_in_use = (rs->rs_flows_using != 0);
 	rs_number_dead--;
 	mtx_unlock(&rs_mtx);
 
 	if (still_in_use)
 		return;
 
 	sysctl_ctx_free(&rs->sysctl_ctx);
 	free(rs->rs_rlt, M_TCPPACE);
 	free(rs, M_TCPPACE);
 }
 
 /*
  * Schedule rs_destroy() for a rate set through the network epoch.
  * rs_mtx must be held.  RS_FUNERAL_SCHD guarantees that the callback is
  * queued at most once while a destruction is pending.
  */
 static void
 rs_defer_destroy(struct tcp_rate_set *rs)
 {
 
 	mtx_assert(&rs_mtx, MA_OWNED);
 
 	if ((rs->rs_flags & RS_FUNERAL_SCHD) == 0) {
 		/* First request: mark it pending and queue the callback. */
 		rs->rs_flags |= RS_FUNERAL_SCHD;
 		rs_number_dead++;
 		NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
 	}
 }
 
 #ifdef INET
 extern counter_u64_t rate_limit_new;
 extern counter_u64_t rate_limit_chg;
 extern counter_u64_t rate_limit_set_ok;
 extern counter_u64_t rate_limit_active;
 extern counter_u64_t rate_limit_alloc_fail;
 #endif
 
 /*
  * Allocate a rate-limit send tag on an interface.
  *
  * Builds a rate-limit tag request from the flow id, flow type and rate
  * and passes it to m_snd_tag_alloc(); the resulting tag is stored through
  * *tag.  With INET compiled in the rate-limit counters are updated:
  * success bumps rate_limit_set_ok and rate_limit_active, any failure
  * other than EOPNOTSUPP bumps rate_limit_alloc_fail.
  *
  * Returns the value of m_snd_tag_alloc().
  */
 static int
 rl_attach_txrtlmt(struct ifnet *ifp,
     uint32_t flowtype,
     int flowid,
     uint64_t cfg_rate,
     struct m_snd_tag **tag)
 {
 	union if_snd_tag_alloc_params req = {
 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = cfg_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int rc;
 
 	rc = m_snd_tag_alloc(ifp, &req, tag);
 #ifdef INET
 	switch (rc) {
 	case 0:
 		counter_u64_add(rate_limit_set_ok, 1);
 		counter_u64_add(rate_limit_active, 1);
 		break;
 	case EOPNOTSUPP:
 		/* Lack of support is not counted as a failure. */
 		break;
 	default:
 		counter_u64_add(rate_limit_alloc_fail, 1);
 		break;
 	}
 #endif
 	return (rc);
 }
 
 static void
 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
 {
 	/*
 	 * The internal table is "special", it
 	 * is two seperate ordered tables that
 	 * must be merged. We get here when the
 	 * adapter specifies a number of rates that
 	 * covers both ranges in the table in some
 	 * form.
 	 */
 	int i, at_low, at_high;
 	uint8_t low_disabled = 0, high_disabled = 0;
 
 	for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
 		rs->rs_rlt[i].flags = 0;
 		rs->rs_rlt[i].time_between = 0;
 		if ((low_disabled == 0) &&
 		    (high_disabled ||
 		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
 			rs->rs_rlt[i].rate = rate_table_act[at_low];
 			at_low++;
 			if (at_low == RS_NEXT_ORDER_GROUP)
 				low_disabled = 1;
 		} else if (high_disabled == 0) {
 			rs->rs_rlt[i].rate = rate_table_act[at_high];
 			at_high++;
 			if (at_high == MAX_HDWR_RATES)
 				high_disabled = 1;
 		}
 	}
 }
 
 static struct tcp_rate_set *
 rt_setup_new_rs(struct ifnet *ifp, int *error)
 {
 	struct tcp_rate_set *rs;
 	const uint64_t *rate_table_act;
 	uint64_t lentim, res;
 	size_t sz;
 	uint32_t hash_type;
 	int i;
 	struct if_ratelimit_query_results rl;
 	struct sysctl_oid *rl_sysctl_root;
 	struct epoch_tracker et;
 	/*
 	 * We expect to enter with the
 	 * mutex locked.
 	 */
 
 	if (ifp->if_ratelimit_query == NULL) {
 		/*
 		 * We can do nothing if we cannot
 		 * get a query back from the driver.
 		 */
 		printf("Warning:No query functions for %s:%d-- failed\n",
 		       ifp->if_dname, ifp->if_dunit);
 		return (NULL);
 	}
 	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
 	if (rs == NULL) {
 		if (error)
 			*error = ENOMEM;
 		printf("Warning:No memory for malloc of tcp_rate_set\n");
 		return (NULL);
 	}
 	memset(&rl, 0, sizeof(rl));
 	rl.flags = RT_NOSUPPORT;
 	ifp->if_ratelimit_query(ifp, &rl);
 	if (rl.flags & RT_IS_UNUSABLE) {
 		/*
 		 * The interface does not really support
 		 * the rate-limiting.
 		 */
 		memset(rs, 0, sizeof(struct tcp_rate_set));
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_flags = RS_INTF_NO_SUP;
 		rs->rs_disable = 1;
 		rs_number_alive++;
 		sysctl_ctx_init(&rs->sysctl_ctx);
 		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 		    OID_AUTO,
 		    rs->rs_ifp->if_xname,
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		rl_add_syctl_entries(rl_sysctl_root, rs);
 		NET_EPOCH_ENTER(et);
 		mtx_lock(&rs_mtx);
 		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 		mtx_unlock(&rs_mtx);
 		NET_EPOCH_EXIT(et);
 		return (rs);
 	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
 		memset(rs, 0, sizeof(struct tcp_rate_set));
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_flags = RS_IS_DEFF;
 		rs_number_alive++;
 		sysctl_ctx_init(&rs->sysctl_ctx);
 		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 		    OID_AUTO,
 		    rs->rs_ifp->if_xname,
 		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 		    "");
 		rl_add_syctl_entries(rl_sysctl_root, rs);
 		NET_EPOCH_ENTER(et);
 		mtx_lock(&rs_mtx);
 		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 		mtx_unlock(&rs_mtx);
 		NET_EPOCH_EXIT(et);
 		return (rs);
 	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
 		/* Mellanox C4 likely */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
 		rs->rs_min_seg = rl.min_segment_burst;
 		rs->rs_highest_valid = 0;
 		rs->rs_flow_limit = rl.max_flows;
 		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
 		rs->rs_disable = 0;
 		rate_table_act = rl.rate_table;
 	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
 		/* Chelsio, C5 and C6 of Mellanox? */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
 		rs->rs_min_seg = rl.min_segment_burst;
 		rs->rs_disable = 0;
 		rs->rs_flow_limit = rl.max_flows;
 		rate_table_act = desired_rates;
 		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
 		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
 			/*
 			 * Our desired table is not big
 			 * enough, do what we can.
 			 */
 			rs->rs_rate_cnt = MAX_HDWR_RATES;
 		 }
 		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
 			rs->rs_flags = RS_IS_INTF;
 		else
 			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
 		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
 			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
 	} else {
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
 	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
 	if (rs->rs_rlt == NULL) {
 		if (error)
 			*error = ENOMEM;
 bail:
 		free(rs, M_TCPPACE);
 		return (NULL);
 	}
 	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
 		/*
 		 * The interface supports all
 		 * the rates we could possibly want.
 		 */
 		uint64_t rat;
 
 		rs->rs_rlt[0].rate = 12500;	/* 100k */
 		rs->rs_rlt[1].rate = 25000;	/* 200k */
 		rs->rs_rlt[2].rate = 62500;	/* 500k */
 		/* Note 125000 == 1Megabit
 		 * populate 1Meg - 1000meg.
 		 */
 		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
 			rs->rs_rlt[i].rate = rat;
 			rat += 125000;
 		}
 		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
 	} else if (rs->rs_flags & RS_INT_TBL) {
 		/* We populate this in a special way */
 		populate_canned_table(rs, rate_table_act);
 	} else {
 		/*
 		 * Just copy in the rates from
 		 * the table, it is in order.
 		 */
 		for (i=0; i<rs->rs_rate_cnt; i++) {
 			rs->rs_rlt[i].rate = rate_table_act[i];
 			rs->rs_rlt[i].time_between = 0;
 			rs->rs_rlt[i].flags = 0;
 		}
 	}
 	for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
 		/*
 		 * We go backwards through the list so that if we can't get
 		 * a rate and fail to init one, we have at least a chance of
 		 * getting the highest one.
 		 */
 		rs->rs_rlt[i].ptbl = rs;
 		rs->rs_rlt[i].tag = NULL;
 		rs->rs_rlt[i].using = 0;
 		rs->rs_rlt[i].rs_num_enobufs = 0;
 		/*
 		 * Calculate the time between.
 		 */
 		lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 		res = lentim / rs->rs_rlt[i].rate;
 		if (res > 0)
 			rs->rs_rlt[i].time_between = res;
 		else
 			rs->rs_rlt[i].time_between = 1;
 		if (rs->rs_flags & RS_NO_PRE) {
 			rs->rs_rlt[i].flags = HDWRPACE_INITED;
 			rs->rs_lowest_valid = i;
 		} else {
 			int err;
 
 			if ((rl.flags & RT_IS_SETUP_REQ)  &&
 			    (ifp->if_ratelimit_query)) {
 				err = ifp->if_ratelimit_setup(ifp,
   				         rs->rs_rlt[i].rate, i);
 				if (err)
 					goto handle_err;
 			}
 #ifdef RSS
 			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 #else
 			hash_type = M_HASHTYPE_OPAQUE_HASH;
 #endif
 			err = rl_attach_txrtlmt(ifp,
 			    hash_type,
 			    (i + 1),
 			    rs->rs_rlt[i].rate,
 			    &rs->rs_rlt[i].tag);
 			if (err) {
 handle_err:
 				if (i == (rs->rs_rate_cnt - 1)) {
 					/*
 					 * Huh - first rate and we can't get
 					 * it?
 					 */
 					free(rs->rs_rlt, M_TCPPACE);
 					if (error)
 						*error = err;
 					goto bail;
 				} else {
 					if (error)
 						*error = err;
 				}
 				break;
 			} else {
 				rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
 				rs->rs_lowest_valid = i;
 			}
 		}
 	}
 	/* Did we get at least 1 rate? */
 	if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
 		rs->rs_highest_valid = rs->rs_rate_cnt - 1;
 	else {
 		free(rs->rs_rlt, M_TCPPACE);
 		goto bail;
 	}
 	rs_number_alive++;
 	sysctl_ctx_init(&rs->sysctl_ctx);
 	rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
 	    OID_AUTO,
 	    rs->rs_ifp->if_xname,
 	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	    "");
 	rl_add_syctl_entries(rl_sysctl_root, rs);
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	CK_LIST_INSERT_HEAD(&int_rs, rs, next);
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 	return (rs);
 }
 
 /*
  * Constant-time lookup for the full internal rate table.
  *
  * The table layout assumed here (see the index arithmetic below) is:
  * entries 0-2 hold the sub-1Meg rates, entry 3 is 1Meg, the following
  * entries step by 1Meg up to 1Gig, and the last entry
  * (ALL_HARDWARE_RATES-1) is the single rate above 1Gig.  A rate of
  * N Mbps therefore lives at index N + 2.
  *
  * rs            - rate set to search.  For an explanation of why the
  *                 argument is volatile please look at the comments
  *                 around rt_setup_rate().
  * bytes_per_sec - requested rate.
  * flags         - RS_PACING_LT, RS_PACING_EXACT_MATCH or (default)
  *                 greater-than, optionally with RS_PACING_GEQ and
  *                 RS_PACING_SUB_OK.
  * lower_rate    - if non-NULL, receives the rate of the entry below the
  *                 one considered (0 when none was recorded).
  *
  * Returns the selected entry, or NULL when nothing qualifies and no
  * substitute is allowed/available.
  */
 static const struct tcp_hwrate_limit_table *
 tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
     uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
 {
 	struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
 	uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
 	int i;
 
 	/* Despite the name this is the request expressed in bits/sec. */
 	mbits_per_sec = (bytes_per_sec * 8);
 	if (flags & RS_PACING_LT) {
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			/*
 			 * Smaller than 1Meg, only
 			 * 3 entries can match it.
 			 */
 			previous_rate = 0;
 			for(i = rs->rs_lowest_valid; i < 3; i++) {
 				if (bytes_per_sec <= rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				} else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
 					/* Remember it as a possible substitute */
 					arte = &rs->rs_rlt[i];
 				}
 				previous_rate = rs->rs_rlt[i].rate;
 			}
 			goto done;
 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 			/*
 			 * Larger than 1G (the majority of
 			 * our table).
 			 */
 			if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			else
 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
 			goto done;
 		}
 		/*
 		 * If we reach here it is in our table (between 1Meg - 1000Meg),
 		 * just take the rounded down mbits per second, and add
 		 * 1Megabit to it, from this we can calculate
 		 * the index in the table.
 		 */
 		ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 		if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
 			ind_calc++;
 		/* our table is offset by 3, we add 2 */
 		ind_calc += 2;
 		if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 			/* This should not happen */
 			ind_calc = ALL_HARDWARE_RATES-1;
 		}
 		if ((ind_calc >= rs->rs_lowest_valid) &&
 		    (ind_calc <= rs->rs_highest_valid)) {
 			rte = &rs->rs_rlt[ind_calc];
 			if (ind_calc >= 1)
 				previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 		}
 	} else if (flags & RS_PACING_EXACT_MATCH) {
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			for(i = rs->rs_lowest_valid; i < 3; i++) {
 				if (bytes_per_sec == rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					break;
 				}
 			}
 		} else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
 			   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 			/* > 1Gbps only one rate */
 			if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
 				/* It is the top rate */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			}
 		} else {
 			/* Ok it must be an exact meg (it is between 1G and 1Meg) */
 			ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 			if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 				/* it is an exact Mbps */
 				ind_calc += 2;
 				if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 					/* This should not happen */
 					ind_calc = ALL_HARDWARE_RATES-1;
 				}
 				if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
 					rte = &rs->rs_rlt[ind_calc];
 			}
 		}
 	} else {
 		/* we want greater than the requested rate */
 		if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
 		    (rs->rs_lowest_valid <= 2)){
 			arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
 			for (i=2; i>=rs->rs_lowest_valid; i--) {
 				if (bytes_per_sec < rs->rs_rlt[i].rate) {
 					rte = &rs->rs_rlt[i];
 					if (i >= 1) {
 						previous_rate = rs->rs_rlt[(i-1)].rate;
 					}
 					break;
 				} else if ((flags & RS_PACING_GEQ) &&
 					   (bytes_per_sec == rs->rs_rlt[i].rate)) {
 					rte = &rs->rs_rlt[i];
 					if (i >= 1) {
 						previous_rate = rs->rs_rlt[(i-1)].rate;
 					}
 					break;
 				} else {
 					arte = &rs->rs_rlt[i]; /* new alternate */
 				}
 			}
 		} else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
 			if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 			    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
 				/* Our top rate is larger than the request */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			} else if ((flags & RS_PACING_GEQ) &&
 				   (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
 				   (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
 				/* It matches our top rate */
 				rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			} else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
 				/* The top rate is an alternative */
 				arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
 			}
 			previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
 		} else {
 			/* It is in our range 1Meg - 1Gig */
 			if (flags & RS_PACING_GEQ) {
 				ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
 				if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
 					/*
 					 * An exact Mbps.  Our table is offset
 					 * by 3, we add 2, exactly as every
 					 * other lookup in this function does;
 					 * without it the entry picked is 2Mbps
 					 * below the request.
 					 */
 					ind_calc += 2;
 					if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 						/* This should not happen */
 						ind_calc = (ALL_HARDWARE_RATES-1);
 					}
 					/* Only hand back entries that were set up */
 					if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
 						rte = &rs->rs_rlt[ind_calc];
 						if (ind_calc >= 1)
 							previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 					}
 				}
 				goto done;
 			}
 			/* Round up to the next whole Mbps */
 			ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
 			ind_calc += 2;
 			if (ind_calc > (ALL_HARDWARE_RATES-1)) {
 				/* This should not happen */
 				ind_calc = ALL_HARDWARE_RATES-1;
 			}
 			if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
 				rte = &rs->rs_rlt[ind_calc];
 				if (ind_calc >= 1)
 					previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
 			}
 		}
 	}
 done:
 	if ((rte == NULL) &&
 	    (arte != NULL) &&
 	    (flags & RS_PACING_SUB_OK)) {
 		/* We can use the substitute */
 		rte = arte;
 	}
 	if (lower_rate)
 		*lower_rate = previous_rate;
 	return (rte);
 }
 
 /*
  * Pick an entry from a rate set according to the matching policy in
  * flags.  Rate sets flagged RS_INT_TBL that carry at least
  * ALL_HARDWARE_RATES entries are handed to
  * tcp_int_find_suitable_rate(); everything else is searched linearly
  * between rs_lowest_valid and rs_highest_valid.
  *
  * Returns the chosen entry or NULL.  *lower_rate (when non-NULL) is
  * only written on some paths of the linear search, see the comments
  * in the body.
  *
  * For an explanation of why the argument is volatile please
  * look at the comments around rt_setup_rate().
  */
 static const struct tcp_hwrate_limit_table *
 tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
 {
 	/**
 	 * Hunt the rate table with the restrictions in flags and find a
 	 * suitable rate if possible.
 	 * RS_PACING_EXACT_MATCH - look for an exact match to rate.
 	 * RS_PACING_GT     - must be greater than.
 	 * RS_PACING_GEQ    - must be greater than or equal.
 	 * RS_PACING_LT     - must be less than.
 	 * RS_PACING_SUB_OK - If we don't meet criteria a
 	 *                    substitute is ok.
 	 *
 	 * NOTE(review): the RS_PACING_LT test below accepts the first
 	 * entry whose rate is >= the request, which does not obviously
 	 * match "must be less than" -- confirm the intended meaning.
 	 */
 	int i, matched;
 	struct tcp_hwrate_limit_table *rte = NULL;
 	uint64_t previous_rate = 0;
 
 	if ((rs->rs_flags & RS_INT_TBL) &&
 	    (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
 		/*
 		 * Here we don't want to paw thru
 		 * a big table, we have everything
 		 * from 1Meg - 1000Meg in 1Meg increments.
 		 * Use an alternate method to "lookup".
 		 */
 		return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
 	}
 	if ((flags & RS_PACING_LT) ||
 	    (flags & RS_PACING_EXACT_MATCH)) {
 		/*
 		 * For exact and less than we go forward through the table.
 		 * This way when we find one larger we stop (exact was a
 		 * toss up).
 		 */
 		for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
 			if ((flags & RS_PACING_EXACT_MATCH) &&
 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
 				rte = &rs->rs_rlt[i];
 				matched = 1;
 				/* Report the rate of the entry visited before this one */
 				if (lower_rate != NULL)
 					*lower_rate = previous_rate;
 				break;
 			} else if ((flags & RS_PACING_LT) &&
 			    (bytes_per_sec <= rs->rs_rlt[i].rate)) {
 				rte = &rs->rs_rlt[i];
 				matched = 1;
 				if (lower_rate != NULL)
 					*lower_rate = previous_rate;
 				break;
 			}
 			previous_rate = rs->rs_rlt[i].rate;
 			/*
 			 * The walk ends at the first entry whose rate is
 			 * below the request; *lower_rate is not written
 			 * on this exit.
 			 */
 			if (bytes_per_sec > rs->rs_rlt[i].rate)
 				break;
 		}
 		if ((matched == 0) &&
 		    (flags & RS_PACING_LT) &&
 		    (flags & RS_PACING_SUB_OK)) {
 			/* Kick in a substitute (the lowest) */
 			rte = &rs->rs_rlt[rs->rs_lowest_valid];
 		}
 	} else {
 		/*
 		 * Here we go backward through the table so that we can find
 		 * the one greater in theory faster (but its probably a
 		 * wash).
 		 */
 		for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
 			if (rs->rs_rlt[i].rate > bytes_per_sec) {
 				/* A possible candidate */
 				rte = &rs->rs_rlt[i];
 			}
 			if ((flags & RS_PACING_GEQ) &&
 			    (bytes_per_sec == rs->rs_rlt[i].rate)) {
 				/* An exact match and we want equal */
 				matched = 1;
 				rte = &rs->rs_rlt[i];
 				break;
 			} else if (rte) {
 				/*
 				 * Found one that is larger than but don't
 				 * stop, there may be a more closer match.
 				 */
 				matched = 1;
 			}
 			if (rs->rs_rlt[i].rate < bytes_per_sec) {
 				/*
 				 * We found a table entry that is smaller,
 				 * stop there will be none greater or equal.
 				 * This is the only place the backward walk
 				 * reports a lower rate.
 				 */
 				if (lower_rate != NULL)
 					*lower_rate = rs->rs_rlt[i].rate;
 				break;
 			}
 		}
 		if ((matched == 0) &&
 		    (flags & RS_PACING_SUB_OK)) {
 			/* Kick in a substitute (the highest) */
 			rte = &rs->rs_rlt[rs->rs_highest_valid];
 		}
 	}
 	return (rte);
 }
 
 static struct ifnet *
 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
 {
 	struct ifnet *tifp;
 	struct m_snd_tag *tag, *ntag;
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 		.rate_limit.hdr.flowid = inp->inp_flowid,
 		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 		.rate_limit.max_rate = COMMON_RATE,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	int err;
 #ifdef RSS
 	params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
 	    M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
 #else
 	params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
 #endif
 	err = m_snd_tag_alloc(ifp, &params, &tag);
 	if (err) {
 		/* Failed to setup a tag? */
 		if (error)
 			*error = err;
 		return (NULL);
 	}
 	ntag = tag;
 	while (ntag->sw->next_snd_tag != NULL) {
 		ntag = ntag->sw->next_snd_tag(ntag);
 	}
 	tifp = ntag->ifp;
 	m_snd_tag_rele(tag);
 	return (tifp);
 }
 
 /* Atomically add one to the 'using' counter of a rate entry. */
 static void
 rl_increment_using(const struct tcp_hwrate_limit_table *rte)
 {
 	/* Callers hold a const view, strip it to reach the counter */
 	atomic_add_long(
 	    &__DECONST(struct tcp_hwrate_limit_table *, rte)->using, 1);
 }
 
 /* Atomically subtract one from the 'using' counter of a rate entry. */
 static void
 rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
 {
 	/* Callers hold a const view, strip it to reach the counter */
 	atomic_subtract_long(
 	    &__DECONST(struct tcp_hwrate_limit_table *, rte)->using, 1);
 }
 
 /* Atomically add one to the rs_num_enobufs counter of a rate entry. */
 void
 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
 {
 	/* Callers hold a const view, strip it to reach the counter */
 	atomic_add_long(
 	    &__DECONST(struct tcp_hwrate_limit_table *, rte)->rs_num_enobufs, 1);
 }
 
 /*
  * Return the rate set on the int_rs list whose rs_ifp and rs_if_dunit
  * match the given interface, or NULL when there is none.
  *
  * Do NOT take the __noinline off this function.  When it gets inlined
  * into rt_setup_rate() a compiler bug shows up: the compiler decides
  * the list can never be empty, and the result is a NULL dereference
  * when an ifp is removed just as a hw rate limit is being attempted.
  * If you are working on the compiler and want to "test" this, go
  * ahead and remove the noinline; otherwise let sleeping dogs lie
  * until there is a compiler fix. 10/2/20 -- RRS
  */
 static __noinline struct tcp_rate_set *
 find_rs_for_ifp(struct ifnet *ifp)
 {
 	struct tcp_rate_set *cand;
 
 	CK_LIST_FOREACH(cand, &int_rs, next) {
 		if (cand->rs_ifp != ifp)
 			continue;
 		if (cand->rs_if_dunit != ifp->if_dunit)
 			continue;
 		/* Both the pointer and the unit number agree */
 		return (cand);
 	}
 	return (NULL);
 }
 
 
 /*
  * Find a hardware rate for a connection and attach a rate-limit send
  * tag to its inpcb.  The whole lookup runs inside a net epoch section.
  *
  * inp           - connection the send tag is attached to.
  * ifp           - interface to start the lookup on; when its rate set
  *                 is flagged RS_IS_DEFF the lookup is redirected to the
  *                 interface returned by rt_find_real_interface().
  * bytes_per_sec - requested rate.
  * flags         - matching policy, passed to tcp_find_suitable_rate().
  * error         - optional; receives ENODEV (no usable rate set),
  *                 ENOSPC (rate set disabled or flow limit reached) or
  *                 the failure code of the attach.  It is not written
  *                 here when tcp_find_suitable_rate() finds nothing.
  * lower_rate    - passed through to tcp_find_suitable_rate().
  *
  * Returns the selected table entry, or NULL on failure.  On success
  * rs_flows_using of the rate set has been incremented.
  */
 static const struct tcp_hwrate_limit_table *
 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
     uint32_t flags, int *error, uint64_t *lower_rate)
 {
 	/* First lets find the interface if it exists */
 	const struct tcp_hwrate_limit_table *rte;
 	/*
 	 * So why is rs volatile? This is to defeat a
 	 * compiler bug where in the compiler is convinced
 	 * that rs can never be NULL (which is not true). Because
 	 * of its conviction it nicely optimizes out the if ((rs == NULL
 	 * below which means if you get a NULL back you dereference it.
 	 */
 	volatile struct tcp_rate_set *rs;
 	struct epoch_tracker et;
 	/* Remember the caller's interface, the attach below uses it */
 	struct ifnet *oifp = ifp;
 	int err;
 
 	NET_EPOCH_ENTER(et);
 use_real_interface:
 	rs = find_rs_for_ifp(ifp);
 	if ((rs == NULL) ||
 	    (rs->rs_flags & RS_INTF_NO_SUP) ||
 	    (rs->rs_flags & RS_IS_DEAD)) {
 		/*
 		 * This means we got a packet *before*
 		 * the IF-UP was processed below, <or>
 		 * while or after we already received an interface
 		 * departed event. In either case we really don't
 		 * want to do anything with pacing, in
 		 * the departing case the packet is not
 		 * going to go very far. The new case
 		 * might be arguable, but its impossible
 		 * to tell from the departing case.
 		 */
 		if (error)
 			*error = ENODEV;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 
 	/* rs was already tested for NULL above; rs_disable decides here */
 	if ((rs == NULL) || (rs->rs_disable != 0)) {
 		if (error)
 			*error = ENOSPC;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	if (rs->rs_flags & RS_IS_DEFF) {
 		/* We need to find the real interface */
 		struct ifnet *tifp;
 
 		tifp = rt_find_real_interface(ifp, inp, error);
 		if (tifp == NULL) {
 			if (rs->rs_disable && error)
 				*error = ENOTSUP;
 			NET_EPOCH_EXIT(et);
 			return (NULL);
 		}
 		KASSERT((tifp != ifp),
 			("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
 			 ifp, inp, tifp));
 		/* Redo the lookup on the interface we were pointed at */
 		ifp = tifp;
 		goto use_real_interface;
 	}
 	/* A flow limit of zero means no limit is enforced */
 	if (rs->rs_flow_limit &&
 	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
 		if (error)
 			*error = ENOSPC;
 		NET_EPOCH_EXIT(et);
 		return (NULL);
 	}
 	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
 	if (rte) {
 		/* Note the attach is done on the original interface */
 		err = in_pcbattach_txrtlmt(inp, oifp,
 		    inp->inp_flowtype,
 		    inp->inp_flowid,
 		    rte->rate,
 		    &inp->inp_snd_tag);
 		if (err) {
 			/* Failed to attach */
 			if (error)
 				*error = err;
 			rte = NULL;
 		} else {
 			KASSERT((inp->inp_snd_tag != NULL) ,
 				("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
 				 inp, rte, (unsigned long long)rte->rate, rs));
 #ifdef INET
 			counter_u64_add(rate_limit_new, 1);
 #endif
 		}
 	}
 	if (rte) {
 		/*
 		 * We use an atomic here for accounting so we don't have to
 		 * use locks when freeing.
 		 */
 		atomic_add_64(&rs->rs_flows_using, 1);
 	}
 	NET_EPOCH_EXIT(et);
 	return (rte);
 }
 
 /*
  * Link-state event handler.  When a rate-limit capable interface goes
  * up and has no rate set yet, one is created with rt_setup_new_rs().
  * Every other event is ignored.
  */
 static void
 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *existing;
 	int error;
 
 	/* Only link-up is of interest ... */
 	if (link_state != LINK_STATE_UP)
 		return;
 	/* ... and only on interfaces with TX rate limiting enabled */
 	if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	existing = find_rs_for_ifp(ifp);
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 
 	/* Nothing to do if this interface was initialized before */
 	if (existing != NULL)
 		return;
 	rt_setup_new_rs(ifp, &error);
 }
 
 /*
  * Interface departure event handler.  The interface's rate set (if
  * any) is unlinked from the list and marked RS_IS_DEAD, the send tags
  * of its entries are detached, every entry is flagged
  * HDWRPACE_IFPDEPARTED, and the set is queued for destruction right
  * away when no flow is using it.
  */
 static void
 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *rs;
 	struct tcp_hwrate_limit_table *ent;
 	int idx;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	rs = find_rs_for_ifp(ifp);
 	if (rs == NULL)
 		goto out;
 
 	CK_LIST_REMOVE(rs, next);
 	rs_number_alive--;
 	rs->rs_flags |= RS_IS_DEAD;
 	for (idx = 0; idx < rs->rs_rate_cnt; idx++) {
 		ent = &rs->rs_rlt[idx];
 		if (ent->flags & HDWRPACE_TAGPRESENT) {
 			in_pcbdetach_tag(ent->tag);
 			ent->tag = NULL;
 		}
 		/* Replaces all other flags of the entry */
 		ent->flags = HDWRPACE_IFPDEPARTED;
 	}
 	if (rs->rs_flows_using == 0)
 		rs_defer_destroy(rs);
 out:
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Shutdown event handler.  Every rate set on the list gets the same
  * treatment an interface departure gives a single one: unlink, mark
  * RS_IS_DEAD, detach the send tags, flag the entries
  * HDWRPACE_IFPDEPARTED and queue the set for destruction when no flow
  * is using it.
  */
 static void
 tcp_rl_shutdown(void *arg __unused, int howto __unused)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *rs, *next_rs;
 	struct tcp_hwrate_limit_table *ent;
 	int idx;
 
 	NET_EPOCH_ENTER(et);
 	mtx_lock(&rs_mtx);
 	CK_LIST_FOREACH_SAFE(rs, &int_rs, next, next_rs) {
 		CK_LIST_REMOVE(rs, next);
 		rs_number_alive--;
 		rs->rs_flags |= RS_IS_DEAD;
 		for (idx = 0; idx < rs->rs_rate_cnt; idx++) {
 			ent = &rs->rs_rlt[idx];
 			if (ent->flags & HDWRPACE_TAGPRESENT) {
 				in_pcbdetach_tag(ent->tag);
 				ent->tag = NULL;
 			}
 			/* Replaces all other flags of the entry */
 			ent->flags = HDWRPACE_IFPDEPARTED;
 		}
 		if (rs->rs_flows_using != 0)
 			continue;
 		rs_defer_destroy(rs);
 	}
 	mtx_unlock(&rs_mtx);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Set up a hardware pacing rate on a connection that does not have a
  * send tag yet.  The inpcb write lock must be held.
  *
  * tp            - connection to pace.
  * ifp           - egress interface.
  * bytes_per_sec - requested rate.
  * flags         - matching policy, handed to rt_setup_rate().
  * error         - optional; receives 0 on success, ENODEV when the
  *                 interface (or, for ifnet TLS sessions, its TLS rate
  *                 limit support) cannot do it, EINVAL when the inpcb
  *                 already carries a send tag, or whatever
  *                 rt_setup_rate() reports.
  * lower_rate    - handed to rt_setup_rate().
  *
  * Returns the table entry now in use, or NULL on failure.  On success
  * tp->t_pacing_rate holds the rate of that entry.
  */
 const struct tcp_hwrate_limit_table *
 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	const struct tcp_hwrate_limit_table *rte;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_snd_tag == NULL) {
 		/*
 		 * We are setting up a rate for the first time.
 		 */
 		if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
 			/* Not supported by the egress */
 			if (error)
 				*error = ENODEV;
 			return (NULL);
 		}
 #ifdef KERN_TLS
 		tls = NULL;
 		if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
 			tls = tptosocket(tp)->so_snd.sb_tls_info;
 
 			if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
 			    tls->mode != TCP_TLS_MODE_IFNET) {
 				if (error)
 					*error = ENODEV;
 				return (NULL);
 			}
 		}
 #endif
 		rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate);
 		if (rte)
 			rl_increment_using(rte);
 #ifdef KERN_TLS
 		if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
 			/*
 			 * Fake a route change error to reset the TLS
 			 * send tag.  This will convert the existing
 			 * tag to a TLS ratelimit tag.
 			 */
 			MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
 			ktls_output_eagain(inp, tls);
 		}
 #endif
 	} else {
 		/*
 		 * We are modifying a rate, wrong interface?
 		 */
 		if (error)
 			*error = EINVAL;
 		rte = NULL;
 	}
 	if (rte != NULL) {
 		tp->t_pacing_rate = rte->rate;
 		/* error is optional everywhere else, honor that here too */
 		if (error)
 			*error = 0;
 	}
 	return (rte);
 }
 
 /*
  * Move a connection from its current hardware rate entry to the one
  * that suits a new requested rate.  The inpcb write lock must be held.
  *
  * crte          - entry currently in use by the connection.
  * tp            - the connection.
  * ifp           - egress interface, only used when the rate has to be
  *                 set up from scratch because the old rate set died.
  * bytes_per_sec - new requested rate.
  * flags         - matching policy for tcp_find_suitable_rate().
  * error         - optional; receives 0 on success, EINVAL (no current
  *                 entry or no send tag), EOPNOTSUPP (TLS send tag is
  *                 not a rate limit tag and no reset is pending),
  *                 ENOENT (no suitable entry) or the failure code of
  *                 the modify call.
  * lower_rate    - handed to the lookup.
  *
  * Returns the entry now in use (possibly crte itself), or NULL.  On
  * every NULL return other than "crte == NULL" the old rate has been
  * released.
  */
 const struct tcp_hwrate_limit_table *
 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
     struct tcpcb *tp, struct ifnet *ifp,
     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	const struct tcp_hwrate_limit_table *nrte;
 	const struct tcp_rate_set *rs;
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	int err;
 
 	INP_WLOCK_ASSERT(inp);
 
 	if (crte == NULL) {
 		/* Wrong interface */
 		if (error)
 			*error = EINVAL;
 		return (NULL);
 	}
 
 #ifdef KERN_TLS
 	/*
 	 * Decide whether the rate must be changed on the TLS send tag
 	 * (tls stays non-NULL) or on the inpcb send tag (tls is NULL).
 	 */
 	if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
 		tls = tptosocket(tp)->so_snd.sb_tls_info;
 		if (tls->mode != TCP_TLS_MODE_IFNET)
 			tls = NULL;
 		else if (tls->snd_tag != NULL &&
 		    tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
 			if (!tls->reset_pending) {
 				/*
 				 * NIC probably doesn't support
 				 * ratelimit TLS tags if it didn't
 				 * allocate one when an existing rate
 				 * was present, so ignore.
 				 */
 				tcp_rel_pacing_rate(crte, tp);
 				if (error)
 					*error = EOPNOTSUPP;
 				return (NULL);
 			}
 
 			/*
 			 * The send tag is being converted, so set the
 			 * rate limit on the inpcb tag.  There is a
 			 * race that the new NIC send tag might use
 			 * the current rate instead of this one.
 			 */
 			tls = NULL;
 		}
 	}
 #endif
 	if (inp->inp_snd_tag == NULL) {
 		/* Wrong interface */
 		tcp_rel_pacing_rate(crte, tp);
 		if (error)
 			*error = EINVAL;
 		return (NULL);
 	}
 	rs = crte->ptbl;
 	if ((rs->rs_flags & RS_IS_DEAD) ||
 	    (crte->flags & HDWRPACE_IFPDEPARTED)) {
 		/* Release the rate, and try anew */
 
 		tcp_rel_pacing_rate(crte, tp);
 		nrte = tcp_set_pacing_rate(tp, ifp,
 		    bytes_per_sec, flags, error, lower_rate);
 		return (nrte);
 	}
 	nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
 	if (nrte == crte) {
 		/* No change */
 		if (error)
 			*error = 0;
 		return (crte);
 	}
 	if (nrte == NULL) {
 		/* Release the old rate */
 		if (error)
 			*error = ENOENT;
 		tcp_rel_pacing_rate(crte, tp);
 		return (NULL);
 	}
 	/* Move our reference from the old entry to the new one */
 	rl_decrement_using(crte);
 	rl_increment_using(nrte);
 	/* Change rates to our new entry */
 #ifdef KERN_TLS
 	if (tls != NULL)
 		err = ktls_modify_txrtlmt(tls, nrte->rate);
 	else
 #endif
 		err = in_pcbmodify_txrtlmt(inp, nrte->rate);
 	if (err) {
 		struct tcp_rate_set *lrs;
 		uint64_t pre;
 
 		/*
 		 * The modify failed: drop the reference just taken on
 		 * the new entry and this flow's count on the rate set.
 		 */
 		rl_decrement_using(nrte);
 		lrs = __DECONST(struct tcp_rate_set *, rs);
 		pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
 		/* Do we still have a snd-tag attached? */
 		if (inp->inp_snd_tag)
 			in_pcbdetach_txrtlmt(inp);
 
 		/* pre is the count before the decrement: we were the last */
 		if (pre == 1) {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			mtx_lock(&rs_mtx);
 			/*
 			 * Is it dead?
 			 */
 			if (lrs->rs_flags & RS_IS_DEAD)
 				rs_defer_destroy(lrs);
 			mtx_unlock(&rs_mtx);
 			NET_EPOCH_EXIT(et);
 		}
 		if (error)
 			*error = err;
 		return (NULL);
 	} else {
 #ifdef INET
 		counter_u64_add(rate_limit_chg, 1);
 #endif
 	}
 	if (error)
 		*error = 0;
 	tp->t_pacing_rate = nrte->rate;
 	return (nrte);
 }
 
 /*
  * Give up the hardware pacing rate of a connection.  The inpcb write
  * lock must be held.  The entry's 'using' counter and the rate set's
  * rs_flows_using are both dropped by one; if this was the last flow
  * of a rate set already marked RS_IS_DEAD the set is queued for
  * destruction.  Finally the send tag is detached from the inpcb.
  */
 void
 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct epoch_tracker et;
 	struct tcp_rate_set *owner;
 
 	INP_WLOCK_ASSERT(inp);
 
 	tp->t_pacing_rate = -1;
 	/*
 	 * The table entry only has a const pointer to its rate set;
 	 * the const has to go so the refcount can be released.
 	 */
 	owner = __DECONST(struct tcp_rate_set *, crte->ptbl);
 	rl_decrement_using(crte);
 	/* fetchadd hands back the old value, 1 means we were the last */
 	if (atomic_fetchadd_64(&owner->rs_flows_using, -1) == 1) {
 		NET_EPOCH_ENTER(et);
 		mtx_lock(&rs_mtx);
 		/* Only a dead rate set is torn down here */
 		if (owner->rs_flags & RS_IS_DEAD)
 			rs_defer_destroy(owner);
 		mtx_unlock(&rs_mtx);
 		NET_EPOCH_EXIT(et);
 	}
 
 	/*
 	 * XXX: If this connection is using ifnet TLS, should we
 	 * switch it to using an unlimited rate, or perhaps use
 	 * ktls_output_eagain() to reset the send tag to a plain
 	 * TLS tag?
 	 */
 	in_pcbdetach_txrtlmt(inp);
 }
 
 /* Rates below are expressed in bytes per second. */
 #define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits per second, in bytes per second */
 #define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
 #define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
 #define MAX_MSS_SENT 43	/* upper bound in segments: 43 mss = 43 x 1500 = 64,500 bytes */
 
 /*
  * Emit a TCP_HDWR_PACE_SIZE log event describing a burst size
  * computation.  Nothing happens when logging is off for the
  * connection.  The arguments are copied into the u_bbr log record:
  * flex1..flex8 carry segsiz, new_tso, time_between,
  * calc_time_between, segs, res_div, mult and mod; cur_del_rate
  * carries bw and delRate carries hw_rate.
  */
 static void
 tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
 		    uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
 		    uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	/* The two rates */
 	log.u_bbr.cur_del_rate = bw;
 	log.u_bbr.delRate = hw_rate;
 	/* The sizes and times that went into the calculation */
 	log.u_bbr.flex1 = segsiz;
 	log.u_bbr.flex2 = new_tso;
 	log.u_bbr.flex3 = time_between;
 	log.u_bbr.flex4 = calc_time_between;
 	log.u_bbr.flex5 = segs;
 	log.u_bbr.flex6 = res_div;
 	log.u_bbr.flex7 = mult;
 	log.u_bbr.flex8 = mod;
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	TCP_LOG_EVENTP(tp, NULL,
 	    &tptosocket(tp)->so_rcv,
 	    &tptosocket(tp)->so_snd,
 	    TCP_HDWR_PACE_SIZE, 0,
 	    0, &log, false, &tv);
 }
 
 /*
  * Compute how many bytes may be handed down in one burst (TSO send)
  * when pacing at bw bytes per second.
  *
  * tp           - connection, used for logging only.
  * bw           - pacing rate in bytes per second.
  * segsiz       - segment size in bytes.
  * can_use_1mss - non-zero allows a one segment minimum, otherwise the
  *                minimum is two segments.
  * te           - hardware rate entry in use, or NULL for software
  *                pacing.
  * err          - optional; set to 0 on every successful path and to -1
  *                when the hardware rate is not faster than bw.
  * divisor      - bw is divided by this to get the burst bytes; values
  *                below RL_MIN_DIVISOR (including 0) select
  *                mss_divisor, or 1000 when that is zero too.
  *
  * Returns the burst size in bytes, a multiple of segsiz that never
  * exceeds MAX_MSS_SENT segments.
  */
 uint32_t
 tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
    const struct tcp_hwrate_limit_table *te, int *err, int divisor)
 {
 	/*
 	 * We use the google formula to calculate the
 	 * TSO size. I.E.
 	 * bw < 24Meg
 	 *   tso = 2mss
 	 * else
 	 *   tso = min(bw/(div=1000), 64k)
 	 *
 	 * Note for these calculations we ignore the
 	 * packet overhead (enet hdr, ip hdr and tcp hdr).
 	 * We only get the google formula when we have
 	 * divisor = 1000, which is the default for now.
 	 */
 	uint64_t lentim, res, bytes;
 	uint32_t new_tso, min_tso_segs;
 
 	/* It can't be zero */
 	if ((divisor == 0) ||
 	    (divisor < RL_MIN_DIVISOR)) {
 		if (mss_divisor)
 			bytes = bw / mss_divisor;
 		else
 			bytes = bw / 1000;
 	} else
 		bytes = bw / divisor;
 	/* We can't ever send more than 65k in a TSO */
 	if (bytes > 0xffff) {
 		bytes = 0xffff;
 	}
 	/* Round up */
 	new_tso = (bytes + segsiz - 1) / segsiz;
 	/* Are we enforcing even boundaries? */
 	if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold))
 		new_tso++;
 	if (can_use_1mss)
 		min_tso_segs = 1;
 	else
 		min_tso_segs = 2;
 	if (rs_floor_mss && (new_tso < rs_floor_mss))
 		new_tso = rs_floor_mss;
 	else if (new_tso < min_tso_segs)
 		new_tso = min_tso_segs;
 	if (new_tso > MAX_MSS_SENT)
 		new_tso = MAX_MSS_SENT;
 	/* From here on new_tso is in bytes, not segments */
 	new_tso *= segsiz;
 	tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 			    0, 0, 0, 0, 0, 0, 1);
 	/*
 	 * If we are not doing hardware pacing
 	 * then we are done.
 	 */
 	if (te == NULL) {
 		if (err)
 			*err = 0;
 		return(new_tso);
 	}
 	/*
 	 * For hardware pacing we look at the
 	 * rate you are sending at and compare
 	 * that to the rate you have in hardware.
 	 *
 	 * If the hardware rate is slower than your
 	 * software rate then you are in error and
 	 * we will build a queue in our hardware which
 	 * is probably not desired, in such a case
 	 * just return the non-hardware TSO size.
 	 *
 	 * If the rate in hardware is faster (which
 	 * it should be) then look at how long it
 	 * takes to send one ethernet segment size at
 	 * your b/w and compare that to the time it
 	 * takes to send at the rate you had selected.
 	 *
 	 * If your time is greater (which we hope it is)
 	 * we get the delta between the two, and then
 	 * divide that into your pacing time. This tells
 	 * us how many MSS you can send down at once (rounded up).
 	 *
 	 * If the hardware rate is over 500meg, or you are
 	 * pacing at exactly the hardware rate, we just set
 	 * you to the max (43 segments).
 	 */
 	if ((te->rate > FIVE_HUNDRED_MBPS) || (te->rate == bw)) {
 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 				    te->rate, te->time_between, (uint32_t)0,
 				    (segsiz * MAX_MSS_SENT), 0, 0, 3);
 		/* This is a success like the others, report it as one */
 		if (err)
 			*err = 0;
 		return (segsiz * MAX_MSS_SENT);
 	}
 	if (bw == 0) {
 		/*
 		 * No software rate to compare against and we must
 		 * not divide by it below; fall back to the
 		 * non-hardware size.
 		 */
 		if (err)
 			*err = 0;
 		return (new_tso);
 	}
 	/* Time to send one ethernet segment at bw */
 	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 	res = lentim / bw;
 	if (res > te->time_between) {
 		uint32_t delta, segs, res_div;
 
 		res_div = ((res * num_of_waits_allowed) + wait_time_floor);
 		delta = res - te->time_between;
 		/* Round up */
 		segs = (res_div + delta - 1)/delta;
 		if (segs < min_tso_segs)
 			segs = min_tso_segs;
 		if (segs < rs_hw_floor_mss)
 			segs = rs_hw_floor_mss;
 		if (segs > MAX_MSS_SENT)
 			segs = MAX_MSS_SENT;
 		/* From here on segs is in bytes */
 		segs *= segsiz;
 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 				    te->rate, te->time_between, (uint32_t)res,
 				    segs, res_div, 1, 3);
 		if (err)
 			*err = 0;
 		if (segs < new_tso) {
 			/* unexpected ? */
 			return(new_tso);
 		} else {
 			return (segs);
 		}
 	} else {
 		/*
 		 * Your time is smaller which means
 		 * we will grow a queue on our
 		 * hardware. Send back the non-hardware
 		 * rate.
 		 */
 		tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 				    te->rate, te->time_between, (uint32_t)res,
 				    0, 0, 0, 4);
 		if (err)
 			*err = -1;
 		return (new_tso);
 	}
 }
 
 /*
  * Report the highest valid hardware rate of an interface.  Rate sets
  * flagged RS_IS_DEFF are followed to the interface returned by
  * rt_find_real_interface().  The result is 0 when the interface has
  * no rate set or the real interface cannot be determined.
  */
 uint64_t
 tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
 {
 	struct epoch_tracker et;
 	struct tcp_rate_set *rs;
 	uint64_t highest = 0;
 
 	NET_EPOCH_ENTER(et);
 	for (;;) {
 		rs = find_rs_for_ifp(ifp);
 		if (rs == NULL) {
 			/* This interface does not do ratelimiting */
 			break;
 		}
 		if ((rs->rs_flags & RS_IS_DEFF) == 0) {
 			/* A real interface, hand back its top rate */
 			highest = rs->rs_rlt[rs->rs_highest_valid].rate;
 			break;
 		}
 		/* Step to the interface behind this one and look again */
 		ifp = rt_find_real_interface(ifp, inp, NULL);
 		if (ifp == NULL)
 			break;
 	}
 	NET_EPOCH_EXIT(et);
 	return (highest);
 }
 
 static eventhandler_tag rl_ifnet_departs;
 static eventhandler_tag rl_ifnet_arrives;
 static eventhandler_tag rl_shutdown_start;
 
 /*
  * One-time setup: initialize the rate set list, its counters and its
  * mutex, then register the interface departure, link-state and
  * shutdown event handlers.
  */
 static void
 tcp_rs_init(void *st __unused)
 {
 	/* Lock, list head and bookkeeping counters */
 	mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
 	CK_LIST_INIT(&int_rs);
 	rs_number_dead = 0;
 	rs_number_alive = 0;
 
 	/* Event hooks; the tags are kept in the file-scope statics */
 	rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    tcp_rl_ifnet_departure, NULL, EVENTHANDLER_PRI_ANY);
 	rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
 	    tcp_rl_ifnet_link, NULL, EVENTHANDLER_PRI_ANY);
 	rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    tcp_rl_shutdown, NULL, SHUTDOWN_PRI_FIRST);
 
 	printf("TCP_ratelimit: Is now initialized\n");
 }
 
 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
 #endif
diff --git a/sys/netinet/tcp_stacks/rack_bbr_common.c b/sys/netinet/tcp_stacks/rack_bbr_common.c
index d684132f80c3..83b3d74b8dcc 100644
--- a/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ b/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -1,1062 +1,1063 @@
 /*-
  * Copyright (c) 2016-2020 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 /*
  * Author: Randall Stewart <rrs@netflix.com>
  * This work is based on the ACM Queue paper
  * BBR - Congestion Based Congestion Control
  * and also numerous discussions with Neal, Yuchung and Van.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_ratelimit.h"
 #include "opt_kern_tls.h"
 #include <sys/param.h>
 #include <sys/arb.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/qmath.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/tree.h>
 #ifdef NETFLIX_STATS
 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
 #endif
 #include <sys/refcount.h>
 #include <sys/queue.h>
 #include <sys/smp.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/tim_filter.h>
 #include <sys/time.h>
 #include <vm/uma.h>
 #include <sys/kern_prefetch.h>
 
 #include <net/route.h>
 #include <net/vnet.h>
 #include <net/ethernet.h>
 #include <net/bpf.h>
 
 #define TCPSTATES		/* for logging */
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>	/* required for icmp_var.h */
 #include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_ecn.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/tcp_lro.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_log_buf.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcp_fastopen.h>
 
 #include <netipsec/ipsec_support.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #include <netipsec/ipsec.h>
 #include <netipsec/ipsec6.h>
 #endif				/* IPSEC */
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 #include "rack_bbr_common.h"
 
 /*
  * Common TCP Functions - These are shared by both
  * rack and BBR.
  */
 #ifdef KERN_TLS
 /*
  * Compute the size of one full TLS record (payload + header + trailer)
  * for the session attached to the socket's send buffer.  While four
  * such records do not fit in rwnd, the session's max_frame_len is
  * lowered in 4096 byte steps (never below 4096) and the size is
  * recomputed.  Note that this modifies the session parameters.
  */
 uint32_t
 ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
 {
 	struct ktls_session *tls;
 	uint32_t len;
 
 	for (;;) {
 		tls = so->so_snd.sb_tls_info;
 		len = tls->params.max_frame_len;         /* max tls payload */
 		len += tls->params.tls_hlen;      /* tls header len  */
 		len += tls->params.tls_tlen;      /* tls trailer len */
 		/* Four records fit in the window: nothing to shrink. */
 		if ((len * 4) <= rwnd)
 			break;
 		/*
 		 * Stroke this will suck counter and what
 		 * else should we do Drew? From the
 		 * TCP perspective I am not sure
 		 * what should be done...
 		 */
 		if (tls->params.max_frame_len <= 4096)
 			break;
 		tls->params.max_frame_len -= 4096;
 		if (tls->params.max_frame_len < 4096)
 			tls->params.max_frame_len = 4096;
 	}
 	return (len);
 }
 #endif
 
 /*
  * Determine the ethertype of a packet queued by LRO.
  *
  * If LRO already stripped the ethernet header (M_LRO_EHDRSTRP) the
  * recorded lro_etype is returned untouched.  Otherwise the packet is
  * shown to BPF, the ethernet header is trimmed off and, for IPv4/IPv6,
  * the TCP checksum is verified.
  *
  * Returns the ethertype, or -1 when the packet has been disposed of
  * (m_pullup() failure or bad checksum); the caller must not touch m
  * after a -1 return.
  *
  * NOTE(review): m is passed by value, so if m_pullup() hands back a
  * different head mbuf the caller still holds its old pointer -- confirm
  * whether that can occur for LRO-queued packets.
  */
 static int
 ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ether_header *eh;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
 #endif
 #ifdef INET
 	struct ip *ip = NULL;		/* Keep compiler happy. */
 #endif
 #if defined(INET) || defined(INET6)
 	struct tcphdr *th;
 	int32_t tlen;
 	uint16_t drop_hdrlen;
 #endif
 	uint16_t etype;
 #ifdef INET
 	uint8_t iptos;
 #endif
 
 	/* Is it the easy way? */
 	if (m->m_flags & M_LRO_EHDRSTRP)
 		return (m->m_pkthdr.lro_etype);
 	/*
 	 * Ok this is the old style call, the ethernet header is here.
 	 * This also means no checksum or BPF were done. This
 	 * can happen if the race to setup the inp fails and
 	 * LRO sees no INP at packet input, but by the time
 	 * we queue the packets an INP gets there. It's rare
 	 * but it can occur so we will handle it. Note that
 	 * this means duplicated work but with the rarity of it
 	 * it's not worth worrying about.
 	 */
 	/* Let the BPF see the packet */
 	if (bpf_peers_present(ifp->if_bpf))
 		ETHER_BPF_MTAP(ifp, m);
 	/* Now the csum */
 	eh = mtod(m, struct ether_header *);
 	etype = ntohs(eh->ether_type);
 	m_adj(m,  sizeof(*eh));
 	/*
 	 * From here on eh must not be used: m_adj() and m_pullup() may
 	 * leave the packet data somewhere other than right behind the old
 	 * ethernet header, so the network header is always located via
 	 * mtod(m) below.
 	 */
 	switch (etype) {
 #ifdef INET6
 		case ETHERTYPE_IPV6:
 		{
 			if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
 				m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
 				if (m == NULL) {
 					KMOD_TCPSTAT_INC(tcps_rcvshort);
 					return (-1);
 				}
 			}
 			ip6 = mtod(m, struct ip6_hdr *);
 			th = (struct tcphdr *)(ip6 + 1);
 			drop_hdrlen = sizeof(*ip6);
 			tlen = ntohs(ip6->ip6_plen);
 			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 				/* Hardware supplied (part of) the checksum. */
 				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 					th->th_sum = m->m_pkthdr.csum_data;
 				else
 					th->th_sum = in6_cksum_pseudo(ip6, tlen,
 								      IPPROTO_TCP,
 								      m->m_pkthdr.csum_data);
 				th->th_sum ^= 0xffff;
 			} else
 				th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
 			/* A non-zero result means the checksum did not verify. */
 			if (th->th_sum) {
 				KMOD_TCPSTAT_INC(tcps_rcvbadsum);
 				m_freem(m);
 				return (-1);
 			}
 			return (etype);
 		}
 #endif
 #ifdef INET
 		case ETHERTYPE_IP:
 		{
 			if (m->m_len < sizeof (struct tcpiphdr)) {
 				m = m_pullup(m, sizeof (struct tcpiphdr));
 				if (m == NULL) {
 					KMOD_TCPSTAT_INC(tcps_rcvshort);
 					return (-1);
 				}
 			}
 			ip = mtod(m, struct ip *);
 			th = (struct tcphdr *)(ip + 1);
 			drop_hdrlen = sizeof(*ip);
 			iptos = ip->ip_tos;
 			tlen = ntohs(ip->ip_len) - sizeof(struct ip);
 			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 				/* Hardware supplied (part of) the checksum. */
 				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 					th->th_sum = m->m_pkthdr.csum_data;
 				else
 					th->th_sum = in_pseudo(ip->ip_src.s_addr,
 							       ip->ip_dst.s_addr,
 							       htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP));
 				th->th_sum ^= 0xffff;
 			} else {
 				int len;
 				struct ipovly *ipov = (struct ipovly *)ip;
 				/*
 				 * Checksum extended TCP header and data.
 				 * The IP header is temporarily rewritten as
 				 * an overlay for the sum and restored below.
 				 */
 				len = drop_hdrlen + tlen;
 				bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
 				ipov->ih_len = htons(tlen);
 				th->th_sum = in_cksum(m, len);
 				/* Reset length for SDT probes. */
 				ip->ip_len = htons(len);
 				/* Reset TOS bits */
 				ip->ip_tos = iptos;
 				/* Re-initialization for later version check */
 				ip->ip_v = IPVERSION;
 				ip->ip_hl = sizeof(*ip) >> 2;
 			}
 			/* A non-zero result means the checksum did not verify. */
 			if (th->th_sum) {
 				KMOD_TCPSTAT_INC(tcps_rcvbadsum);
 				m_freem(m);
 				return (-1);
 			}
 			break;
 		}
 #endif
 	}
 	return (etype);
 }
 
 /*
  * The function ctf_process_inbound_raw() is used by
  * transport developers to do the steps needed to
  * support MBUF Queuing i.e. the flags in
  * inp->inp_flags2:
  *
  * - INP_SUPPORTS_MBUFQ
  * - INP_MBUF_QUEUE_READY
  * - INP_DONT_SACK_QUEUE
  * - INP_MBUF_ACKCMP
  *
  * These flags help control how LRO will deliver
  * packets to the transport. You first set in inp_flags2
  * the INP_SUPPORTS_MBUFQ to tell the LRO code that you
  * will gladly take a queue of packets instead of a compressed
  * single packet. You also set in your t_fb pointer the
  * tfb_do_queued_segments to point to ctf_process_inbound_raw.
  *
  * This then gets you lists of inbound ACK's/Data instead
  * of a condensed compressed ACK/DATA packet. Why would you
  * want that? This will get you access to all the arrival
  * times of at least LRO and possibly at the Hardware (if
  * the interface card supports that) of the actual ACK/DATA.
  * In some transport designs this is important since knowing
  * the actual time we got the packet is useful information.
  *
  * A new special type of mbuf may also be supported by the transport
  * if it has set the INP_MBUF_ACKCMP flag. If its set, LRO will
  * possibly create a M_ACKCMP type mbuf. This is a mbuf with
  * an array of "acks". One thing also to note is that when this
  * occurs a subsequent LRO may find at the back of the untouched
  * mbuf queue chain a M_ACKCMP and append on to it. This means
  * that until the transport pulls in the mbuf chain queued
  * for it more ack's may get on the mbufs that were already
  * delivered. There currently is a limit of 6 acks condensed
  * into 1 mbuf which means often when this is occurring, we
  * don't get that effect but it does happen.
  *
  * Now there are some interesting Caveats that the transport
  * designer needs to take into account when using this feature.
  *
  * 1) It is used with HPTS and pacing, when the pacing timer
  *    for output calls it will first call the input.
  * 2) When you set INP_MBUF_QUEUE_READY this tells LRO
  *    queue normal packets, I am busy pacing out data and
  *    will process the queued packets before my tfb_tcp_output
  *    call from pacing. If a non-normal packet arrives, (e.g. sack)
  *    you will be awoken immediately.
  * 3) Finally you can add the INP_DONT_SACK_QUEUE to not even
  *    be awoken if a SACK has arrived. You would do this when
  *    you were not only running a pacing for output timer
  *    but a Rack timer as well i.e. you know you are in recovery
  *    and are in the process (via the timers) of dealing with
  *    the loss.
  *
  * Now a critical thing you must be aware of here is that the
  * use of the flags has a far greater scope than just your
  * typical LRO. Why? Well that's because in the normal compressed
  * LRO case at the end of a driver interrupt all packets are going
  * to get presented to the transport no matter if there is one
  * or 100. With the MBUF_QUEUE model, this is not true. You will
  * only be awoken to process the queue of packets when:
  *     a) The flags discussed above allow it.
  *          <or>
  *     b) You exceed a ack or data limit (by default the
  *        ack limit is infinity (64k acks) and the data
  *        limit is 64k of new TCP data)
  *         <or>
  *     c) The push bit has been set by the peer
  */
 
 int
 ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
 {
 	/*
 	 * We are passed a raw chain of mbuf packets
 	 * that arrived in LRO. They are linked via
 	 * the m_nextpkt link in the pkt-headers.
 	 *
 	 * We process each one by:
 	 * a) saving off the next
 	 * b) stripping off the ether-header
 	 * c) formulating the arguments for
 	 *    the tfb_tcp_hpts_do_segment
 	 * d) calling each mbuf to tfb_tcp_hpts_do_segment
 	 *    after adjusting the time to match the arrival time.
 	 * Note that the LRO code assures no IP options are present.
 	 *
 	 * The semantics for calling tfb_tcp_hpts_do_segment are the
 	 * following:
 	 * 1) It returns 0 if all went well and you (the caller) need
 	 *    to release the lock.
 	 * 2) If nxt_pkt is set, then the function will suppress calls
 	 *    to tcp_output() since you are promising to call again
 	 *    with another packet.
 	 * 3) If it returns 1, then you must free all the packets being
 	 *    shipped in, the tcb has been destroyed (or about to be destroyed).
 	 *
 	 * Return value: 0 when the tcb is still valid and locked (this
 	 * includes the cases where the chain was empty or every packet
 	 * was skipped), otherwise the non-zero value handed back by
 	 * tfb_do_segment_nounlock.
 	 */
 	struct mbuf *m_save;
 	struct tcphdr *th;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
 #endif
 #ifdef INET
 	struct ip *ip = NULL;		/* Keep compiler happy. */
 #endif
 	struct ifnet *ifp;
 	struct timeval tv;
 	struct inpcb *inp __diagused;
 	int32_t retval, nxt_pkt, tlen, off;
 	int etype = 0;
 	uint16_t drop_hdrlen;
 	uint8_t iptos, no_vn=0;
 
 	/*
 	 * If no packet ever reaches tfb_do_segment_nounlock (empty chain,
 	 * or all packets skipped) the tcb is untouched and still locked,
 	 * so that is what we must report.
 	 */
 	retval = 0;
 	inp = tptoinpcb(tp);
 	INP_WLOCK_ASSERT(inp);
 	NET_EPOCH_ASSERT();
 
 	if (m)
 		ifp = m_rcvif(m);
 	else
 		ifp = NULL;
 	if (ifp == NULL) {
 		/*
 		 * We probably should not work around
 		 * but kassert, since lro always sets rcvif.
 		 */
 		no_vn = 1;
 		goto skip_vnet;
 	}
 	CURVNET_SET(ifp->if_vnet);
 skip_vnet:
 	tcp_get_usecs(&tv);
 	while (m) {
 		/* Unlink this packet from the chain before processing it. */
 		m_save = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		if ((m->m_flags & M_ACKCMP) == 0) {
 			/* Now lets get the ether header */
 			etype = ctf_get_enet_type(ifp, m);
 			if (etype == -1) {
 				/* Skip this packet it was freed by checksum */
 				goto skipped_pkt;
 			}
 			KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)),
 				("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype));
 			/* Trim off the ethernet header */
 			switch (etype) {
 #ifdef INET6
 			case ETHERTYPE_IPV6:
 				ip6 = mtod(m, struct ip6_hdr *);
 				th = (struct tcphdr *)(ip6 + 1);
 				tlen = ntohs(ip6->ip6_plen);
 				drop_hdrlen = sizeof(*ip6);
 				iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 				break;
 #endif
 #ifdef INET
 			case ETHERTYPE_IP:
 				ip = mtod(m, struct ip *);
 				th = (struct tcphdr *)(ip + 1);
 				drop_hdrlen = sizeof(*ip);
 				iptos = ip->ip_tos;
 				tlen = ntohs(ip->ip_len) - sizeof(struct ip);
 				break;
 #endif
 			default:
 				/*
 				 * Neither IPv4 nor IPv6.  When the KASSERT
 				 * above is compiled out there is no header
 				 * to parse, so drop the packet rather than
 				 * continue with th/tlen/iptos unset.
 				 */
 				m_freem(m);
 				goto skipped_pkt;
 			} /* end switch */
 			/*
 			 * Convert TCP protocol specific fields to host format.
 			 */
 			tcp_fields_to_host(th);
 			off = th->th_off << 2;
 			if (off < sizeof (struct tcphdr) || off > tlen) {
 				printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n",
 				       off,
 				       sizeof(struct tcphdr),
 				       tlen);
 				KMOD_TCPSTAT_INC(tcps_rcvbadoff);
 				m_freem(m);
 				goto skipped_pkt;
 			}
 			tlen -= off;
 			drop_hdrlen += off;
 			/*
 			 * Now lets setup the timeval to be when we should
 			 * have been called (if we can).
 			 */
 			m->m_pkthdr.lro_nsegs = 1;
 			/* Now what about next packet? */
 		} else {
 			/*
 			 * This mbuf is an array of acks that have
 			 * been compressed. We assert the inp has
 			 * the flag set to enable this!
 			 */
 			KASSERT((inp->inp_flags2 & INP_MBUF_ACKCMP),
 			    ("tp:%p inp:%p no INP_MBUF_ACKCMP flags?", tp, inp));
 			tlen = 0;
 			drop_hdrlen = 0;
 			th = NULL;
 			iptos = 0;
 		}
 		tcp_get_usecs(&tv);
 		/* More input is coming if the chain continues or the caller says so. */
 		if (m_save || has_pkt)
 			nxt_pkt = 1;
 		else
 			nxt_pkt = 0;
 		if ((m->m_flags & M_ACKCMP) == 0)
 			KMOD_TCPSTAT_INC(tcps_rcvtotal);
 		else
 			KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent)));
 		retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
 							      iptos, nxt_pkt, &tv);
 		if (retval) {
 			/* We lost the lock and tcb probably */
 			m = m_save;
 			/* Nobody will look at the rest of the chain: free it. */
 			while(m) {
 				m_save = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				m_freem(m);
 				m = m_save;
 			}
 			if (no_vn == 0) {
 				CURVNET_RESTORE();
 			}
 			INP_UNLOCK_ASSERT(inp);
 			return(retval);
 		}
 skipped_pkt:
 		m = m_save;
 	}
 	if (no_vn == 0) {
 		CURVNET_RESTORE();
 	}
 	return(retval);
 }
 
 /*
  * Process any packets parked on tp->t_in_pkt.  Returns 1 when
  * processing reported that the tcpcb was lost, 0 otherwise (including
  * when nothing was queued).
  */
 int
 ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
 {
 	struct mbuf *queued;
 
 	/* No old packets waiting for us. */
 	if (tp->t_in_pkt == NULL)
 		return (0);
 	/* Detach the whole chain from the tcpcb before handing it off. */
 	queued = tp->t_in_pkt;
 	tp->t_in_pkt = NULL;
 	tp->t_tail_pkt = NULL;
 	/* Non-zero: we lost the tcpcb (maybe a RST came in)? */
 	return (ctf_process_inbound_raw(tp, so, queued, have_pkt) ? 1 : 0);
 }
 
 /*
  * Sequence space between snd_una and snd_max, plus one while the
  * connection state is below ESTABLISHED and plus one once TF_SENTFIN
  * is set.
  */
 uint32_t
 ctf_outstanding(struct tcpcb *tp)
 {
 	uint32_t unacked = tp->snd_max - tp->snd_una;
 
 	if (tp->t_state < TCPS_ESTABLISHED) {
 		/* presumably accounts for the SYN -- TODO confirm */
 		unacked += 1;
 	}
 	if (tp->t_flags & TF_SENTFIN) {
 		/* presumably accounts for the FIN -- TODO confirm */
 		unacked += 1;
 	}
 	return (unacked);
 }
 
 /*
  * Outstanding bytes (see ctf_outstanding()) minus rc_sacked, clamped
  * at zero when rc_sacked exceeds what is outstanding.
  */
 uint32_t
 ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
 {
 	uint32_t outstanding = ctf_outstanding(tp);
 
 	return ((rc_sacked <= outstanding) ? (outstanding - rc_sacked) : 0);
 }
 
 /*
  * Hand the segment to tcp_dropwithreset() and, when a tcpcb was
  * supplied, release its inpcb write lock afterwards.
  */
 void
 ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
     int32_t rstreason, int32_t tlen)
 {
 	/* tp may be NULL here; it is passed through unchanged. */
 	tcp_dropwithreset(m, th, tp, tlen, rstreason);
 	if (tp != NULL) {
 		INP_WUNLOCK(tptoinpcb(tp));
 	}
 }
 
 /*
  * Decide whether an immediate ACK may be sent.  Without limiter state
  * (ts/cnt NULL) or with the ack-war knobs disabled TF_ACKNOW is always
  * set.  Otherwise at most tcp_ack_war_cnt ACKs are allowed per
  * tcp_ack_war_time_window; once the budget is used up TF_ACKNOW is
  * cleared instead.
  */
 void
 ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt)
 {
 	uint32_t now;
 
 	if ((ts == NULL) || (cnt == NULL) ||
 	    !(tcp_ack_war_time_window > 0) ||
 	    !(tcp_ack_war_cnt > 0)) {
 		/* No ack war prevention in effect. */
 		tp->t_flags |= TF_ACKNOW;
 		return;
 	}
 	/*
 	 * We use a msec tick here which gives us
 	 * roughly 49 days. We don't need the
 	 * precision of a microsecond timestamp which
 	 * would only give us hours.
 	 */
 	now = tcp_ts_getticks();
 	if (TSTMP_LT((*ts), now)) {
 		/* The window has expired: open a new one. */
 		*cnt = 0;
 		*ts = (now + tcp_ack_war_time_window);
 	}
 	if (*cnt >= tcp_ack_war_cnt) {
 		/* Budget for this window is exhausted. */
 		tp->t_flags &= ~TF_ACKNOW;
 		return;
 	}
 	*cnt = (*cnt + 1);
 	tp->t_flags |= TF_ACKNOW;
 }
 
 /*
  * _ctf_drop_checks trims an incoming segment to the receive window.
  * It returns 1 when the caller should not proceed and 0 when
  * processing may continue.  It places in ret_val what should be
  * returned 1/0 by the caller. The 1 indicates that the TCB is
  * unlocked and probably dropped. The 0 indicates the TCB is still
  * valid and locked.
  *
  * tlenp and thf are in/out: on a 0 return they carry the adjusted
  * segment length and TCP flags; they are not written back on the
  * 1 return path.  drop_hdrlen is increased by the number of duplicate
  * bytes to skip at the front.  ts and cnt are the ack-war limiter
  * state handed to ctf_ack_war_checks().  The to argument is not
  * referenced here.
  */
 int
 _ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th,
 		 struct tcpcb *tp, int32_t *tlenp,
 		 int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val,
 		 uint32_t *ts, uint32_t *cnt)
 {
 	int32_t todrop;
 	int32_t thflags;
 	int32_t tlen;
 
 	thflags = *thf;
 	tlen = *tlenp;
 	/* Bytes at the front of the segment that we already received. */
 	todrop = tp->rcv_nxt - th->th_seq;
 	if (todrop > 0) {
 		if (thflags & TH_SYN) {
 			/* The SYN occupies one sequence number; step over it. */
 			thflags &= ~TH_SYN;
 			th->th_seq++;
 			if (th->th_urp > 1)
 				th->th_urp--;
 			else
 				thflags &= ~TH_URG;
 			todrop--;
 		}
 		/*
 		 * Following if statement from Stevens, vol. 2, p. 960.
 		 */
 		if (todrop > tlen
 		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 			/*
 			 * Any valid FIN must be to the left of the window.
 			 * At this point the FIN must be a duplicate or out
 			 * of sequence; drop it.
 			 */
 			thflags &= ~TH_FIN;
 			/*
 			 * Send an ACK to resynchronize and drop any data.
 			 * But keep on processing for RST or ACK.
 			 */
 			ctf_ack_war_checks(tp, ts, cnt);
 			todrop = tlen;
 			KMOD_TCPSTAT_INC(tcps_rcvduppack);
 			KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 		} else {
 			/* Only the leading part of the segment is a duplicate. */
 			KMOD_TCPSTAT_INC(tcps_rcvpartduppack);
 			KMOD_TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
 		/*
 		 * DSACK - add SACK block for dropped range
 		 */
 		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
 			/*
 			 * ACK now, as the next in-sequence segment
 			 * will clear the DSACK block again
 			 */
 			ctf_ack_war_checks(tp, ts, cnt);
 			/* Only record the block if the limiter allowed the ACK. */
 			if (tp->t_flags & TF_ACKNOW)
 				tcp_update_sack_list(tp, th->th_seq,
 						     th->th_seq + todrop);
 		}
 		*drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
 		/* Keep the urgent pointer relative to the new start. */
 		if (th->th_urp > todrop)
 			th->th_urp -= todrop;
 		else {
 			thflags &= ~TH_URG;
 			th->th_urp = 0;
 		}
 	}
 	/*
 	 * If segment ends after window, drop trailing data (and PUSH and
 	 * FIN); if nothing left, just ACK.
 	 */
 	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 	if (todrop > 0) {
 		KMOD_TCPSTAT_INC(tcps_rcvpackafterwin);
 		if (todrop >= tlen) {
 			KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 			/*
 			 * If window is closed can only take segments at
 			 * window edge, and have to drop data and PUSH from
 			 * incoming segments.  Continue processing, but
 			 * remember to ack.  Otherwise, drop segment and
 			 * ack.
 			 */
 			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 				ctf_ack_war_checks(tp, ts, cnt);
 				KMOD_TCPSTAT_INC(tcps_rcvwinprobe);
 			} else {
 				/* Sets *ret_val and disposes of m; we are done. */
 				__ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, ts, cnt);
 				return (1);
 			}
 		} else
 			KMOD_TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 		/* Negative length: trim from the tail of the mbuf chain. */
 		m_adj(m, -todrop);
 		tlen -= todrop;
 		thflags &= ~(TH_PUSH | TH_FIN);
 	}
 	/* Publish the adjusted flags and length to the caller. */
 	*thf = thflags;
 	*tlenp = tlen;
 	return (0);
 }
 
 /*
  * Drop the incoming segment and arrange for an ACK (subject to the
  * ack-war limiter).  On return *ret_val tells the caller whether the
  * tcb (and its lock) are gone: 1 = dropped and unlocked, 0 = the TCB
  * is still locked and valid.
  */
 void
 __ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val, uint32_t *ts, uint32_t *cnt)
 {
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies sequence
 	 * space, where the ACK reflects our state.
 	 *
 	 * We can now skip the test for the RST flag since all paths to this
 	 * code happen after packets containing RST have been dropped.
 	 *
 	 * In the SYN-RECEIVED state, don't send an ACK unless the segment
 	 * we received passes the SYN-RECEIVED ACK test. If it fails send a
 	 * RST.  This breaks the loop in the "LAND" DoS attack, and also
 	 * prevents an ACK storm between two listening ports that have been
 	 * sent forged SYN segments, each with the source address of the
 	 * other.
 	 */
 	if ((tp->t_state == TCPS_SYN_RECEIVED) && (thflags & TH_ACK)) {
 		if (SEQ_GT(tp->snd_una, th->th_ack) ||
 		    SEQ_GT(th->th_ack, tp->snd_max)) {
 			/* ACK outside [snd_una, snd_max]: answer with a RST. */
 			*ret_val = 1;
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return;
 		}
 	}
 	*ret_val = 0;
 	ctf_ack_war_checks(tp, ts, cnt);
 	if (m != NULL) {
 		m_freem(m);
 	}
 }
 
 /*
  * Drop space held by incoming segment and return.  Either argument
  * may be NULL: a non-NULL tp has its inpcb write lock released, a
  * non-NULL m is freed.
  */
 void
 ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
 {
 	if (tp != NULL) {
 		INP_WUNLOCK(tptoinpcb(tp));
 	}
 	if (m != NULL) {
 		m_freem(m);
 	}
 }
 
 /*
  * Handle a segment carrying RST.
  *
  * Returns 1 when the connection was closed (the tcpcb is gone and its
  * lock released), 0 when the tcb is still valid and locked.  The mbuf
  * is always disposed of here: it is freed, or handed to tcp_respond()
  * for the challenge ACK.  ts and cnt are the optional ack-war limiter
  * state used to rate limit challenge ACKs.
  */
 int
 __ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		struct tcpcb *tp, uint32_t *ts, uint32_t *cnt)
 {
 	/*
 	 * RFC5961 Section 3.2
 	 *
 	 * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
 	 * window, we send challenge ACK.
 	 *
 	 * Note: to take into account delayed ACKs, we should test against
 	 * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
 	 * of closed window, not covered by the RFC.
 	 */
 	int dropped = 0;
 
 	if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 		KASSERT(tp->t_state != TCPS_SYN_SENT,
 		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 		    __func__, th, tp));
 
 		if (V_tcp_insecure_rst ||
 		    (tp->last_ack_sent == th->th_seq) ||
 		    (tp->rcv_nxt == th->th_seq)) {
 			KMOD_TCPSTAT_INC(tcps_drops);
 			/* Drop the connection. */
 			switch (tp->t_state) {
 			case TCPS_SYN_RECEIVED:
 				so->so_error = ECONNREFUSED;
 				goto close;
 			case TCPS_ESTABLISHED:
 			case TCPS_FIN_WAIT_1:
 			case TCPS_FIN_WAIT_2:
 			case TCPS_CLOSE_WAIT:
 			case TCPS_CLOSING:
 			case TCPS_LAST_ACK:
 				so->so_error = ECONNRESET;
 		close:
 				tcp_state_change(tp, TCPS_CLOSED);
 				/* FALLTHROUGH */
 			default:
 				tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_RST);
 				tp = tcp_close(tp);
 			}
 			dropped = 1;
 			/* Frees m and unlocks whatever tcp_close() left us. */
 			ctf_do_drop(m, tp);
 		} else {
 			int send_challenge;
 
 			KMOD_TCPSTAT_INC(tcps_badrst);
 			if ((ts != NULL) && (cnt != NULL) &&
 			    (tcp_ack_war_time_window > 0) &&
 			    (tcp_ack_war_cnt > 0)) {
 				/* We are possibly doing ack-rst war prevention */
 				uint32_t cts;
 
 				/*
 				 * We use a msec tick here which gives us
 				 * roughly 49 days. We don't need the
 				 * precision of a microsecond timestamp which
 				 * would only give us hours.
 				 */
 				cts = tcp_ts_getticks();
 				if (TSTMP_LT((*ts), cts)) {
 					/* Timestamp is in the past */
 					*cnt = 0;
 					*ts = (cts + tcp_ack_war_time_window);
 				}
 				if (*cnt < tcp_ack_war_cnt) {
 					*cnt = (*cnt + 1);
 					send_challenge = 1;
 				} else
 					send_challenge = 0;
 			} else
 				send_challenge = 1;
 			if (send_challenge) {
 				/* Send challenge ACK. */
 				tcp_respond(tp, mtod(m, void *), th, m,
 					    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
 				tp->last_ack_sent = tp->rcv_nxt;
 			} else {
 				/*
 				 * The challenge ACK was rate limited, so
 				 * tcp_respond() never takes the mbuf; free
 				 * it here or it is leaked.
 				 */
 				m_freem(m);
 			}
 		}
 	} else {
 		/* RST is outside the window: silently discard it. */
 		m_freem(m);
 	}
 	return (dropped);
 }
 
 /*
  * React to an unexpected SYN: either drop the connection (only when
  * V_tcp_insecure_syn is set and the SYN is in window) or answer with
  * a challenge ACK.
  *
  * *ret_val tells the caller whether the tcb (and lock) are gone:
  * 1 = we dropped it, 0 = the TCB is still locked and valid.
  */
 void
 ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, uint8_t iptos, int32_t * ret_val)
 {
 
 	NET_EPOCH_ASSERT();
 
 	KMOD_TCPSTAT_INC(tcps_badsyn);
 	if (V_tcp_insecure_syn &&
 	    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 		/* In-window SYN and the insecure knob is on: reset. */
 		tp = tcp_drop(tp, ECONNRESET);
 		*ret_val = 1;
 		ctf_do_drop(m, tp);
 		return;
 	}
 	tcp_ecn_input_syn_sent(tp, tcp_get_flags(th), iptos);
 	/* Send challenge ACK. */
 	tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
 	    tp->snd_nxt, TH_ACK);
 	tp->last_ack_sent = tp->rcv_nxt;
 	*ret_val = 0;
 	/* The mbuf went to tcp_respond(); nothing is left to free or unlock. */
 	ctf_do_drop(NULL, NULL);
 }
 
 /*
  * ctf_ts_check returns 1 when the state machine should stop and
  * return, 0 when it may proceed.  In the former case ret_val holds
  * what the caller (hpts_do_segment) should return: 1 indicates
  * that the TCB is unlocked and probably dropped, 0 indicates the
  * TCB is still valid and locked.
  */
 int
 ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int32_t tlen, int32_t thflags, int32_t * ret_val)
 {
 
 	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 		/*
 		 * Invalidate ts_recent.  If this segment updates ts_recent,
 		 * the age will be reset later and ts_recent will get a
 		 * valid value.  If it does not, setting ts_recent to zero
 		 * will at least satisfy the requirement that zero be placed
 		 * in the timestamp echo reply when ts_recent isn't valid.
 		 * The age isn't reset until we get a valid ts_recent
 		 * because we don't want out-of-order segments to be dropped
 		 * when ts_recent is old.
 		 */
 		tp->ts_recent = 0;
 		return (0);
 	}
 	/* ts_recent is still fresh: count and drop the segment. */
 	KMOD_TCPSTAT_INC(tcps_rcvduppack);
 	KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 	KMOD_TCPSTAT_INC(tcps_pawsdrop);
 	*ret_val = 0;
 	if (tlen == 0) {
 		/* No data: just free the mbuf, the lock stays held. */
 		ctf_do_drop(m, NULL);
 	} else {
 		ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
 	}
 	return (1);
 }
 
 /*
  * Variant of ctf_ts_check() that has no mbuf to dispose of.  Returns 1
  * when the input should be dropped (the duplicate and PAWS drop
  * counters are bumped), 0 after invalidating an aged ts_recent.
  * thflags is not used.
  */
 int
 ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags)
 {
 
 	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
 		/*
 		 * Invalidate ts_recent.  If this segment updates ts_recent,
 		 * the age will be reset later and ts_recent will get a
 		 * valid value.  If it does not, setting ts_recent to zero
 		 * will at least satisfy the requirement that zero be placed
 		 * in the timestamp echo reply when ts_recent isn't valid.
 		 * The age isn't reset until we get a valid ts_recent
 		 * because we don't want out-of-order segments to be dropped
 		 * when ts_recent is old.
 		 */
 		tp->ts_recent = 0;
 		return (0);
 	}
 	KMOD_TCPSTAT_INC(tcps_rcvduppack);
 	KMOD_TCPSTAT_INC(tcps_pawsdrop);
 	return (1);
 }
 
 
 
 /*
  * Recompute tp->rcv_wnd: the free space in the socket receive buffer
  * (never negative), but not less than the window we have already
  * advertised (rcv_adv - rcv_nxt).
  */
 void
 ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
 {
 	int32_t space;
 	int32_t advertised;
 
 	space = sbspace(&so->so_rcv);
 	if (space < 0)
 		space = 0;
 	advertised = (int)(tp->rcv_adv - tp->rcv_nxt);
 	tp->rcv_wnd = imax(space, advertised);
 }
 
 /*
  * Answer the segment via tcp_dropwithreset() and then drop the
  * connection with ETIMEDOUT.  If tcp_drop() hands a tcpcb back, its
  * inpcb write lock is released.
  */
 void
 ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
     int32_t rstreason, int32_t tlen)
 {
 	struct tcpcb *remaining;
 
 	tcp_dropwithreset(m, th, tp, tlen, rstreason);
 	remaining = tcp_drop(tp, ETIMEDOUT);
 	if (remaining != NULL) {
 		INP_WUNLOCK(tptoinpcb(remaining));
 	}
 }
 
 /* Thin wrapper: returns whatever tcp_fixed_maxseg() reports for tp. */
 uint32_t
 ctf_fixed_maxseg(struct tcpcb *tp)
 {
 	uint32_t maxseg;
 
 	maxseg = tcp_fixed_maxseg(tp);
 	return (maxseg);
 }
 
 /*
  * Emit a TCP_SACK_FILTER_RES log record holding the number of SACK
  * blocks and the start/end of up to the first four of them.  Nothing
  * is done when logging is off for this connection.
  */
 void
 ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
 {
 	union tcp_log_stackspecific log;
 	struct timeval tv;
 
 	if (tp->t_logstate == TCP_LOG_STATE_OFF)
 		return;
 
 	memset(&log, 0, sizeof(log));
 	log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 	log.u_bbr.flex8 = num_sack_blks;
 	/* Blocks 0..3 map onto flex1/2, flex3/4, flex5/6, applimited/pkts_out. */
 	if (num_sack_blks >= 1) {
 		log.u_bbr.flex1 = sack_blocks[0].start;
 		log.u_bbr.flex2 = sack_blocks[0].end;
 		if (num_sack_blks >= 2) {
 			log.u_bbr.flex3 = sack_blocks[1].start;
 			log.u_bbr.flex4 = sack_blocks[1].end;
 			if (num_sack_blks >= 3) {
 				log.u_bbr.flex5 = sack_blocks[2].start;
 				log.u_bbr.flex6 = sack_blocks[2].end;
 				if (num_sack_blks >= 4) {
 					log.u_bbr.applimited = sack_blocks[3].start;
 					log.u_bbr.pkts_out = sack_blocks[3].end;
 				}
 			}
 		}
 	}
 	TCP_LOG_EVENTP(tp, NULL,
 	    &tptosocket(tp)->so_rcv,
 	    &tptosocket(tp)->so_snd,
 	    TCP_SACK_FILTER_RES, 0,
 	    0, &log, false, &tv);
 }
 
 /*
  * Reduce count by decay, where decay is expressed in tenths of a
  * percent (1000 = 100%, 193 = 19.3%).  A decay above 1000 leaves
  * count unchanged.
  */
 uint32_t
 ctf_decay_count(uint32_t count, uint32_t decay)
 {
 	uint64_t reduction;
 
 	/* We never raise the count. */
 	if (decay > 1000)
 		return (count);
 	/* 64-bit arithmetic so count * decay cannot overflow. */
 	reduction = ((uint64_t)count * (uint64_t)decay) / 1000;
 	return (count - (uint32_t)reduction);
 }
 
 /*
  * Check whether the connection has gone t_maxunacktime ticks since
  * t_acktime.  Returns 1 when the limit is reached (optionally logging
  * the end status), 0 otherwise or when either value is unset.
  */
 int32_t
 ctf_progress_timeout_check(struct tcpcb *tp, bool log)
 {
 	/* The check only applies when both values are set and t_acktime has passed. */
 	if (!(tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)))
 		return (0);
 	if ((ticks - tp->t_acktime) < tp->t_maxunacktime)
 		return (0);
 	/*
 	 * There is an assumption that the caller
 	 * will drop the connection so we will
 	 * increment the counters here.
 	 */
 	if (log)
 		tcp_log_end_status(tp, TCP_EI_STATUS_PROGRESS);
 #ifdef NETFLIX_STATS
 	KMOD_TCPSTAT_INC(tcps_progdrops);
 #endif
 	return (1);
 }
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 99fcd1f90a8a..01fd5eed34c8 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,3968 +1,3969 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/arb.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/khelp.h>
 #endif
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
 #include <sys/qmath.h>
 #include <sys/stats.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <netinet/tcp.h>
 #ifdef INVARIANTS
 #define TCPSTATES
 #endif
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_ecn.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcpip.h>
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 #include <crypto/siphash/siphash.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef INET6
 static ip6proto_ctlinput_t tcp6_ctlinput;
 static udp_tun_icmp_t tcp6_ctlinput_viaudp;
 #endif
 
 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
 #ifdef NETFLIX_EXP_DETECTION
 /*  Sack attack detection thresholds and such */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Sack Attack detection thresholds");
 int32_t tcp_force_detection = 0;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
     CTLFLAG_RW,
     &tcp_force_detection, 0,
     "Do we force detection even if the INP has it off?");
 int32_t tcp_sack_to_ack_thresh = 700;	/* 70 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
     CTLFLAG_RW,
     &tcp_sack_to_ack_thresh, 700,
     "Percentage of sacks to acks we must see above (10.1 percent is 101)?");
 int32_t tcp_sack_to_move_thresh = 600;	/* 60 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh,
     CTLFLAG_RW,
     &tcp_sack_to_move_thresh, 600,
     "Percentage of sack moves we must see above (10.1 percent is 101)");
 /*
  * Threshold, in tenths of a percent, exported as
  * net.inet.tcp.sack_attack.restore_thresh.
  *
  * NOTE(review): the variable is initialised to 650 while the value argument
  * handed to SYSCTL_INT() below is 550 -- confirm which one is intended and
  * whether the macro argument has any effect when a pointer is supplied.
  */
 int32_t tcp_restoral_thresh = 650;	/* 65 % (sack:2:ack -5%) */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh,
     CTLFLAG_RW,
     &tcp_restoral_thresh, 550,
     "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)");
 int32_t tcp_sad_decay_val = 800;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per,
     CTLFLAG_RW,
     &tcp_sad_decay_val, 800,
     "The decay percentage (10.1 percent equals 101 )");
 int32_t tcp_map_minimum = 500;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps,
     CTLFLAG_RW,
     &tcp_map_minimum, 500,
     "Number of Map enteries before we start detection");
 int32_t tcp_attack_on_turns_on_logging = 0;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged,
     CTLFLAG_RW,
     &tcp_attack_on_turns_on_logging, 0,
    "When we have a positive hit on attack, do we turn on logging?");
 int32_t tcp_sad_pacing_interval = 2000;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int,
     CTLFLAG_RW,
     &tcp_sad_pacing_interval, 2000,
     "What is the minimum pacing interval for a classified attacker?");
 
 int32_t tcp_sad_low_pps = 100;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps,
     CTLFLAG_RW,
     &tcp_sad_low_pps, 100,
     "What is the input pps that below which we do not decay?");
 #endif
 uint32_t tcp_ack_war_time_window = 1000;
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
     CTLFLAG_RW,
     &tcp_ack_war_time_window, 1000,
    "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
 uint32_t tcp_ack_war_cnt = 5;
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
     CTLFLAG_RW,
     &tcp_ack_war_cnt, 5,
    "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
 
 struct rwlock tcp_function_lock;
 
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_v6mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_v6mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif /* INET6 */
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculously low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_minmss), 0,
     "Minimum TCP Maximum Segment Size");
 
 VNET_DEFINE(int, tcp_do_rfc1323) = 1;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc1323), 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 /*
  * As of June 2021, several TCP stacks violate RFC 7323 from September 2014.
  * Some stacks negotiate TS, but never send them after connection setup. Some
  * stacks negotiate TS, but don't send them when sending keep-alive segments.
  * These include modern widely deployed TCP stacks.
  * Therefore tolerating violations for now...
  */
 VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_tolerate_missing_ts), 0,
     "Tolerate missing TCP timestamps");
 
 VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_ts_offset_per_conn), 0,
     "Initialize TCP timestamps per connection instead of per host pair");
 
 /* How many connections are pacing */
 static volatile uint32_t number_of_tcp_connections_pacing = 0;
 static uint32_t shadow_num_connections = 0;
 
 /*
  * Limit on the number of pacing connections, exported as
  * net.inet.tcp.pacing_limit.
  *
  * NOTE(review): the variable is initialised to 10000 while the value
  * argument handed to SYSCTL_INT() below is 1000 -- confirm which one is
  * intended.
  */
 static int tcp_pacing_limit = 10000;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
     &tcp_pacing_limit, 1000,
     "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
     &shadow_num_connections, 0, "Number of TCP connections being paced");
 
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 static int	tcp_tcbhashsize;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
 
 VNET_DEFINE_STATIC(int, icmp_may_rst) = 1;
 #define	V_icmp_may_rst			VNET(icmp_may_rst)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(icmp_may_rst), 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0;
 #define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_isn_reseed_interval), 0,
     "Seconds between reseeding of ISN secret");
 
 static int	tcp_soreceive_stream;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
     &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
 
 VNET_DEFINE(uma_zone_t, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0;	/* unlimited */
 static int
 sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_tcp_map_entries_limit;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		/* only allow "0" and value > minimum */
 		if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT)
 			error = EINVAL;
 		else
 			V_tcp_map_entries_limit = new;
 	}
 	return (error);
 }
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_map_entries_limit), 0,
     &sysctl_net_inet_tcp_map_limit_check, "IU",
     "Total sendmap entries limit");
 
 VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0;	/* unlimited */
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_map_split_limit), 0,
     "Total sendmap split entries limit");
 
 #ifdef TCP_HHOOK
 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 #endif
 
 #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH
 VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
 #define	V_ts_offset_secret	VNET(ts_offset_secret)
 
 static int	tcp_default_fb_init(struct tcpcb *tp);
 static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
 static int	tcp_default_handoff_ok(struct tcpcb *tp);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc(struct inpcb *, int);
 static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    const void *ip4hdr, const void *ip6hdr);
 static ipproto_ctlinput_t	tcp_ctlinput;
 static udp_tun_icmp_t		tcp_ctlinput_viaudp;
 
 /*
  * Function block of the built-in stack, registered under the name
  * "freebsd".  tcp_func_set_ptr initially points here, and
  * tcp_switch_back_to_default() falls back to it when no other stack
  * can take a connection.
  */
 static struct tcp_function_block tcp_def_funcblk = {
 	.tfb_tcp_block_name = "freebsd",
 	.tfb_tcp_output = tcp_default_output,
 	.tfb_tcp_do_segment = tcp_do_segment,
 	.tfb_tcp_ctloutput = tcp_default_ctloutput,
 	.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
 	.tfb_tcp_fb_init = tcp_default_fb_init,
 	.tfb_tcp_fb_fini = tcp_default_fb_fini,
 };
 
 static int tcp_fb_cnt = 0;
 struct tcp_funchead t_functions;
 static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
 
 /*
  * Account for a received DSACK block.
  *
  * tp    - connection the DSACK was received on.
  * start - one edge of the reported block.
  * end   - the other edge; the two edges are accepted in either order.
  * tlp   - non-zero when the DSACK is attributed to a tail loss probe.
  *
  * The DSACK count is always bumped.  The byte length of the block is then
  * added to the non-TLP counters (t_dsack_bytes, tcps_dsack_bytes) when tlp
  * is zero, and to the TLP counters (t_dsack_tlp_bytes,
  * tcps_dsack_tlp_bytes) otherwise, so that the per-connection and the
  * global statistics always agree on the classification.
  */
 void
 tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp)
 {
 	tcp_seq len;
 
 	TCPSTAT_INC(tcps_dsack_count);
 	tp->t_dsack_pack++;
 
 	/* Length of the block regardless of which edge was passed first. */
 	if (SEQ_GT(end, start))
 		len = end - start;
 	else
 		len = start - end;
 
 	/* The classification depends on tlp only, not on the edge order. */
 	if (tlp == 0) {
 		tp->t_dsack_bytes += len;
 		TCPSTAT_ADD(tcps_dsack_bytes, len);
 	} else {
 		tp->t_dsack_tlp_bytes += len;
 		TCPSTAT_ADD(tcps_dsack_tlp_bytes, len);
 	}
 }
 
 /*
  * Look up a registered function block by name.
  *
  * fs - carries the name to search for in function_set_name.
  *
  * Returns the function block of the first t_functions entry whose name
  * matches, or NULL.  Callers in this file hold tcp_function_lock around
  * the call.
  */
 static struct tcp_function_block *
 find_tcp_functions_locked(struct tcp_function_set *fs)
 {
 	struct tcp_function *entry;
 
 	TAILQ_FOREACH(entry, &t_functions, tf_next) {
 		if (strcmp(entry->tf_name, fs->function_set_name) == 0)
 			return (entry->tf_fb);
 	}
 	return (NULL);
 }
 
 /*
  * Check whether a function block is present on the t_functions list.
  *
  * blk - function block to look for (compared by address).
  * s   - optional; when non-NULL it receives the first list entry that
  *       refers to blk.
  *
  * Returns blk when it is registered, NULL otherwise.  Callers in this
  * file hold tcp_function_lock around the call.
  */
 static struct tcp_function_block *
 find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
 {
 	struct tcp_function *entry;
 
 	TAILQ_FOREACH(entry, &t_functions, tf_next) {
 		if (entry->tf_fb != blk)
 			continue;
 		if (s != NULL)
 			*s = entry;
 		return (blk);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a function block by name and take a reference on it.
  *
  * The lookup and the reference are done under a read lock on
  * tcp_function_lock.  Returns the referenced block, or NULL when no
  * entry carries the requested name.
  */
 struct tcp_function_block *
 find_and_ref_tcp_functions(struct tcp_function_set *fs)
 {
 	struct tcp_function_block *found;
 
 	rw_rlock(&tcp_function_lock);
 	found = find_tcp_functions_locked(fs);
 	if (found != NULL)
 		refcount_acquire(&found->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return (found);
 }
 
 /*
  * Take a reference on a function block if it is still registered.
  *
  * The check and the reference are done under a read lock on
  * tcp_function_lock.  Returns blk with its reference count raised, or
  * NULL when blk is not on the t_functions list.
  */
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *blk)
 {
 	struct tcp_function_block *found;
 
 	rw_rlock(&tcp_function_lock);
 	found = find_tcp_fb_locked(blk, NULL);
 	if (found != NULL)
 		refcount_acquire(&found->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return (found);
 }
 
 /*
  * Find an alias for the given tcp_function_block, i.e. a list entry that
  * refers to blk under a name different from blk's own block name.
  *
  * On success the alias is copied, NUL terminated, into
  * fs->function_set_name and 1 is returned.  Otherwise the name in fs is
  * set to the empty string and 0 is returned.
  */
 int
 find_tcp_function_alias(struct tcp_function_block *blk,
     struct tcp_function_set *fs)
 {
 	struct tcp_function *entry;
 	int found = 0;
 
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(entry, &t_functions, tf_next) {
 		if (entry->tf_fb != blk)
 			continue;
 		/* Skip the entry registered under the block's own name. */
 		if (strncmp(entry->tf_name, blk->tfb_tcp_block_name,
 		    TCP_FUNCTION_NAME_LEN_MAX) == 0)
 			continue;
 		/* Matching function block with different name. */
 		strncpy(fs->function_set_name, entry->tf_name,
 		    TCP_FUNCTION_NAME_LEN_MAX);
 		/* strncpy() does not terminate a full-length name. */
 		fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
 		found = 1;
 		break;
 	}
 	if (!found)
 		fs->function_set_name[0] = '\0';
 	rw_runlock(&tcp_function_lock);
 	return (found);
 }
 
 /*
  * Return the currently selected default function block
  * (tcp_func_set_ptr) with a reference taken on it.  The pointer is read
  * and the reference acquired under a read lock on tcp_function_lock.
  */
 static struct tcp_function_block *
 find_and_ref_tcp_default_fb(void)
 {
 	struct tcp_function_block *dflt;
 
 	rw_rlock(&tcp_function_lock);
 	dflt = tcp_func_set_ptr;
 	refcount_acquire(&dflt->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 
 	return (dflt);
 }
 
 /*
  * Move a connection off its current (non built-in) stack.
  *
  * The current stack's fini hook is run and its reference dropped.  The
  * user-selected default stack is tried first, unless it is the stack being
  * left, refuses the connection (its handoff_ok hook returns non-zero) or
  * fails to initialise.  Failing that, the built-in tcp_def_funcblk is
  * installed; the function panics if that block cannot be referenced,
  * refuses the connection or fails to initialise.
  *
  * On return tp->t_fb points at the stack that took over.
  */
 void
 tcp_switch_back_to_default(struct tcpcb *tp)
 {
 	struct tcp_function_block *tfb;
 
 	KASSERT(tp->t_fb != &tcp_def_funcblk,
 	    ("%s: called by the built-in default stack", __func__));
 
 	/*
 	 * Release the old stack. This function will either find a new one
 	 * or panic.
 	 */
 	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 	refcount_release(&tp->t_fb->tfb_refcnt);
 
 	/*
 	 * Now, we'll find a new function block to use.
 	 * Start by trying the current user-selected
 	 * default, unless this stack is the user-selected
 	 * default.
 	 */
 	tfb = find_and_ref_tcp_default_fb();
 	if (tfb == tp->t_fb) {
 		refcount_release(&tfb->tfb_refcnt);
 		tfb = NULL;
 	}
 	/* Does the stack accept this connection? */
 	if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
 	    (*tfb->tfb_tcp_handoff_ok)(tp)) {
 		/* A non-zero answer is a refusal; give the reference back. */
 		refcount_release(&tfb->tfb_refcnt);
 		tfb = NULL;
 	}
 	/* Try to use that stack. */
 	if (tfb != NULL) {
 		/* Initialize the new stack. If it succeeds, we are done. */
 		tp->t_fb = tfb;
 		if (tp->t_fb->tfb_tcp_fb_init == NULL ||
 		    (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
 			return;
 
 		/*
 		 * Initialization failed. Release the reference count on
 		 * the stack.
 		 */
 		refcount_release(&tfb->tfb_refcnt);
 	}
 
 	/*
 	 * If that wasn't feasible, use the built-in default
 	 * stack which is not allowed to reject anyone.
 	 */
 	tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
 	if (tfb == NULL) {
 		/* there always should be a default */
 		panic("Can't refer to tcp_def_funcblk");
 	}
 	if (tfb->tfb_tcp_handoff_ok != NULL) {
 		if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
 			/* The default stack cannot say no */
 			panic("Default stack rejects a new session?");
 		}
 	}
 	tp->t_fb = tfb;
 	if (tp->t_fb->tfb_tcp_fb_init != NULL &&
 	    (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
 		/* The default stack cannot fail */
 		panic("Default stack initialization failed");
 	}
 }
 
 /*
  * Input hook for TCP segments that arrive encapsulated in UDP.
  *
  * The UDP header located off bytes into the first mbuf is removed by
  * sliding the TCP header, and whatever follows it in that mbuf, down over
  * it.  The mbuf and IP/IPv6 length fields are reduced by the size of the
  * UDP header and the packet is passed to the TCP input routine for its IP
  * version together with the UDP source port.
  *
  * m   - the packet; it must carry a packet header.  It is either handed
  *       to TCP input or freed here.
  * off - offset of the UDP header from the start of the mbuf data.
  * inp, sa, ctx - not used.
  *
  * Always returns true.
  */
 static bool
 tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct ip *iph;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct udphdr *uh;
 	struct tcphdr *th;
 	int thlen;
 	uint16_t port;
 
 	TCPSTAT_INC(tcps_tunneled_pkts);
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* Can't handle one that is not a pkt hdr */
 		TCPSTAT_INC(tcps_tunneled_errs);
 		goto out;
 	}
 	/* Make the UDP header and a minimal TCP header contiguous. */
 	thlen = sizeof(struct tcphdr);
 	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
 	    (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
 		TCPSTAT_INC(tcps_tunneled_errs);
 		goto out;
 	}
 	iph = mtod(m, struct ip *);
 	uh = (struct udphdr *)((caddr_t)iph + off);
 	th = (struct tcphdr *)(uh + 1);
 	/* Now that th_off is readable, pull up the TCP options as well. */
 	thlen = th->th_off << 2;
 	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
 		m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
 		if (m == NULL) {
 			TCPSTAT_INC(tcps_tunneled_errs);
 			goto out;
 		} else {
 			/* The data may have moved; recompute the pointers. */
 			iph = mtod(m, struct ip *);
 			uh = (struct udphdr *)((caddr_t)iph + off);
 			th = (struct tcphdr *)(uh + 1);
 		}
 	}
 	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
 	/*
 	 * Slide everything behind the UDP header down over it.  Only
 	 * m_len - off - sizeof(struct udphdr) bytes of this mbuf lie
 	 * behind the UDP header; copying more would read past the valid
 	 * data of the mbuf.  The regions overlap, which bcopy() handles.
 	 */
 	bcopy(th, uh, m->m_len - off - sizeof(struct udphdr));
 	m->m_len -= sizeof(struct udphdr);
 	m->m_pkthdr.len -= sizeof(struct udphdr);
 	/*
 	 * We use the same algorithm for
 	 * both UDP and TCP for c-sum. So
 	 * the code in tcp_input will skip
 	 * the checksum. So we do nothing
 	 * with the flag (m->m_pkthdr.csum_flags).
 	 */
 	switch (iph->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
 		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
 		break;
 #endif
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
 		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
 		break;
 #endif
 	default:
 		/* Unknown IP version: drop the packet. */
 		goto out;
 	}
 	return (true);
 out:
 	m_freem(m);
 
 	return (true);
 }
 
 /*
  * Sysctl handler that reports and changes the default TCP function block.
  *
  * The name of the block tcp_func_set_ptr refers to is reported.  When a
  * new name is written, the matching registered block becomes the default;
  * ENOENT is returned when no block carries that name or the block is
  * flagged TCP_FUNC_BEING_REMOVED.
  */
 static int
 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
 {
 	struct tcp_function_set fs;
 	struct tcp_function_block *blk;
 	int error;
 
 	/* Snapshot the name of the current default under the read lock. */
 	memset(&fs, 0, sizeof(fs));
 	rw_rlock(&tcp_function_lock);
 	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
 	if (blk != NULL) {
 		/* Found him */
 		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
 		fs.pcbcnt = blk->tfb_refcnt;
 	}
 	rw_runlock(&tcp_function_lock);
 
 	error = sysctl_handle_string(oidp, fs.function_set_name,
 	    sizeof(fs.function_set_name), req);
 	/* Check for error or no change */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* Switch the default; fs now holds the requested name. */
 	rw_wlock(&tcp_function_lock);
 	blk = find_tcp_functions_locked(&fs);
 	if (blk != NULL && (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) == 0)
 		tcp_func_set_ptr = blk;
 	else
 		error = ENOENT;
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
     "Set/get the default TCP functions");
 
 /*
  * Sysctl handler that renders the registered TCP function blocks as a
  * text table with one row per t_functions entry: block name, a '*' on the
  * current default, the entry's name and the block's reference count.
  *
  * The list is counted under the lock, the lock is dropped for the
  * M_WAITOK allocation and then re-taken for formatting, so the list may
  * have changed in between; a row that no longer fits yields EOVERFLOW.
  */
 static int
 sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
 {
 	int error, cnt, linesz;
 	struct tcp_function *f;
 	char *buffer, *cp;
 	size_t bufsz, outsz;
 	bool alias;
 
 	cnt = 0;
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		cnt++;
 	}
 	rw_runlock(&tcp_function_lock);
 
 	/* Room for the header row plus one spare row beyond the count. */
 	bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1;
 	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
 
 	error = 0;
 	cp = buffer;
 
 	linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D',
 	    "Alias", "PCB count");
 	cp += linesz;
 	bufsz -= linesz;
 	outsz = linesz;
 
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		/*
 		 * NOTE(review): this compares addresses, not string
 		 * contents.  If tf_name is storage embedded in struct
 		 * tcp_function the test is always true and "-" is never
 		 * printed -- confirm against the structure definition.
 		 */
 		alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name);
 		linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
 		    f->tf_fb->tfb_tcp_block_name,
 		    (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
 		    alias ? f->tf_name : "-",
 		    f->tf_fb->tfb_refcnt);
 		/* The row was truncated: the list outgrew the buffer. */
 		if (linesz >= bufsz) {
 			error = EOVERFLOW;
 			break;
 		}
 		cp += linesz;
 		bufsz -= linesz;
 		outsz += linesz;
 	}
 	rw_runlock(&tcp_function_lock);
 	if (error == 0)
 		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
 	free(buffer, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_net_inet_list_available, "A",
     "list available TCP Function sets");
 
 VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
 
 #ifdef INET
 VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
 #define	V_udp4_tun_socket	VNET(udp4_tun_socket)
 #endif
 #ifdef INET6
 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
 #define	V_udp6_tun_socket	VNET(udp6_tun_socket)
 #endif
 
 static struct sx tcpoudp_lock;
 
 /*
  * Close the IPv4 and IPv6 UDP tunneling sockets, if they exist, and clear
  * the corresponding pointers.  tcpoudp_lock must be held exclusively.
  */
 static void
 tcp_over_udp_stop(void)
 {
 
 	sx_assert(&tcpoudp_lock, SA_XLOCKED);
 
 #ifdef INET
 	if (V_udp4_tun_socket != NULL) {
 		soclose(V_udp4_tun_socket);
 		V_udp4_tun_socket = NULL;
 	}
 #endif
 #ifdef INET6
 	if (V_udp6_tun_socket != NULL) {
 		soclose(V_udp6_tun_socket);
 		V_udp6_tun_socket = NULL;
 	}
 #endif
 }
 
 /*
  * Create the UDP sockets used for TCP-over-UDP tunneling, install
  * tcp_recv_udp_tunneled_packet() as their kernel tunneling hook and bind
  * them to V_tcp_udp_tunneling_port.  tcpoudp_lock must be held
  * exclusively.
  *
  * Returns 0 on success, EINVAL when no port is configured, EALREADY when
  * a tunneling socket already exists, or the error of the failing socket
  * call.  On any failure after socket creation was attempted,
  * tcp_over_udp_stop() is called to close whatever was created.
  */
 static int
 tcp_over_udp_start(void)
 {
 	uint16_t port;
 	int ret;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 
 	sx_assert(&tcpoudp_lock, SA_XLOCKED);
 
 	port = V_tcp_udp_tunneling_port;
 	/*
 	 * NOTE(review): port is passed through htons() before binding below,
 	 * so it looks like a host-order value here; the ntohs() does not
 	 * change the outcome of a comparison with zero -- confirm intent.
 	 */
 	if (ntohs(port) == 0) {
 		/* Must have a port set */
 		return (EINVAL);
 	}
 #ifdef INET
 	if (V_udp4_tun_socket != NULL) {
 		/* Already running -- must stop first */
 		return (EALREADY);
 	}
 #endif
 #ifdef INET6
 	if (V_udp6_tun_socket != NULL) {
 		/* Already running -- must stop first */
 		return (EALREADY);
 	}
 #endif
 #ifdef INET
 	if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
 	    SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Call the special UDP hook. */
 	if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
 	    tcp_recv_udp_tunneled_packet,
 	    tcp_ctlinput_viaudp,
 	    NULL))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Ok, we have a socket, bind it to the port. */
 	memset(&sin, 0, sizeof(struct sockaddr_in));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_port = htons(port);
 	if ((ret = sobind(V_udp4_tun_socket,
 	    (struct sockaddr *)&sin, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 #endif
 #ifdef INET6
 	if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
 	    SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Call the special UDP hook. */
 	if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
 	    tcp_recv_udp_tunneled_packet,
 	    tcp6_ctlinput_viaudp,
 	    NULL))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Ok, we have a socket, bind it to the port. */
 	memset(&sin6, 0, sizeof(struct sockaddr_in6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_port = htons(port);
 	if ((ret = sobind(V_udp6_tun_socket,
 	    (struct sockaddr *)&sin6, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 #endif
 	return (0);
 }
 
 static int
 sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t old, new;
 
 	old = V_tcp_udp_tunneling_port;
 	new = old;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if ((error == 0) &&
 	    (req->newptr != NULL)) {
 		if ((new < TCP_TUNNELING_PORT_MIN) ||
 		    (new > TCP_TUNNELING_PORT_MAX)) {
 			error = EINVAL;
 		} else {
 			sx_xlock(&tcpoudp_lock);
 			V_tcp_udp_tunneling_port = new;
 			if (old != 0) {
 				tcp_over_udp_stop();
 			}
 			if (new != 0) {
 				error = tcp_over_udp_start();
 				if (error != 0) {
 					V_tcp_udp_tunneling_port = 0;
 				}
 			}
 			sx_xunlock(&tcpoudp_lock);
 		}
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(tcp_udp_tunneling_port),
     0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
     "Tunneling port for tcp over udp");
 
 VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
 
 static int
 sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_udp_tunneling_overhead;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
 		    (new > TCP_TUNNELING_OVERHEAD_MAX))
 			error = EINVAL;
 		else
 			V_tcp_udp_tunneling_overhead = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(tcp_udp_tunneling_overhead),
     0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
     "MSS reduction when using tcp over udp");
 
 /*
  * Exports one (struct tcp_function_info) for each alias/name.
  *
  * Writes are rejected with EINVAL.  When the request carries no old
  * buffer, only a size is reported; it covers one entry more than the
  * number of entries counted.  Otherwise the return value is the result
  * of the last copy-out (or of wiring the buffer if the list is empty).
  */
 static int
 sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS)
 {
 	int cnt, error;
 	struct tcp_function *f;
 	struct tcp_function_info tfi;
 
 	/*
 	 * We don't allow writes.
 	 */
 	if (req->newptr != NULL)
 		return (EINVAL);
 
 	/*
 	 * Wire the old buffer so we can directly copy the functions to
 	 * user space without dropping the lock.
 	 */
 	if (req->oldptr != NULL) {
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Walk the list and copy out matching entries. If INVARIANTS
 	 * is compiled in, also walk the list to verify the length of
 	 * the list matches what we have recorded.
 	 */
 	rw_rlock(&tcp_function_lock);
 
 	cnt = 0;
 #ifndef INVARIANTS
 	/* Size-only request: use the recorded count, no walk needed. */
 	if (req->oldptr == NULL) {
 		cnt = tcp_fb_cnt;
 		goto skip_loop;
 	}
 #endif
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 #ifdef INVARIANTS
 		cnt++;
 #endif
 		if (req->oldptr != NULL) {
 			/* Zero first so no stale stack bytes are copied out. */
 			bzero(&tfi, sizeof(tfi));
 			tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
 			tfi.tfi_id = f->tf_fb->tfb_id;
 			(void)strlcpy(tfi.tfi_alias, f->tf_name,
 			    sizeof(tfi.tfi_alias));
 			(void)strlcpy(tfi.tfi_name,
 			    f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name));
 			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
 			/*
 			 * Don't stop on error, as that is the
 			 * mechanism we use to accumulate length
 			 * information if the buffer was too short.
 			 */
 		}
 	}
 	KASSERT(cnt == tcp_fb_cnt,
 	    ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
 #ifndef INVARIANTS
 skip_loop:
 #endif
 	rw_runlock(&tcp_function_lock);
 	if (req->oldptr == NULL)
 		error = SYSCTL_OUT(req, NULL,
 		    (cnt + 1) * sizeof(struct tcp_function_info));
 
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
 	    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
 	    "List TCP function block name-to-ID mappings");
 
 /*
  * tfb_tcp_handoff_ok() function for the default stack.
  * Note that we'll basically try to take all comers.
  *
  * Always returns 0; tcp_switch_back_to_default() treats a non-zero
  * return from this hook as a refusal.
  */
 static int
 tcp_default_handoff_ok(struct tcpcb *tp)
 {
 
 	return (0);
 }
 
 /*
  * tfb_tcp_fb_init() function for the default stack.
  *
  * This handles making sure we have appropriate timers set if you are
  * transitioning a socket that has some amount of setup done.
  *
  * The init() function from the default can *never* return non-zero i.e.
  * it is required to always succeed since it is the stack of last resort!
  *
  * The inpcb write lock must be held.  Always returns 0.
  */
 static int
 tcp_default_fb_init(struct tcpcb *tp)
 {
 	struct socket *so = tptosocket(tp);
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
 	    ("%s: connection %p in unexpected state %d", __func__, tp,
 	    tp->t_state));
 
 	/*
 	 * Nothing to do for states up to and including LISTEN. And, we don't
 	 * know what to do for unexpected states (which includes TIME_WAIT).
 	 */
 	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
 		return (0);
 
 	/*
 	 * Make sure some kind of transmission timer is set if there is
 	 * outstanding data.
 	 */
 	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
 	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
 	    tcp_timer_active(tp, TT_PERSIST))) {
 		/*
 		 * If the session has established and it looks like it should
 		 * be in the persist state, set the persist timer. Otherwise,
 		 * set the retransmit timer.
 		 */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
 		    (int32_t)(tp->snd_nxt - tp->snd_una) <
 		    (int32_t)sbavail(&so->so_snd))
 			tcp_setpersist(tp);
 		else
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 	}
 
 	/* All non-embryonic sessions get a keepalive timer. */
 	if (!tcp_timer_active(tp, TT_KEEP))
 		tcp_timer_activate(tp, TT_KEEP,
 		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
 		    TP_KEEPINIT(tp));
 
 	/*
 	 * Make sure critical variables are initialized
 	 * if transitioning while in Recovery.
 	 */
 	/* The parentheses of the condition come from the macro expansion. */
 	if IN_FASTRECOVERY(tp->t_flags) {
 		if (tp->sackhint.recover_fs == 0)
 			tp->sackhint.recover_fs = max(1,
 			    tp->snd_nxt - tp->snd_una);
 	}
 
 	return (0);
 }
 
 /*
  * tfb_tcp_fb_fini() function for the default stack.
  *
  * This changes state as necessary (or prudent) to prepare for another stack
  * to assume responsibility for the connection.
  *
  * The default stack currently has nothing to undo: the body only asserts
  * that the caller holds the inpcb write lock.  The tcb_is_purged argument
  * is not examined.
  */
 static void
 tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
 {
 
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 }
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * The value 0 (the default when the kernel configuration does not define
  * TCBHASHSIZE) makes tcp_init() auto-tune the size from maxsockets.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	0
 #endif
 
 /* malloc(9) types used by the TCP code. */
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 /* M_TCPFUNCTIONS backs the struct tcp_function name entries. */
 MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
 
 /*
  * Mutex behind the ISN_LOCK()/ISN_UNLOCK() macros below; it is set up by
  * ISN_LOCK_INIT(), which tcp_init() invokes.
  * NOTE(review): presumably serialises initial sequence number state --
  * the users are outside this part of the file, confirm there.
  */
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 /*
  * Storage description for TCP inpcbs; tcp_vnet_init() passes it to
  * in_pcbinfo_init() when setting up V_tcbinfo.
  */
 INPCBSTORAGE_DEFINE(tcpcbstor, tcpcb, "tcpinp", "tcp_inpcb", "tcp", "tcphash");
 
 /*
  * Round a requested size up to a power of two for the tcp_inpcb hash.
  *
  * Returns the smallest power of two strictly greater than `size', capped
  * at 2^30 (the largest power of two that fits in a 32-bit int).  A size
  * of zero or less yields 1.
  *
  * The value is built by doubling instead of "1 << fls(size)": for
  * size >= 2^30 that expression shifts into the sign bit, which is
  * undefined behaviour for a signed int, and the former overflow check
  * relied on the result of that shift coming out negative.
  */
 static int
 maketcp_hashsize(int size)
 {
 	const int hashsize_max = 1 << 30;
 	int hashsize;
 
 	hashsize = 1;
 	/* Stop at the cap so the doubling itself can never overflow. */
 	while (hashsize <= size && hashsize < hashsize_max)
 		hashsize <<= 1;
 	return (hashsize);
 }
 
 /*
  * Next identifier to hand to a TCP function block.  It is advanced with
  * atomic_fetchadd_int() in register_tcp_functions_as_names(), so the
  * first block registered receives id 1.
  */
 static volatile int next_tcp_stack_id = 1;
 
 /*
  * Register a TCP function block under every name in the names array.
  *
  * blk->tfb_tcp_block_name is NOT registered implicitly; callers that want
  * the stack reachable under that name must list it in names themselves.
  *
  * Registration is all-or-nothing.  When a name cannot be registered, the
  * names added so far are removed again and *num_names is set to the array
  * index of the name that failed.  When the block itself is rejected (a
  * mandatory method or the block name is missing, or the block is flagged
  * as being removed) *num_names is set to 0.
  *
  * Returns 0 on success, or EINVAL, ENOMEM or EALREADY on failure.
  */
 int
 register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
     const char *names[], int *num_names)
 {
 	struct tcp_function_set fs;
 	struct tcp_function *entry;
 	int error, idx;
 
 	KASSERT(names != NULL && *num_names > 0,
 	    ("%s: Called with 0-length name list", __func__));
 	KASSERT(names != NULL, ("%s: Called with NULL name list", __func__));
 	KASSERT(rw_initialized(&tcp_function_lock),
 	    ("%s: called too early", __func__));
 
 	/*
 	 * The output, do_segment and ctloutput methods plus a non-empty
 	 * block name are mandatory, and a block that is being removed
 	 * cannot be registered.
 	 */
 	if (blk->tfb_tcp_output == NULL ||
 	    blk->tfb_tcp_do_segment == NULL ||
 	    blk->tfb_tcp_ctloutput == NULL ||
 	    strlen(blk->tfb_tcp_block_name) == 0 ||
 	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) != 0) {
 		*num_names = 0;
 		return (EINVAL);
 	}
 
 	refcount_init(&blk->tfb_refcnt, 0);
 	blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
 
 	error = 0;
 	for (idx = 0; idx < *num_names; idx++) {
 		entry = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS,
 		    wait);
 		if (entry == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		entry->tf_fb = blk;
 
 		/* Build the lookup key for the duplicate check. */
 		(void)strlcpy(fs.function_set_name, names[idx],
 		    sizeof(fs.function_set_name));
 
 		rw_wlock(&tcp_function_lock);
 		if (find_tcp_functions_locked(&fs) != NULL) {
 			/* Names must be unique across all blocks. */
 			rw_wunlock(&tcp_function_lock);
 			free(entry, M_TCPFUNCTIONS);
 			error = EALREADY;
 			break;
 		}
 		(void)strlcpy(entry->tf_name, names[idx],
 		    sizeof(entry->tf_name));
 		TAILQ_INSERT_TAIL(&t_functions, entry, tf_next);
 		tcp_fb_cnt++;
 		rw_wunlock(&tcp_function_lock);
 	}
 	if (error == 0)
 		return (0);
 
 	/*
 	 * Roll back.  names[idx] itself was never inserted, so only the
 	 * entries before it have to be taken off the list again.
 	 */
 	*num_names = idx;
 	rw_wlock(&tcp_function_lock);
 	for (idx--; idx >= 0; idx--) {
 		TAILQ_FOREACH(entry, &t_functions, tf_next) {
 			if (strncmp(entry->tf_name, names[idx],
 			    TCP_FUNCTION_NAME_LEN_MAX) != 0)
 				continue;
 			TAILQ_REMOVE(&t_functions, entry, tf_next);
 			tcp_fb_cnt--;
 			entry->tf_fb = NULL;
 			free(entry, M_TCPFUNCTIONS);
 			break;
 		}
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 /*
  * Register a TCP function block under a single name.
  *
  * When name is NULL the block's own tfb_tcp_block_name is used instead.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name,
     int wait)
 {
 	const char *single[1];
 	int count = 1;
 
 	single[0] = (name != NULL) ? name : blk->tfb_tcp_block_name;
 	return (register_tcp_functions_as_names(blk, wait, single, &count));
 }
 
 /*
  * Register a TCP function block using the name defined in
  * blk->tfb_tcp_block_name.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions(struct tcp_function_block *blk, int wait)
 {
 
 	return (register_tcp_functions_as_name(blk, NULL, wait));
 }
 
 /*
  * Deregister all names associated with a function block. This
  * functionally removes the function block from use within the system.
  *
  * When called with a true quiesce argument, mark the function block
  * as being removed so no more stacks will use it and determine
  * whether the removal would succeed.
  *
  * When called with a false quiesce argument, actually attempt the
  * removal.
  *
  * When called with a force argument, attempt to switch all TCBs to
  * use the default stack instead of returning EBUSY.
  *
  * Note that TCP_FUNC_BEING_REMOVED is set on the block before any of the
  * busy checks and is not cleared again on the EBUSY or quiesce returns.
  *
  * Returns 0 on success (or if the removal would succeed), or an error
  * code on failure: EPERM for the built-in default block, EBUSY when the
  * block is the current default or still has TCBs attached.
  */
 int
 deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force)
 {
 	struct tcp_function *f;
 
 	if (blk == &tcp_def_funcblk) {
 		/* You can't un-register the default */
 		return (EPERM);
 	}
 	rw_wlock(&tcp_function_lock);
 	if (blk == tcp_func_set_ptr) {
 		/* You can't free the current default */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	/* Mark the block so no more stacks can use it. */
 	blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
 	/*
 	 * If TCBs are still attached to the stack, attempt to switch them
 	 * to the default stack.
 	 */
 	if (force && blk->tfb_refcnt) {
 		struct inpcb *inp;
 		struct tcpcb *tp;
 		VNET_ITERATOR_DECL(vnet_iter);
 
 		/* The function lock is not held across the inpcb walk. */
 		rw_wunlock(&tcp_function_lock);
 
 		VNET_LIST_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 			/*
 			 * The iterator captures the address of V_tcbinfo
 			 * when it is initialised, so it has to be set up
 			 * afresh for each vnet and only after
 			 * CURVNET_SET(), the same way tcp_drain() does it.
 			 */
 			struct inpcb_iterator inpi =
 			    INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB);
 
 			while ((inp = inp_next(&inpi)) != NULL) {
 				tp = intotcpcb(inp);
 				if (tp == NULL || tp->t_fb != blk)
 					continue;
 				tcp_switch_back_to_default(tp);
 			}
 			CURVNET_RESTORE();
 		}
 		VNET_LIST_RUNLOCK();
 
 		rw_wlock(&tcp_function_lock);
 	}
 	if (blk->tfb_refcnt) {
 		/* TCBs still attached. */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	if (quiesce) {
 		/* Skip removal. */
 		rw_wunlock(&tcp_function_lock);
 		return (0);
 	}
 	/* Remove any function names that map to this function block. */
 	while (find_tcp_fb_locked(blk, &f) != NULL) {
 		TAILQ_REMOVE(&t_functions, f, tf_next);
 		tcp_fb_cnt--;
 		f->tf_fb = NULL;
 		free(f, M_TCPFUNCTIONS);
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (0);
 }
 
 /*
  * Low-memory handler: release per-connection memory that TCP can do
  * without.  tcp_init() hooks it to the vm_lowmem and mbuf_lowmem events.
  * It does nothing unless do_tcpdrain is set.
  */
 static void
 tcp_drain(void)
 {
 	struct epoch_tracker et;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 		    INPLOOKUP_WLOCKPCB);
 		struct inpcb *inp;
 		struct tcpcb *tp;
 
 		/*
 		 * Walk the tcpbs, if existing, and flush the reassembly
 		 * queue, if there is one...
 		 * XXX: The "Net/3" implementation doesn't imply that the
 		 *      TCP reassembly queue should be flushed, but in a
 		 *      situation where we're really low on mbufs, this is
 		 *      potentially useful.
 		 */
 		while ((inp = inp_next(&inpi)) != NULL) {
 			tp = intotcpcb(inp);
 			if (tp == NULL)
 				continue;
 
 			tcp_reass_flush(tp);
 			tcp_clean_sackreport(tp);
 #ifdef TCP_BLACKBOX
 			tcp_log_drain(tp);
 #endif
 #ifdef TCPPCAP
 			if (tcp_pcap_aggressive_free) {
 				/* Free the TCP PCAP queues. */
 				tcp_pcap_drain(&(tp->t_inpkts));
 				tcp_pcap_drain(&(tp->t_outpkts));
 			}
 #endif
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Per-vnet TCP initialisation, hooked up through VNET_SYSINIT at
  * SI_SUB_PROTO_DOMAIN / SI_ORDER_FOURTH.  tcp_init() is registered in the
  * same subsystem at SI_ORDER_THIRD and computes tcp_tcbhashsize, which is
  * consumed here.
  *
  * Failures to register helper hooks or to initialise TCP stats are only
  * reported with printf(); initialisation carries on regardless.
  */
 static void
 tcp_vnet_init(void *arg __unused)
 {
 
 #ifdef TCP_HHOOK
 	/* Helper hook points for established-state input and output. */
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
 	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 #endif
 #ifdef STATS
 	if (tcp_stats_init())
 		printf("%s: WARNING: unable to initialise TCP stats\n",
 		    __func__);
 #endif
 	/* tcp_tcbhashsize is passed for both size arguments. */
 	in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
 	    tcp_tcbhashsize);
 
 	syncache_init();
 	tcp_hc_init();
 
 	/* A loader tunable may override the SACK default for this vnet. */
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	tcp_fastopen_init();
 
 	/* Per-state connection counters and the per-CPU tcpstat block. */
 	COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
 	VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
 
 	V_tcp_msl = TCPTV_MSL;
 }
 VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     tcp_vnet_init, NULL);
 
 /*
  * One-time, global (not per-vnet) TCP initialisation, run through SYSINIT
  * at SI_SUB_PROTO_DOMAIN / SI_ORDER_THIRD.
  *
  * It loads the timer defaults, sets up the function block list and
  * registers the default stack, registers the shutdown and low-memory
  * event handlers, allocates the LRO counters, settles the PCB hash size
  * in tcp_tcbhashsize and finally registers TCP with IPv4 and/or IPv6.
  */
 static void
 tcp_init(void *arg __unused)
 {
 	const char *tcbhash_tuneable;
 	int hashsize;
 
 	tcp_reass_global_init();
 
 	/* XXX virtualize those below? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	/* The two retransmit values are clamped to at least 1. */
 	tcp_rexmit_initial = TCPTV_RTOBASE;
 	if (tcp_rexmit_initial < 1)
 		tcp_rexmit_initial = 1;
 	tcp_rexmit_min = TCPTV_MIN;
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_persmin = TCPTV_PERSMIN;
 	tcp_persmax = TCPTV_PERSMAX;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
 	/* Setup the tcp function block list */
 	TAILQ_INIT(&t_functions);
 	rw_init(&tcp_function_lock, "tcp_func_lock");
 	/* The return value of registering the default stack is not checked. */
 	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
 	sx_init(&tcpoudp_lock, "TCP over UDP configuration");
 #ifdef TCP_BLACKBOX
 	/* Initialize the TCP logging data. */
 	tcp_log_init();
 #endif
 	/* Random secret stored in V_ts_offset_secret. */
 	arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);
 
 	/* Optionally switch the protocol's receive method to soreceive_stream. */
 	if (tcp_soreceive_stream) {
 #ifdef INET
 		tcp_protosw.pr_soreceive = soreceive_stream;
 #endif
 #ifdef INET6
 		tcp6_protosw.pr_soreceive = soreceive_stream;
 #endif /* INET6 */
 	}
 
 	/* Report the TCP/IP header size via max_protohdr_grow(). */
 #ifdef INET6
 	max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
 #else /* INET6 */
 	max_protohdr_grow(sizeof(struct tcpiphdr));
 #endif /* INET6 */
 
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	/* tcp_drain() serves both low-memory events. */
 	EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
 
 	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK);
 	tcp_extra_mbuf = counter_u64_alloc(M_WAITOK);
 	tcp_would_have_but = counter_u64_alloc(M_WAITOK);
 	tcp_comp_total = counter_u64_alloc(M_WAITOK);
 	tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
 	tcp_bad_csums = counter_u64_alloc(M_WAITOK);
 #ifdef TCPPCAP
 	tcp_pcap_init();
 #endif
 
 	/*
 	 * Start from the compile-time TCBHASHSIZE and let the
 	 * net.inet.tcp.tcbhashsize tunable override it; 0 means auto-tune.
 	 */
 	hashsize = TCBHASHSIZE;
 	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
 		/*
 		 * Auto tune the hash size based on maxsockets.
 		 * A perfect hash would have a 1:1 mapping
 		 * (hashsize = maxsockets) however it's been
 		 * suggested that O(2) average is better.
 		 */
 		hashsize = maketcp_hashsize(maxsockets / 4);
 		/*
 		 * Our historical default is 512,
 		 * do not autotune lower than this.
 		 */
 		if (hashsize < 512)
 			hashsize = 512;
 		if (bootverbose)
 			printf("%s: %s auto tuned to %d\n", __func__,
 			    tcbhash_tuneable, hashsize);
 	}
 	/*
 	 * We require a hashsize to be a power of two.
 	 * Previously if it was not a power of two we would just reset it
 	 * back to 512, which could be a nasty surprise if you did not notice
 	 * the error message.
 	 * Instead the value is converted to a power of two with
 	 * maketcp_hashsize() and never left below 16.
 	 * NOTE(review): the message below says "clipped" and this comment
 	 * used to say "lower than the specified hash value", but
 	 * maketcp_hashsize() rounds *up* unless that would overflow --
 	 * confirm which direction is intended.
 	 */
 	if (!powerof2(hashsize)) {
 		int oldhashsize = hashsize;
 
 		hashsize = maketcp_hashsize(hashsize);
 		/* prevent absurdly low value */
 		if (hashsize < 16)
 			hashsize = 16;
 		printf("%s: WARNING: TCB hash size not a power of 2, "
 		    "clipped from %d to %d.\n", __func__, oldhashsize,
 		    hashsize);
 	}
 	/* Published for tcp_vnet_init(), which sizes V_tcbinfo with it. */
 	tcp_tcbhashsize = hashsize;
 
 	/* Register TCP's input and control-input handlers with IP. */
 #ifdef INET
 	IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput);
 #endif
 #ifdef INET6
 	IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput);
 #endif
 }
 SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL);
 
 #ifdef VIMAGE
 /*
  * Per-vnet TCP teardown, registered through VNET_SYSUNINIT and built only
  * with VIMAGE.  It waits until the vnet has no TCP inpcbs left and then
  * releases what tcp_vnet_init() set up: host cache, syncache, pcbinfo,
  * SACK hole zone, fastopen state, counters and helper hook heads.
  */
 static void
 tcp_destroy(void *unused __unused)
 {
 	int n;
 #ifdef TCP_HHOOK
 	int error;
 #endif
 
 	/*
 	 * All our processes are gone, all our sockets should be cleaned
 	 * up, which means, we should be past the tcp_discardcb() calls.
 	 * Sleep to let all tcpcb timers really disappear and cleanup.
 	 */
 	for (;;) {
 		/* Sample the inpcb count under the pcbinfo write lock. */
 		INP_INFO_WLOCK(&V_tcbinfo);
 		n = V_tcbinfo.ipi_count;
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		if (n == 0)
 			break;
 		/* Poll again after hz / 10 ticks. */
 		pause("tcpdes", hz / 10);
 	}
 	tcp_hc_destroy();
 	syncache_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	/* tcp_discardcb() clears the sack_holes up. */
 	uma_zdestroy(V_sack_hole_zone);
 
 	/*
 	 * Cannot free the zone until all tcpcbs are released as we attach
 	 * the allocations to them.
 	 */
 	tcp_fastopen_destroy();
 
 	COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
 	VNET_PCPUSTAT_FREE(tcpstat);
 
 #ifdef TCP_HHOOK
 	/* A failed deregistration is only reported, not acted upon. */
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
 	}
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
 	}
 #endif
 }
 VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL);
 #endif
 
 /*
  * shutdown_pre_sync event handler registered by tcp_init().  It is
  * intentionally empty at present; the xtp argument is not used.
  */
 void
 tcp_fini(void *xtp)
 {
 
 }
 
 /*
  * Write skeleton IP and TCP headers for an outgoing packet from the
  * addressing information kept in the inpcb.  The headers are rebuilt from
  * the pcb each time rather than kept in a template mbuf, to conserve
  * mbufs.
  *
  * ip_ptr receives an IPv6 header when the inpcb has INP_IPV6 set and an
  * IPv4 header otherwise; tcp_ptr receives the TCP header.  A non-zero
  * port makes the IP next-protocol field IPPROTO_UDP instead of
  * IPPROTO_TCP.  The caller must hold the inpcb write lock.
  */
 void
 tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *tcph;
 
 	INP_WLOCK_ASSERT(inp);
 
 	tcph = (struct tcphdr *)tcp_ptr;
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6h = (struct ip6_hdr *)ip_ptr;
 
 		/*
 		 * Flow info is merged in first and the version bits are
 		 * patched afterwards.
 		 */
 		ip6h->ip6_flow = (ip6h->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 		    (inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6h->ip6_vfc = (ip6h->ip6_vfc & ~IPV6_VERSION_MASK) |
 		    (IPV6_VERSION & IPV6_VERSION_MASK);
 		ip6h->ip6_nxt = (port == 0) ? IPPROTO_TCP : IPPROTO_UDP;
 		ip6h->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6h->ip6_src = inp->in6p_laddr;
 		ip6h->ip6_dst = inp->in6p_faddr;
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		struct ip *ip4h = (struct ip *)ip_ptr;
 
 		ip4h->ip_v = IPVERSION;
 		ip4h->ip_hl = 5;
 		ip4h->ip_tos = inp->inp_ip_tos;
 		ip4h->ip_len = 0;
 		ip4h->ip_id = 0;
 		ip4h->ip_off = 0;
 		ip4h->ip_ttl = inp->inp_ip_ttl;
 		ip4h->ip_sum = 0;
 		ip4h->ip_p = (port == 0) ? IPPROTO_TCP : IPPROTO_UDP;
 		ip4h->ip_src = inp->inp_laddr;
 		ip4h->ip_dst = inp->inp_faddr;
 	}
 #endif /* INET */
 	/* Ports come from the pcb; everything else starts out cleared. */
 	tcph->th_sport = inp->inp_lport;
 	tcph->th_dport = inp->inp_fport;
 	tcph->th_seq = 0;
 	tcph->th_ack = 0;
 	tcph->th_off = 5;
 	tcp_set_flags(tcph, 0);
 	tcph->th_win = 0;
 	tcph->th_urp = 0;
 	tcph->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Create the template used to send TCP packets on a connection.
  *
  * A struct tcptemp is allocated from M_TEMP without sleeping and filled
  * with skeletal TCP/IP headers (no UDP encapsulation port).  The only use
  * for this function is in keepalives, which use tcp_respond.
  *
  * Returns the template, or NULL when the allocation fails.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *tmpl;
 
 	tmpl = malloc(sizeof(*tmpl), M_TEMP, M_NOWAIT);
 	if (tmpl != NULL)
 		tcpip_fillheaders(inp, 0, (void *)&tmpl->tt_ipgen,
 		    (void *)&tmpl->tt_t);
 	return (tmpl);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at th and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment th,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then th must point to *inside* the mbuf.
  *
  * Parameters:
  *   tp     connection, or NULL; tp and m must not both be NULL.
  *   ipgen  IPv4 or IPv6 header; the version field selects the family.
  *   th     TCP header to copy from or to turn around in place.
  *   m      received mbuf chain to consume, or NULL to allocate one.
  *   ack    acknowledgement number to send (host byte order).
  *   seq    sequence number to send (host byte order).
  *   flags  TCP flags to send; replaced by TH_ACK when m == NULL.
  *
  * The function returns nothing.  On an allocation failure or a TCP-MD5
  * failure the segment is silently dropped, and a caller-supplied m is
  * freed in those cases as well.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, uint16_t flags)
 {
 	struct tcpopt to;
 	struct inpcb *inp;
 	struct ip *ip;
 	struct mbuf *optm;
 	struct udphdr *uh = NULL;
 	struct tcphdr *nth;
 	struct tcp_log_buffer *lgb;
 	u_char *optp;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int optlen, tlen, win, ulen;
 	int ect = 0;
 	bool incl_opts;
 	uint16_t port;
 	int output_ret;
 #ifdef INVARIANTS
 	/* Flags of the original segment, only consulted by a KASSERT below. */
 	int thflags = tcp_get_flags(th);
 #endif
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 	NET_EPOCH_ASSERT();
 
 #ifdef INET6
 	/* The version nibble of the supplied header picks the family. */
 	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tptoinpcb(tp);
 		INP_LOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	/*
 	 * Work out the UDP encapsulation port.  With a received mbuf it is
 	 * taken from the packet header when the IP header says the packet
 	 * arrived over UDP; without one it comes from the connection (tp is
 	 * non-NULL here per the KASSERT above).  Zero means plain TCP.
 	 */
 	if (m != NULL) {
 #ifdef INET6
 		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
 			port = m->m_pkthdr.tcp_tun_port;
 		else
 #endif
 		if (ip && (ip->ip_p == IPPROTO_UDP))
 			port = m->m_pkthdr.tcp_tun_port;
 		else
 			port = 0;
 	} else
 		port = tp->t_port;
 
 	/*
 	 * The advertised window stays 0 for RSTs and when there is no
 	 * connection.  Options are only considered for a connection that
 	 * does not have TF_NOOPT set.
 	 */
 	incl_opts = false;
 	win = 0;
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > TCP_MAXWIN << tp->rcv_scale)
 				win = TCP_MAXWIN << tp->rcv_scale;
 		}
 		if ((tp->t_flags & TF_NOOPT) == 0)
 			incl_opts = true;
 	}
 	if (m == NULL) {
 		/*
 		 * Case 1: no received mbuf.  Allocate one and copy the
 		 * supplied headers in as they are (no address swap).
 		 */
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 			ip = mtod(m, struct ip *);
 			nth = (struct tcphdr *)(ip + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		/* Whatever flags the caller passed are replaced here. */
 		flags = TH_ACK;
 	} else if ((!M_WRITABLE(m)) || (port != 0)) {
 		/*
 		 * Case 2: the received mbuf is read-only or a UDP header
 		 * has to be inserted.  Build the reply in a fresh mbuf with
 		 * addresses and ports swapped, then free the original.
 		 */
 		struct mbuf *n;
 
 		/* Can't reuse 'm', allocate a new mbuf. */
 		n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return;
 		}
 
 		if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
 			m_freem(m);
 			m_freem(n);
 			return;
 		}
 
 		n->m_data += max_linkhdr;
 		/* m_len is set later */
 		/*
 		 * xchg() stays defined through the next branch as well and
 		 * is #undef'ed at the end of that branch.
 		 */
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(n, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(n, struct ip6_hdr *);
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
 			ip = mtod(n, struct ip *);
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 		/* th pointed into m, which is about to be freed. */
 		th = nth;
 		m_freem(m);
 		m = n;
 	} else {
 		/*
 		 * Case 3: writable mbuf and port == 0 (guaranteed by the
 		 * condition of the previous branch), so no UDP header is
 		 * involved and uh stays NULL.
 		 */
 		/*
 		 *  reuse the mbuf.
 		 * XXX MRT We inherit the FIB, which is lucky.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 #undef xchg
 	}
 	/* Length of the fixed headers: IP + TCP, plus UDP when tunnelled. */
 	tlen = 0;
 #ifdef INET6
 	if (isipv6)
 		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		tlen = sizeof (struct tcpiphdr);
 #endif
 	if (port)
 		tlen += sizeof (struct udphdr);
 #ifdef INVARIANTS
 	/* m_len is zeroed so M_TRAILINGSPACE() reports the whole buffer. */
 	m->m_len = 0;
 	KASSERT(M_TRAILINGSPACE(m) >= tlen,
 	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
 	    m, tlen, (long)M_TRAILINGSPACE(m)));
 #endif
 	m->m_len = tlen;
 	to.to_flags = 0;
 	if (incl_opts) {
 		ect = tcp_ecn_output_established(tp, &flags, 0, false);
 		/* Make sure we have room. */
 		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
 			/*
 			 * Options go into a second mbuf; if that cannot be
 			 * allocated the segment is sent without options.
 			 */
 			m->m_next = m_get(M_NOWAIT, MT_DATA);
 			if (m->m_next) {
 				optp = mtod(m->m_next, u_char *);
 				optm = m->m_next;
 			} else
 				incl_opts = false;
 		} else {
 			optp = (u_char *) (nth + 1);
 			optm = m;
 		}
 	}
 	if (incl_opts) {
 		/* Timestamps. */
 		if (tp->t_flags & TF_RCVD_TSTMP) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif
 		/* Add the options. */
 		tlen += optlen = tcp_addoptions(&to, optp);
 
 		/* Update m_len in the correct mbuf. */
 		optm->m_len += optlen;
 	} else
 		optlen = 0;
 	/*
 	 * Finish the IP (and UDP) header.  ulen is assigned only when a UDP
 	 * header was inserted (uh != NULL); the checksum code further down
 	 * reads it only when port != 0, and every branch above that runs
 	 * with a non-zero port also sets uh.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		if (uh) {
 			ulen = tlen - sizeof(struct ip6_hdr);
 			uh->uh_ulen = htons(ulen);
 		}
 		ip6->ip6_flow = htonl(ect << 20);
 		ip6->ip6_vfc = IPV6_VERSION;
 		if (port)
 			ip6->ip6_nxt = IPPROTO_UDP;
 		else
 			ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		if (uh) {
 			ulen = tlen - sizeof(struct ip);
 			uh->uh_ulen = htons(ulen);
 		}
 		ip->ip_tos = ect;
 		ip->ip_len = htons(tlen);
 		ip->ip_ttl = V_ip_defttl;
 		if (port) {
 			ip->ip_p = IPPROTO_UDP;
 		} else {
 			ip->ip_p = IPPROTO_TCP;
 		}
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 	}
 #endif
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_LOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	/* Fill in the TCP header proper. */
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	tcp_set_flags(nth, flags);
 	/* The window is scaled only when there is a connection. */
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	/* A segment that should be signed but cannot be is dropped. */
 	if (to.to_flags & TOF_SIGNATURE) {
 		if (!TCPMD5_ENABLED() ||
 		    TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) {
 			m_freem(m);
 			return;
 		}
 	}
 #endif
 
 	/*
 	 * Checksums: store the pseudo-header sum and mark the mbuf with
 	 * csum_flags/csum_data.  With UDP encapsulation the UDP checksum is
 	 * set up and th_sum is left at 0.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		if (port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			nth->th_sum = 0;
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			nth->th_sum = in6_cksum_pseudo(ip6,
 			    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
 		}
 		ip6->ip6_hlim = in6_selecthlim(inp, NULL);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (port) {
 			uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons(ulen + IPPROTO_UDP));
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			nth->th_sum = 0;
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 		}
 	}
 #endif /* INET */
 	TCP_PROBE3(debug__output, tp, th, m);
 	if (flags & TH_RST)
 		TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
 	/* Log the segment only when we hold the inpcb write lock. */
 	lgb = NULL;
 	if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		if (INP_WLOCKED(inp)) {
 			union tcp_log_stackspecific log;
 			struct timeval tv;
 
 			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 			log.u_bbr.inhpts = inp->inp_in_hpts;
 			log.u_bbr.flex8 = 4;
 			log.u_bbr.pkts_out = tp->t_maxseg;
 			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 			log.u_bbr.delivered = 0;
 			lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT,
 			    ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv);
 		} else {
 			/*
 			 * We can not log the packet, since we only own the
 			 * read lock, but a write lock is needed. The read lock
 			 * is not upgraded to a write lock, since only getting
 			 * the read lock was done intentionally to improve the
 			 * handling of SYN flooding attacks.
 			 * This happens only for pure SYN segments received in
 			 * the initial CLOSED state, or received in a more
 			 * advanced state than listen and the UDP encapsulation
 			 * port is unexpected.
 			 * The incoming SYN segments do not really belong to
 			 * the TCP connection and the handling does not change
 			 * the state of the TCP connection. Therefore, the
 			 * sending of the RST segments is not logged. Please
 			 * note that also the incoming SYN segments are not
 			 * logged.
 			 *
 			 * The following code ensures that the above description
 			 * is and stays correct.
 			 */
 			KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN &&
 			    (tp->t_state == TCPS_CLOSED ||
 			    (tp->t_state > TCPS_LISTEN && tp->t_port != port)),
 			    ("%s: Logging of TCP segment with flags 0x%b and "
 			    "UDP encapsulation port %u skipped in state %s",
 			    __func__, thflags, PRINT_TH_FLAGS,
 			    ntohs(port), tcpstates[tp->t_state]));
 		}
 	}
 
 	/* Statistics: ACKs take precedence over the control-flag bucket. */
 	if (flags & TH_ACK)
 		TCPSTAT_INC(tcps_sndacks);
 	else if (flags & (TH_SYN|TH_FIN|TH_RST))
 		TCPSTAT_INC(tcps_sndctrl);
 	TCPSTAT_INC(tcps_sndtotal);
 
 	/*
 	 * Hand the packet to the IP layer.
 	 * NOTE(review): output_ret is assigned only inside the INET/INET6
 	 * sections; this presumably relies on at least one of them always
 	 * being configured -- confirm.
 	 */
 #ifdef INET6
 	if (isipv6) {
 		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
 		output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
 		output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
 	}
 #endif
 	/* Record the output result in the log entry created above, if any. */
 	if (lgb != NULL)
 		lgb->tlb_errno = output_ret;
 }
 
 /*
  * Create a new TCP control block, making an empty reassembly queue and hooking
  * it to the argument protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up by tcpcbstor declaration.
  *
  * The tcpcb is obtained from the inpcb with intotcpcb(); nothing is
  * allocated for it here.  It is attached to the current default function
  * block and to the system default congestion control algorithm.
  *
  * Returns the initialised tcpcb, or NULL when the congestion control
  * cb_init hook, the khelp OSD setup (TCP_HHOOK only) or the stack's
  * tfb_tcp_fb_init hook fails.
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	/*
 	 * Historically allocation was done with M_ZERO.  There is a lot of
 	 * code that rely on that.  For now take safe approach and zero whole
 	 * tcpcb.  This definitely can be optimized.
 	 */
 	bzero(&tp->t_start_zero, t_zero_size);
 
 	/* Initialise cc_var struct for this tcpcb. */
 	tp->t_ccv.type = IPPROTO_TCP;
 	tp->t_ccv.ccvc.tcp = tp;
 	/*
 	 * Attach to the current default function block and take a reference
 	 * on it while holding the function lock for reading.
 	 */
 	rw_rlock(&tcp_function_lock);
 	tp->t_fb = tcp_func_set_ptr;
 	refcount_acquire(&tp->t_fb->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	/*
 	 * Use the current system default CC algorithm.
 	 */
 	cc_attach(tp, CC_DEFAULT_ALGO());
 
 	/* On cb_init failure undo the CC attach and drop the stack reference. */
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) {
 			cc_detach(tp);
 			if (tp->t_fb->tfb_tcp_fb_fini)
 				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			return (NULL);
 		}
 
 #ifdef TCP_HHOOK
 	/*
 	 * NOTE(review): unlike the cb_init failure path above, this path
 	 * does not call cc_detach() -- confirm the congestion control
 	 * attachment is not leaked here.
 	 */
 	if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) {
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		return (NULL);
 	}
 #endif
 
 	TAILQ_INIT(&tp->t_segq);
 	/* Default MSS depends on the address family. */
 	tp->t_maxseg =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* The timer callout is tied to the inpcb lock. */
 	callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED);
 	/* Every timer slot starts at SBT_MAX. */
 	for (int i = 0; i < TT_N; i++)
 		tp->t_timers[i] = SBT_MAX;
 
 	/*
 	 * V_tcp_do_rfc1323 selects which of the window scale and timestamp
 	 * options are requested: 0 neither, 2 scaling only, 3 timestamps
 	 * only, 1 and any other value both.
 	 */
 	switch (V_tcp_do_rfc1323) {
 		case 0:
 			break;
 		default:
 		case 1:
 			tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 			break;
 		case 2:
 			tp->t_flags = TF_REQ_SCALE;
 			break;
 		case 3:
 			tp->t_flags = TF_REQ_TSTMP;
 			break;
 	}
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = tcp_rexmit_initial;
 	/* cwnd and ssthresh both start at TCP_MAXWIN << TCP_MAX_WINSHIFT. */
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 #ifdef TCPHPTS
 	/*
 	 * If using hpts lets drop a random number in so
 	 * not all new connections fall on the same CPU.
 	 */
 	inp->inp_hpts_cpu = hpts_random_cpu(inp);
 #endif
 #ifdef TCPPCAP
 	/*
 	 * Init the TCP PCAP queues.
 	 */
 	tcp_pcap_tcpcb_init(tp);
 #endif
 #ifdef TCP_BLACKBOX
 	/* Initialize the per-TCPCB log data. */
 	tcp_log_tcpcbinit(tp);
 #endif
 	tp->t_pacing_rate = -1;
 	/*
 	 * Give the stack its chance to initialise.
 	 * NOTE(review): on failure only the function block reference is
 	 * dropped; the CC attachment and (with TCP_HHOOK) the khelp OSD set
 	 * up above are not undone here -- confirm the caller's cleanup
 	 * covers them.
 	 */
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			return (NULL);
 		}
 	}
 #ifdef STATS
 	if (V_tcp_perconn_stats_enable == 1)
 		tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
 #endif
 	if (V_tcp_do_lrd)
 		tp->t_flags |= TF_LRD;
 
 	return (tp);
 }
 
 /*
  * Abort a TCP connection and hand the given error to the socket.
  *
  * If a SYN has already been received from the peer, the connection is
  * moved to CLOSED and one output pass is made so that a RST goes out;
  * otherwise only the "conndrops" counter is bumped.  The result of
  * tcp_close() is returned unchanged to the caller.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
 	so = tptosocket(tp);
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
 		/* tcp_output() might recurse back here; use the nodrop flavour. */
 		(void)tcp_output_nodrop(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else {
 		TCPSTAT_INC(tcps_conndrops);
 	}
 
 	/* A timeout is reported as the recorded soft error, when there is one. */
 	if (errno == ETIMEDOUT && tp->t_softerror != 0)
 		errno = tp->t_softerror;
 	so->so_error = errno;
 
 	return (tcp_close(tp));
 }
 
 /*
  * Tear down the TCP-specific state hanging off a control block.
  *
  * In order: stop the timers, flush the reassembly queue, detach any
  * offload device, free the SACK holes and pcap queues, let the congestion
  * control module clean up and detach from it, destroy helper/stats data,
  * run the stack's fini hook, optionally save RTT/ssthresh/cwnd into the
  * host cache, and finally drop the reference held on the function block.
  *
  * The caller must hold the inpcb write lock (asserted below).
  */
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct socket *so = tptosocket(tp);
 #ifdef INET6
 	bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Stop the generic timers, then any timers the stack itself owns. */
 	tcp_timer_stop(tp);
 	if (tp->t_fb->tfb_tcp_timer_stop_all) {
 		tp->t_fb->tfb_tcp_timer_stop_all(tp);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 
 #ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_detach(tp);
 #endif
 
 	tcp_free_sackholes(tp);
 
 #ifdef TCPPCAP
 	/* Free the TCP PCAP queues. */
 	tcp_pcap_drain(&(tp->t_inpkts));
 	tcp_pcap_drain(&(tp->t_outpkts));
 #endif
 
 	/* Allow the CC algorithm to clean up after itself. */
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(&tp->t_ccv);
 	CC_DATA(tp) = NULL;
 	/* Detach from the CC algorithm */
 	cc_detach(tp);
 
 #ifdef TCP_HHOOK
 	khelp_destroy_osd(&tp->t_osd);
 #endif
 #ifdef STATS
 	stats_blob_destroy(tp->t_stats);
 #endif
 
 	/* Cleared only after cc_detach() above has run. */
 	CC_ALGO(tp) = NULL;
 
 #ifdef TCP_BLACKBOX
 	tcp_log_tcpcbfini(tp);
 #endif
 	TCPSTATES_DEC(tp->t_state);
 	/* Stack-specific teardown; must precede the metrics update below. */
 	if (tp->t_fb->tfb_tcp_fb_fini)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 *
 	 * XXXRRS: Updating must be after the stack fini() since
 	 * that may be converting some internal representation of
 	 * say srtt etc into the general one used by other stacks.
 	 * Lets also at least protect against the so being NULL
 	 * as RW stated below.
 	 */
 	if ((tp->t_rttupdated >= 4) && (so != NULL)) {
 		struct hc_metrics_lite metrics;
 		uint32_t ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occurred on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			/* Round to the nearest whole segment, minimum two. */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			/* Scale by segment size plus the per-family header size. */
 			ssthresh *= (tp->t_maxseg +
 #ifdef INET6
 			    (isipv6 ? sizeof (struct ip6_hdr) +
 			    sizeof (struct tcphdr) :
 #endif
 			    sizeof (struct tcpiphdr)
 #ifdef INET6
 			    )
 #endif
 			    );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	/* Drop the reference this tcpcb held on its function block. */
 	refcount_release(&tp->t_fb->tfb_refcnt);
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  *
  * The caller must hold the inpcb write lock.  Returns NULL when the
  * INP_SOCKREF reference was released here; on that path the inpcb has
  * been write-unlocked before sorele() is called.  Otherwise returns tp
  * with the lock still held.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct socket *so = tptosocket(tp);
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
 #endif
 	/*
 	 * This releases the TFO pending counter resource for TFO listen
 	 * sockets as well as passively-created TFO sockets that transition
 	 * from SYN_RECEIVED to CLOSED.
 	 */
 	if (tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 #ifdef TCPHPTS
 	tcp_hpts_remove(inp);
 #endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	/* Skip the state change when the caller already moved us to CLOSED. */
 	if (tp->t_state != TCPS_CLOSED)
 		tcp_state_change(tp, TCPS_CLOSED);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	soisdisconnected(so);
 	if (inp->inp_flags & INP_SOCKREF) {
 		/* Clear the flag and unlock before giving the reference back. */
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		sorele(so);
 		return (NULL);
 	}
 	return (tp);
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_WLOCK_ASSERT(inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		if (inp->inp_route.ro_nh) {
 			NH_FREE(inp->inp_route.ro_nh);
 			inp->inp_route.ro_nh = (struct nhop_object *)NULL;
 		}
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (req->oldptr == NULL) {
 		int n;
 
 		n = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_tcbinfo.ipi_count +
 	    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 	xig.xig_gen = V_tcbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req);
 	if (error)
 		return (error);
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			struct xtcpcb xt;
 
 			tcp_inptoxtp(inp, &xt);
 			error = SYSCTL_OUT(req, &xt, sizeof xt);
 			if (error) {
 				INP_RUNLOCK(inp);
 				break;
 			} else
 				continue;
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, tcp_pcblist, "S,xtcpcb",
     "List of active TCP connections");
 
 #ifdef INET
 /*
  * Sysctl handler returning the credentials (xucred) of the IPv4 TCP
  * connection identified by the two sockaddr_in structures passed in.
  * addrs[1] supplies the first address/port pair of the lookup and
  * addrs[0] the second.  Requires PRIV_NETINET_GETCRED.
  */
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error != 0)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error != 0)
 		return (error);
 
 	/* The lookup hands back a read-locked inpcb on success. */
 	NET_EPOCH_ENTER(et);
 	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp == NULL)
 		return (ENOENT);
 
 	/* Copy the credentials out only if the caller may see this pcb. */
 	error = cr_canseeinpcb(req->td->td_ucred, inp);
 	if (error == 0)
 		cru2x(inp->inp_cred, &xuc);
 	INP_RUNLOCK(inp);
 	if (error != 0)
 		return (error);
 
 	return (SYSCTL_OUT(req, &xuc, sizeof(struct xucred)));
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
     0, 0, tcp_getcred, "S,xucred",
     "Get the xucred of a TCP connection");
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker et;
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 #ifdef INET
 	int mapped = 0;
 #endif
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 #endif
 			return (EINVAL);
 	}
 
 	NET_EPOCH_ENTER(et);
 #ifdef INET
 	if (mapped == 1)
 		inp = in_pcblookup(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
 	else
 #endif
 		inp = in6_pcblookup(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port,
 			INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
     0, 0, tcp6_getcred, "S,xucred",
     "Get the xucred of a TCP6 connection");
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Pick the path MTU to try after a fragmentation-needed ICMP message.
  * The MTU offered in the message is used when present; otherwise the next
  * smaller value is derived from the length of the offending datagram.
  * The result is never below V_tcp_minmss plus a TCP/IP header.
  */
 static inline int
 tcp_next_pmtu(const struct icmp *icp, const struct ip *ip)
 {
 	int mtu;
 
 	mtu = ntohs(icp->icmp_nextmtu);
 	if (mtu == 0) {
 		/* No alternative MTU was proposed: step down from ip_len. */
 		mtu = ip_next_mtu(ntohs(ip->ip_len), 1);
 	}
 	if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) {
 		/* Clamp to the smallest MTU we are willing to use. */
 		mtu = V_tcp_minmss + sizeof(struct tcpiphdr);
 	}
 
 	return (mtu);
 }
 
 /*
  * Process an ICMP error that refers to an IPv4 TCP segment.
  *
  * icp  - the ICMP message; the offending IP header is taken from icmp_ip.
  * port - compared against tp->t_port; connections whose t_port differs
  *        are ignored.  Also forwarded to syncache_unreach().
  *
  * The ICMP message is mapped to an errno and a matching notify routine is
  * selected.  If a connection is found it is looked up write-locked, and
  * the error is only acted upon when the quoted sequence number falls in
  * [snd_una, snd_max).  If no connection is found the syncache is told
  * instead.
  */
 static void
 tcp_ctlinput_with_port(struct icmp *icp, uint16_t port)
 {
 	struct ip *ip;
 	struct tcphdr *th;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int);
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int errno, mtu;
 
 	/* Translate the ICMP type/code; 0 means there is nothing to do. */
 	errno = icmp_errmap(icp);
 	switch (errno) {
 	case 0:
 		return;
 	case EMSGSIZE:
 		notify = tcp_mtudisc_notify;
 		break;
 	case ECONNREFUSED:
 		if (V_icmp_may_rst)
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	case EHOSTUNREACH:
 		if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED)
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	default:
 		notify = tcp_notify;
 	}
 
 	/* Locate the quoted TCP header right behind the quoted IP header. */
 	ip = &icp->icmp_ip;
 	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 	/* Kept as found in the packet; converted with ntohl() where compared. */
 	icmp_tcp_seq = th->th_seq;
 	inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src,
 	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	if (inp != NULL)  {
 		tp = intotcpcb(inp);
 #ifdef TCP_OFFLOAD
 		if (tp->t_flags & TF_TOE && errno == EMSGSIZE) {
 			/*
 			 * MTU discovery for offloaded connections.  Let
 			 * the TOE driver verify seq# and process it.
 			 */
 			mtu = tcp_next_pmtu(icp, ip);
 			tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
 			goto out;
 		}
 #endif
 		if (tp->t_port != port)
 			goto out;
 		/* Only react if the quoted sequence number is outstanding. */
 		if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
 		    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
 			if (errno == EMSGSIZE) {
 				/*
 				 * MTU discovery: we got a needfrag and
 				 * will potentially try a lower MTU.
 				 */
 				mtu = tcp_next_pmtu(icp, ip);
 
 				/*
 				 * Only process the offered MTU if it
 				 * is smaller than the current one.
 				 */
 				if (mtu < tp->t_maxseg +
 				    sizeof(struct tcpiphdr)) {
 					bzero(&inc, sizeof(inc));
 					inc.inc_faddr = ip->ip_dst;
 					inc.inc_fibnum =
 					    inp->inp_inc.inc_fibnum;
 					tcp_hc_updatemtu(&inc, mtu);
 					/* May hand back NULL; see `out'. */
 					inp = tcp_mtudisc(inp, mtu);
 				}
 			} else
 				inp = (*notify)(inp, errno);
 		}
 	} else {
 		/* No connection: let the syncache look for a matching entry. */
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th->th_dport;
 		inc.inc_lport = th->th_sport;
 		inc.inc_faddr = ip->ip_dst;
 		inc.inc_laddr = ip->ip_src;
 		syncache_unreach(&inc, icmp_tcp_seq, port);
 	}
 out:
 	/* inp is NULL when nothing was found or a callee returned NULL. */
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 /*
  * ICMP error entry point for TCP over IPv4: forwards to
  * tcp_ctlinput_with_port() with a port of zero.
  */
 static void
 tcp_ctlinput(struct icmp *icmp)
 {
 	tcp_ctlinput_with_port(icmp, htons(0));
 }
 
 /*
  * ICMP error entry point for TCP tunneled over UDP (IPv4).
  *
  * The quoted packet carries a UDP header in front of the TCP header.
  * After checking the UDP source port against the configured tunneling
  * port, the TCP header is copied over the UDP header, the outer IP length
  * is reduced accordingly, and processing continues in
  * tcp_ctlinput_with_port() with the UDP destination port.
  */
 static void
 tcp_ctlinput_viaudp(udp_tun_icmp_param_t param)
 {
 	/* Its a tunneled TCP over UDP icmp */
 	struct icmp *icmp = param.icmp;
 	struct ip *outer_ip, *inner_ip;
 	struct udphdr *udp;
 	struct tcphdr *th, ttemp;
 	int i_hlen, o_len;
 	uint16_t port;
 
 	/* Assumes the outer IP header sits directly in front of the ICMP header. */
 	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
 	inner_ip = &icmp->icmp_ip;
 	i_hlen = inner_ip->ip_hl << 2;
 	o_len = ntohs(outer_ip->ip_len);
 	/*
 	 * Require outer IP + 8 + quoted IP header + UDP header + the part of
 	 * the TCP header that precedes th_ack.
 	 */
 	if (o_len <
 	    (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) {
 		/* Not enough data present */
 		return;
 	}
 	/* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */
 	udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen);
 	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
 		return;
 	}
 	/* Left as found in the packet; no byte-order conversion is applied. */
 	port = udp->uh_dport;
 	th = (struct tcphdr *)(udp + 1);
 	/*
 	 * Source and destination overlap, hence the bounce through ttemp.
 	 * NOTE(review): this copies sizeof(struct tcphdr) bytes while the
 	 * length check above only covers the header up to th_ack - confirm
 	 * the remaining bytes are always backed by readable memory.
 	 */
 	memcpy(&ttemp, th, sizeof(struct tcphdr));
 	memcpy(udp, &ttemp, sizeof(struct tcphdr));
 	/* Now adjust down the size of the outer IP header */
 	o_len -= sizeof(struct udphdr);
 	outer_ip->ip_len = htons(o_len);
 	/* Now call in to the normal handling code */
 	tcp_ctlinput_with_port(icmp, port);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Pick the path MTU to use after an ICMPv6 packet-too-big message.
  * Returns the MTU carried in the message, or IPV6_MMTU - 8 when that
  * value is below IPV6_MMTU (which includes "none proposed").
  */
 static inline int
 tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
 {
 	int mtu;
 
 	mtu = ntohl(icmp6->icmp6_mtu);
 
 	/* XXXNP: what is the adjustment for? */
 	return (mtu < IPV6_MMTU ? IPV6_MMTU - 8 : mtu);
 }
 
 /*
  * Process an ICMPv6 error that refers to an IPv6 TCP segment.
  *
  * ip6cp - control parameters: the ICMPv6 header, the mbuf holding the
  *         quoted packet, the quoted IPv6 header, the offset of the
  *         transport header within the mbuf and the final destination.
  * port  - compared against tp->t_port; connections whose t_port differs
  *         are ignored.  Also forwarded to syncache_unreach().
  *
  * The ICMPv6 message is mapped to an errno and a matching notify routine
  * is selected.  Ports and sequence number are copied out of the mbuf
  * after length checks.  If a connection is found (write-locked), the
  * error is acted upon only when the quoted sequence number falls in
  * [snd_una, snd_max); otherwise the syncache is told instead.
  *
  * Both tcp_mtudisc() and the notify routines may return NULL instead of
  * the inpcb; their result is always assigned back to inp so that the
  * unlock at `out' is skipped in that case.
  */
 static void
 tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port)
 {
 	struct in6_addr *dst;
 	struct inpcb *(*notify)(struct inpcb *, int);
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct icmp6_hdr *icmp6;
 	struct in_conninfo inc;
 	struct tcp_ports {
 		uint16_t th_sport;
 		uint16_t th_dport;
 	} t_ports;
 	tcp_seq icmp_tcp_seq;
 	unsigned int mtu;
 	unsigned int off;
 	int errno;
 
 	icmp6 = ip6cp->ip6c_icmp6;
 	m = ip6cp->ip6c_m;
 	ip6 = ip6cp->ip6c_ip6;
 	off = ip6cp->ip6c_off;
 	dst = &ip6cp->ip6c_finaldst->sin6_addr;
 
 	/* Translate the ICMPv6 type/code; 0 means there is nothing to do. */
 	errno = icmp6_errmap(icmp6);
 	switch (errno) {
 	case 0:
 		return;
 	case EMSGSIZE:
 		notify = tcp_mtudisc_notify;
 		break;
 	case ECONNREFUSED:
 		if (V_icmp_may_rst)
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	case EHOSTUNREACH:
 		/*
 		 * There are only four ICMPs that may reset connection:
 		 * - administratively prohibited
 		 * - port unreachable
 		 * - time exceeded in transit
 		 * - unknown next header
 		 */
 		if (V_icmp_may_rst &&
 		    ((icmp6->icmp6_type == ICMP6_DST_UNREACH &&
 		     (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN ||
 		      icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) ||
 		    (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED &&
 		      icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) ||
 		    (icmp6->icmp6_type == ICMP6_PARAM_PROB &&
 		      icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER)))
 			notify = tcp_drop_syn_sent;
 		else
 			notify = tcp_notify;
 		break;
 	default:
 		notify = tcp_notify;
 	}
 
 	/* Check if we can safely get the ports from the tcp hdr */
 	if (m == NULL ||
 	    (m->m_pkthdr.len <
 		(int32_t) (off + sizeof(struct tcp_ports)))) {
 		return;
 	}
 	bzero(&t_ports, sizeof(struct tcp_ports));
 	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
 	inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport,
 	    &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	/* The sequence number follows the two ports; check it fits as well. */
 	off += sizeof(struct tcp_ports);
 	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
 		goto out;
 	}
 	/* Kept as found in the packet; converted with ntohl() where compared. */
 	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
 	if (inp != NULL)  {
 		tp = intotcpcb(inp);
 #ifdef TCP_OFFLOAD
 		if (tp->t_flags & TF_TOE && errno == EMSGSIZE) {
 			/* MTU discovery for offloaded connections. */
 			mtu = tcp6_next_pmtu(icmp6);
 			tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
 			goto out;
 		}
 #endif
 		if (tp->t_port != port)
 			goto out;
 		/* Only react if the quoted sequence number is outstanding. */
 		if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
 		    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
 			if (errno == EMSGSIZE) {
 				/*
 				 * MTU discovery:
 				 * If we got a needfrag set the MTU
 				 * in the route to the suggested new
 				 * value (if given) and then notify.
 				 */
 				mtu = tcp6_next_pmtu(icmp6);
 
 				bzero(&inc, sizeof(inc));
 				inc.inc_fibnum = M_GETFIB(m);
 				inc.inc_flags |= INC_ISIPV6;
 				inc.inc6_faddr = *dst;
 				if (in6_setscope(&inc.inc6_faddr,
 					m->m_pkthdr.rcvif, NULL))
 					goto out;
 				/*
 				 * Only process the offered MTU if it
 				 * is smaller than the current one.
 				 */
 				if (mtu < tp->t_maxseg +
 				    sizeof (struct tcphdr) +
 				    sizeof (struct ip6_hdr)) {
 					tcp_hc_updatemtu(&inc, mtu);
 					/*
 					 * tcp_mtudisc() returns NULL when
 					 * its output pass reports failure;
 					 * adopt the result so the inpcb is
 					 * not unlocked again at `out',
 					 * exactly as is done for the
 					 * (*notify)() result below.
 					 */
 					inp = tcp_mtudisc(inp, mtu);
 					ICMP6STAT_INC(icp6s_pmtuchg);
 				}
 			} else
 				inp = (*notify)(inp, errno);
 		}
 	} else {
 		/* No connection: let the syncache look for a matching entry. */
 		bzero(&inc, sizeof(inc));
 		inc.inc_fibnum = M_GETFIB(m);
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc_fport = t_ports.th_dport;
 		inc.inc_lport = t_ports.th_sport;
 		inc.inc6_faddr = *dst;
 		inc.inc6_laddr = ip6->ip6_src;
 		syncache_unreach(&inc, icmp_tcp_seq, port);
 	}
 out:
 	/* inp is NULL when nothing was found or a callee returned NULL. */
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 /*
  * ICMPv6 error entry point for TCP over IPv6: forwards to
  * tcp6_ctlinput_with_port() with a port of zero.
  */
 static void
 tcp6_ctlinput(struct ip6ctlparam *ctl)
 {
 	tcp6_ctlinput_with_port(ctl, htons(0));
 }
 
 /*
  * ICMPv6 error entry point for TCP tunneled over UDP (IPv6).
  *
  * Checks that the quoted UDP header's source port is the configured
  * tunneling port, trims the UDP header off the mbuf and continues in
  * tcp6_ctlinput_with_port() with the UDP destination port.
  */
 static void
 tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param)
 {
 	struct ip6ctlparam *ip6cp = param.ip6cp;
 	struct mbuf *m;
 	struct udphdr *udp;
 	uint16_t port;
 
 	/* Make the UDP header at ip6c_off contiguous so it can be read directly. */
 	m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL);
 	if (m == NULL) {
 		return;
 	}
 	/* NOTE(review): relies on the header starting at the data pointer of m - confirm. */
 	udp = mtod(m, struct udphdr *);
 	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
 		return;
 	}
 	/* Left as found in the packet; no byte-order conversion is applied. */
 	port = udp->uh_dport;
 	m_adj(m, sizeof(struct udphdr));
 	/* m is not the packet-header mbuf: fix the total length by hand. */
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr);
 	}
 	/* Now call in to the normal handling code */
 	tcp6_ctlinput_with_port(ip6cp, port);
 }
 
 #endif /* INET6 */
 
 /*
  * Compute a keyed SipHash-2-4 over the connection identifiers in `inc'
  * (foreign port, local port, then the foreign and local address of the
  * connection's address family) and fold the result to 32 bits.
  *
  * `key' must provide at least SIPHASH_KEY_LENGTH bytes; `len' is only
  * used to assert that.
  */
 static uint32_t
 tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len)
 {
 	SIPHASH_CTX sip;
 	uint32_t digest[2];
 
 	KASSERT(len >= SIPHASH_KEY_LENGTH,
 	    ("%s: keylen %u too short ", __func__, len));
 
 	SipHash24_Init(&sip);
 	SipHash_SetKey(&sip, (uint8_t *)key);
 
 	/* Ports first ... */
 	SipHash_Update(&sip, &inc->inc_fport, sizeof(uint16_t));
 	SipHash_Update(&sip, &inc->inc_lport, sizeof(uint16_t));
 
 	/* ... then the address pair of whichever family this is. */
 #ifdef INET
 	if ((inc->inc_flags & INC_ISIPV6) == 0) {
 		SipHash_Update(&sip, &inc->inc_faddr, sizeof(struct in_addr));
 		SipHash_Update(&sip, &inc->inc_laddr, sizeof(struct in_addr));
 	}
 #endif
 #ifdef INET6
 	if ((inc->inc_flags & INC_ISIPV6) == INC_ISIPV6) {
 		SipHash_Update(&sip, &inc->inc6_faddr, sizeof(struct in6_addr));
 		SipHash_Update(&sip, &inc->inc6_laddr, sizeof(struct in6_addr));
 	}
 #endif
 	SipHash_Final((uint8_t *)digest, &sip);
 
 	/* Fold the two 32-bit halves together. */
 	return (digest[0] ^ digest[1]);
 }
 
 uint32_t
 tcp_new_ts_offset(struct in_conninfo *inc)
 {
 	struct in_conninfo inc_store, *local_inc;
 
 	if (!V_tcp_ts_offset_per_conn) {
 		memcpy(&inc_store, inc, sizeof(struct in_conninfo));
 		inc_store.inc_lport = 0;
 		inc_store.inc_fport = 0;
 		local_inc = &inc_store;
 	} else {
 		local_inc = inc;
 	}
 	return (tcp_keyed_hash(local_inc, V_ts_offset_secret,
 	    sizeof(V_ts_offset_secret)));
 }
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last, isn_last_reseed,
  * isn_offset, and isn_offset_old is performed using the ISN lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 /* Target growth rate of the ISN offset, in sequence-number units per second. */
 #define ISN_BYTES_PER_SECOND 1048576
 /* Fixed amount added to the ISN offset for every ISN handed out. */
 #define ISN_STATIC_INCREMENT 4096
 /* Bit mask applied to a random value to form the variable part of the increment. */
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 /* Size of the ISN secret; it is used directly as the SipHash key. */
 #define ISN_SECRET_LENGTH    SIPHASH_KEY_LENGTH
 
 /* Per-VNET ISN state: hash key, tick bookkeeping and the running offset. */
 VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]);
 VNET_DEFINE_STATIC(int, isn_last);		/* ticks at last time-based offset update */
 VNET_DEFINE_STATIC(int, isn_last_reseed);	/* ticks at last (re)seed; 0 = never seeded */
 VNET_DEFINE_STATIC(u_int32_t, isn_offset);	/* current offset added to the hash */
 VNET_DEFINE_STATIC(u_int32_t, isn_offset_old);	/* offset as of the last time-based update */
 
 #define	V_isn_secret			VNET(isn_secret)
 #define	V_isn_last			VNET(isn_last)
 #define	V_isn_last_reseed		VNET(isn_last_reseed)
 #define	V_isn_offset			VNET(isn_offset)
 #define	V_isn_offset_old		VNET(isn_offset_old)
 
 /*
  * Generate the initial sequence number for a new connection.
  *
  * The result is a keyed hash of the connection identifiers in `inc' plus
  * the running ISN offset.  The offset grows by a static plus a random
  * increment on every call and is additionally pulled forward according
  * to elapsed ticks.  All ISN state is accessed under the ISN lock.
  */
 tcp_seq
 tcp_new_isn(struct in_conninfo *inc)
 {
 	tcp_seq new_isn;
 	u_int32_t projected_offset;
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0);
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the hash and return the ISN. */
 	new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret,
 	    sizeof(V_isn_secret));
 	/* Per-call increment: fixed part plus a masked random part. */
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	if (ticks != V_isn_last) {
 		/*
 		 * Time has advanced since the last update: work out where the
 		 * offset would be at ISN_BYTES_PER_SECOND and jump ahead to it
 		 * if the per-call increments have not got that far already.
 		 */
 		projected_offset = V_isn_offset_old +
 		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 		V_isn_offset_old = V_isn_offset;
 		V_isn_last = ticks;
 	}
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * Notify routine used for selected ICMP errors when the icmp_may_rst
  * sysctl is enabled: a connection still in SYN-SENT is dropped with the
  * given error; any other state is left untouched.
  *
  * Returns inp, or NULL when tcp_drop() returned NULL.
  */
 static struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_SYN_SENT) {
 		/* Not our business in any other state. */
 		return (inp);
 	}
 
 	if (IS_FASTOPEN(tp->t_flags)) {
 		tcp_fastopen_disable_path(tp);
 	}
 
 	return (tcp_drop(tp, errno) != NULL ? inp : NULL);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value. Also nudge TCP to send something, since we
  * know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  *
  * This is the notify-callback form of tcp_mtudisc(): `error' is not used
  * and -1 is passed as the MTU offer.  The return value of tcp_mtudisc()
  * is passed through unchanged.
  */
 static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 
 	return (tcp_mtudisc(inp, -1));
 }
 
 /*
  * React to a path MTU change for the connection behind `inp'.
  *
  * The MSS is recomputed from `mtuoffer' and capped at the send buffer's
  * high-water mark.  The send state is then rewound to snd_una (SACK holes
  * freed, fast recovery left when SACK is permitted), the stack's MTU
  * change hook is invoked if present, and an output pass is made.
  *
  * Returns inp, or NULL when tcp_output() returned a negative value.
  */
 static struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
 
 	/* Never let the MSS exceed what the send buffer can hold. */
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	if (tp->t_maxseg > so->so_snd.sb_hiwat)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	TCPSTAT_INC(tcps_mturesent);
 
 	/* Rewind so that everything outstanding gets sent again. */
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if ((tp->t_flags & TF_SACK_PERMIT) != 0)
 		EXIT_FASTRECOVERY(tp->t_flags);
 
 	/*
 	 * Conceptually the snd_nxt rewind and the freeing of the SACK holes
 	 * above belong in the default stack's own tfb_tcp_mtu_chg().
 	 */
 	if (tp->t_fb->tfb_tcp_mtu_chg != NULL)
 		tp->t_fb->tfb_tcp_mtu_chg(tp);
 
 	return (tcp_output(tp) < 0 ? NULL : inp);
 }
 
 #ifdef INET
 /*
  * Return the MTU of the IPv4 route towards the peer described by `inc'.
  * Returns 0 when the foreign address is unset or no route is found.
  * When `cap' is non-NULL and the outgoing interface has TSO for IPv4
  * enabled, the interface's TSO limits are reported through it.
  *
  * Used by TCP routines that need the peer/interface MTU, such as
  * tcp_mss_update.
  */
 uint32_t
 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop_object *nh;
 	struct ifnet *ifp;
 	uint32_t maxmtu;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	if (inc->inc_faddr.s_addr == INADDR_ANY)
 		return (0);
 
 	nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0);
 	if (nh == NULL)
 		return (0);
 
 	ifp = nh->nh_ifp;
 	maxmtu = nh->nh_mtu;
 
 	/* Report additional interface capabilities. */
 	if (cap != NULL &&
 	    (ifp->if_capenable & IFCAP_TSO4) &&
 	    (ifp->if_hwassist & CSUM_TSO)) {
 		cap->ifcap |= CSUM_TSO;
 		cap->tsomax = ifp->if_hw_tsomax;
 		cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 		cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 
 	return (maxmtu);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Return the MTU of the IPv6 route towards the peer described by `inc'.
  * Returns IPV6_MMTU outright when INC_IPV6MINMTU is set, and 0 when the
  * foreign address is unspecified or no route is found.  When `cap' is
  * non-NULL and the outgoing interface has TSO for IPv6 enabled, the
  * interface's TSO limits are reported through it.
  */
 uint32_t
 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop_object *nh;
 	struct in6_addr dst6;
 	uint32_t scopeid;
 	struct ifnet *ifp;
 	uint32_t maxmtu;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	if (inc->inc_flags & INC_IPV6MINMTU)
 		return (IPV6_MMTU);
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr))
 		return (0);
 
 	/* The lookup wants the address and its scope id separately. */
 	in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid);
 	nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0);
 	if (nh == NULL)
 		return (0);
 
 	ifp = nh->nh_ifp;
 	maxmtu = nh->nh_mtu;
 
 	/* Report additional interface capabilities. */
 	if (cap != NULL &&
 	    (ifp->if_capenable & IFCAP_TSO6) &&
 	    (ifp->if_hwassist & CSUM_TSO)) {
 		cap->ifcap |= CSUM_TSO;
 		cap->tsomax = ifp->if_hw_tsomax;
 		cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 		cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 
 	return (maxmtu);
 }
 
 /*
  * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack.
  *
  * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag.
  * The right place to do that is ip6_setpktopt() that has just been
  * executed.  By the way it just filled ip6po_minmtu for us.
  *
  * The INC_IPV6MINMTU flag is always set, so that a corresponding MSS is
  * announced during the initial handshake.  If the connection is already
  * at or past SYN_SENT, the MSS in use is reduced right away, which
  * avoids sending TCP segments that would be fragmented at the IPv6
  * layer.
  */
 void
 tcp6_use_min_mtu(struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 	struct ip6_pktopts *opt;
 
 	INP_WLOCK_ASSERT(inp);
 
 	inp->inp_inc.inc_flags |= INC_IPV6MINMTU;
 
 	/* Nothing more to do before SYN_SENT or on a non-IPv6 connection. */
 	if (tp->t_state < TCPS_SYN_SENT)
 		return;
 	if ((inp->inp_inc.inc_flags & INC_ISIPV6) == 0)
 		return;
 
 	/* Only the "always use the minimum MTU" setting clamps the MSS. */
 	opt = inp->in6p_outputopts;
 	if (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_ALL)
 		return;
 
 	if (tp->t_maxseg > TCP6_MSS)
 		tp->t_maxseg = TCP6_MSS;
 }
 #endif /* INET6 */
 
 /*
  * Calculate effective SMSS per RFC5681 definition for a given TCP
  * connection at its current state, taking into account SACK and etc.
  *
  * The value is t_maxseg minus an estimate of the TCP option bytes that
  * would currently be sent (capped at TCP_MAXOLEN), or plain t_maxseg
  * when options are disabled with TF_NOOPT.
  */
 u_int
 tcp_maxseg(const struct tcpcb *tp)
 {
 	u_int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * This is a simplified version of tcp_addoptions(): no proper loop
 	 * and most paddings hardcoded.  Padding may be off in some edge
 	 * cases, which is harmless because the result is only used in cwnd
 	 * and ssthresh estimations.
 	 */
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		/* Established: timestamps, signature and current SACK blocks. */
 		optlen = (tp->t_flags & TF_RCVD_TSTMP) ?
 		    TCPOLEN_TSTAMP_APPA : 0;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) {
 			optlen += TCPOLEN_SACKHDR +
 			    tp->rcv_numsacks * TCPOLEN_SACK;
 			optlen = PADTCPOLEN(optlen);
 		}
 	} else {
 		/* Handshake: the options that go into a SYN. */
 		optlen = (tp->t_flags & TF_REQ_TSTMP) ?
 		    TCPOLEN_TSTAMP_APPA : PADTCPOLEN(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PADTCPOLEN(TCPOLEN_WINDOW);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED);
 	}
 	/* This function defines no PAD macro itself; the #undef is kept as is. */
 #undef PAD
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 
 /*
  * Variant of tcp_maxseg() that only subtracts the options sent on every
  * segment; SACK blocks are deliberately not considered.
  *
  * With TF_NOOPT set the result is tp->t_maxseg itself.  Otherwise the
  * result is tp->t_maxseg minus the estimated fixed option bytes, where the
  * estimate is capped at TCP_MAXOLEN.
  */
 u_int
 tcp_fixed_maxseg(const struct tcpcb *tp)
 {
 	u_int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * Here we have a simplified code from tcp_addoptions(),
 	 * without a proper loop, and having most of paddings hardcoded.
 	 * We only consider fixed options that we would send every
 	 * time I.e. SACK is not considered. This is important
 	 * for cc modules to figure out what the modulo of the
 	 * cwnd should be.
 	 *
 	 * Padding uses PADTCPOLEN(), the same helper tcp_maxseg() uses,
 	 * rather than a private copy of the rounding macro.
 	 */
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		if (tp->t_flags & TF_RCVD_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = 0;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 	} else {
 		if (tp->t_flags & TF_REQ_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = PADTCPOLEN(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PADTCPOLEN(TCPOLEN_WINDOW);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED);
 	}
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 
 
 /*
  * Sysctl handler that drops the TCP connection named by a pair of socket
  * addresses written by the caller.
  *
  * The new value must hold two struct sockaddr_storage entries (foreign
  * first, local second).  Both must be AF_INET or AF_INET6 sockaddrs of
  * exactly the expected length; a V4-mapped IPv6 pair is converted to IPv4
  * before the lookup.
  *
  * Returns EINVAL if an old value is requested or the addresses are
  * malformed, EPERM if no new value is supplied, ENOMEM if the new value is
  * too short, ESRCH if no matching inpcb is found, or an error from
  * SYSCTL_IN() / sa6_embedscope().  A matching listening socket is left
  * untouched and 0 is returned.
  */
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	/* This node is write-only: reject reads and require a full payload. */
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	/* Validate both sockaddrs according to the foreign address family. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			/* Both ends must be V4-mapped; rewrite them in place. */
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 #ifdef INET
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 #endif
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	NET_EPOCH_ENTER(et);
 	/*
 	 * The family is re-read from addrs[0] here; presumably the V4-mapped
 	 * conversion above changed it so that the AF_INET lookup is taken --
 	 * confirm against in6_sin6_2_sin_in_sock().
 	 */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	if (inp != NULL) {
 		if (!SOLISTENING(inp->inp_socket)) {
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			/*
 			 * The inp is only unlocked here when tcp_drop() returns
 			 * a tcpcb; NOTE(review): a NULL return presumably means
 			 * the lock was already released -- confirm.
 			 */
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * "drop" node under _net_inet_tcp with the fixed number TCPCTL_DROP.  It is
  * writable (CTLFLAG_WR), flagged CTLFLAG_SKIP and CTLFLAG_NEEDGIANT, and
  * handled by sysctl_drop().
  */
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "",
     "Drop TCP connection");
 
 static int
 tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS)
 {
 	return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo,
 	    &tcp_ctloutput_set));
 }
 
 /*
  * "setsockopt" node under _net_inet_tcp with an automatically assigned
  * number.  It is writable (CTLFLAG_WR), flagged CTLFLAG_SKIP and
  * CTLFLAG_MPSAFE, and handled by tcp_sysctl_setsockopt().
  */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "",
     "Set socket option for TCP endpoint");
 
 #ifdef KERN_TLS
 /*
  * Sysctl handler that switches the TLS transmit mode of the TCP connection
  * named by a pair of socket addresses written by the caller.
  *
  * The new value must hold two struct sockaddr_storage entries (foreign
  * first, local second), validated exactly as in sysctl_drop().  arg2 selects
  * the mode: 0 requests TCP_TLS_MODE_SW, anything else TCP_TLS_MODE_IFNET.
  *
  * Returns EINVAL, EPERM or ENOMEM for a malformed request, ESRCH if no
  * matching inpcb is found, an error from SYSCTL_IN() / sa6_embedscope(),
  * or otherwise the result of ktls_set_tx_mode().
  */
 static int
 sysctl_switch_tls(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	/* This node is write-only: reject reads and require a full payload. */
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	/* Validate both sockaddrs according to the foreign address family. */
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			/* Both ends must be V4-mapped; rewrite them in place. */
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 #ifdef INET
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 #endif
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	/* The epoch section only covers the lookup itself. */
 	NET_EPOCH_ENTER(et);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		struct socket *so;
 
 		/*
 		 * Take a socket reference before the mode switch and drop it
 		 * only after the inp has been unlocked.
 		 */
 		so = inp->inp_socket;
 		soref(so);
 		error = ktls_set_tx_mode(so,
 		    arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET);
 		INP_WUNLOCK(inp);
 		sorele(so);
 	} else
 		error = ESRCH;
 	return (error);
 }
 
 /*
  * Two writable nodes share sysctl_switch_tls(); they differ only in the
  * arg2 value passed to the handler (0 for "switch_to_sw_tls", 1 for
  * "switch_to_ifnet_tls").
  */
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "",
     "Switch TCP connection to SW TLS");
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "",
     "Switch TCP connection to ifnet TLS");
 #endif
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 	/* Only build the log line when V_tcp_log_in_vain is non-zero. */
 	if (V_tcp_log_in_vain != 0)
 		return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 
 	return (NULL);
 }
 
 /*
  * Same as tcp_log_vain(), but gated on tcp_log_debug: returns NULL when
  * that knob is zero, otherwise the string built by tcp_log_addr().
  */
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 	/* Only build the log line when tcp_log_debug is non-zero. */
 	if (tcp_log_debug != 0)
 		return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 
 	return (NULL);
 }
 
 /*
  * Build the "TCP: [faddr]:fport to [laddr]:lport tcpflags ..." log line in
  * a freshly allocated M_TCPLOG buffer.
  *
  * Addresses and ports come from inc when it is non-NULL; otherwise from
  * the IPv6 header plus th, or from the IPv4 header plus th.  The tcpflags
  * suffix is appended only when th is non-NULL.
  *
  * Returns NULL if the M_NOWAIT allocation fails or if no usable address
  * source was supplied.  Panics if the last byte of the buffer was
  * overwritten.
  */
 static char *
 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 #ifdef INET
 	const struct ip *ip = (const struct ip *)ip4hdr;
 #endif
 #ifdef INET6
 	const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 */
 	/* Worst-case size: the template, the flag names and two addresses. */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	/*
 	 * M_ZERO matters twice: strcat() below appends to an empty string,
 	 * and the final byte must still be NUL for the check at the end.
 	 */
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	/* sp is re-pointed at the current end of the string after each piece. */
 	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc) {
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 && th) {
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 #ifdef INET
 	} else if (ip && th) {
 		inet_ntoa_r(ip->ip_src, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET */
 	} else {
 		/* No address source at all: nothing sensible to log. */
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp = s + strlen(s);
 	if (th)
 		sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS);
 	/* A non-NUL last byte means the unbounded writes above overran. */
 	if (*(s + size - 1) != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
 
 /*
  * A subroutine which makes it easy to track TCP state changes with DTrace.
  * This function shouldn't be called for t_state initializations that don't
  * correspond to actual TCP state transitions.
  */
 void
 tcp_state_change(struct tcpcb *tp, int newstate)
 {
 #if defined(KDTRACE_HOOKS)
 	int pstate = tp->t_state;
 #endif
 
 	TCPSTATES_DEC(tp->t_state);
 	TCPSTATES_INC(newstate);
 	tp->t_state = newstate;
 	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
 }
 
 /*
  * Create an external-format (``xtcpcb'') structure using the information in
  * the kernel-format tcpcb structure pointed to by tp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	sbintime_t now;
 
 	bzero(xt, sizeof(*xt));
 	xt->t_state = tp->t_state;
 	xt->t_logstate = tp->t_logstate;
 	xt->t_flags = tp->t_flags;
 	xt->t_sndzerowin = tp->t_sndzerowin;
 	xt->t_sndrexmitpack = tp->t_sndrexmitpack;
 	xt->t_rcvoopack = tp->t_rcvoopack;
 	xt->t_rcv_wnd = tp->rcv_wnd;
 	xt->t_snd_wnd = tp->snd_wnd;
 	xt->t_snd_cwnd = tp->snd_cwnd;
 	xt->t_snd_ssthresh = tp->snd_ssthresh;
 	xt->t_dsack_bytes = tp->t_dsack_bytes;
 	xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
 	xt->t_dsack_pack = tp->t_dsack_pack;
 	xt->t_maxseg = tp->t_maxseg;
 	xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 +
 		     (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
 
 	now = getsbinuptime();
 #define	COPYTIMER(which,where)	do {					\
 	if (tp->t_timers[which] != SBT_MAX)				\
 		xt->where = (tp->t_timers[which] - now) / SBT_1MS;	\
 	else								\
 		xt->where = 0;						\
 } while (0)
 	COPYTIMER(TT_DELACK, tt_delack);
 	COPYTIMER(TT_REXMT, tt_rexmt);
 	COPYTIMER(TT_PERSIST, tt_persist);
 	COPYTIMER(TT_KEEP, tt_keep);
 	COPYTIMER(TT_2MSL, tt_2msl);
 #undef COPYTIMER
 	xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
 
 	xt->xt_encaps_port = tp->t_port;
 	bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
 	    TCP_FUNCTION_NAME_LEN_MAX);
 	bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX);
 #ifdef TCP_BLACKBOX
 	(void)tcp_log_get_id(tp, xt->xt_logid);
 #endif
 
 	xt->xt_len = sizeof(struct xtcpcb);
 	in_pcbtoxinpcb(inp, &xt->xt_inp);
 }
 
 /*
  * Record an end-status code on the connection.  Each code is stored at most
  * once: a bit in t_end_info_status remembers that it was logged and the
  * code itself goes into the first empty slot of t_end_info_bytes.  Nothing
  * is recorded when tp is NULL, the code is out of range, it was already
  * logged, or no slot is free.
  */
 void
 tcp_log_end_status(struct tcpcb *tp, uint8_t status)
 {
 	uint32_t mask, slot;
 
 	/* Invalid: no tcpcb, or a code outside 1..TCP_EI_STATUS_MAX_VALUE. */
 	if (tp == NULL || status == 0 || status > TCP_EI_STATUS_MAX_VALUE)
 		return;
 
 	/* The code must map onto a bit of the 32-bit status word. */
 	if (status > (sizeof(uint32_t) * 8)) {
 		/* Should this be a KASSERT? */
 		return;
 	}
 
 	mask = 1U << (status - 1);
 	if ((tp->t_end_info_status & mask) != 0) {
 		/* already logged */
 		return;
 	}
 
 	for (slot = 0; slot < TCP_END_BYTE_INFO; slot++) {
 		if (tp->t_end_info_bytes[slot] != TCP_EI_EMPTY_SLOT)
 			continue;
 		tp->t_end_info_bytes[slot] = status;
 		tp->t_end_info_status |= mask;
 		break;
 	}
 }
 
 /*
  * Ask for permission to pace one more connection.  Returns 1 and bumps the
  * pacing connection count when tcp_pacing_limit is -1 or still above the
  * current count; returns 0 without changing anything otherwise.
  */
 int
 tcp_can_enable_pacing(void)
 {
 
 	/* A limit other than -1 that the count has reached: refuse. */
 	if ((tcp_pacing_limit != -1) &&
 	    (tcp_pacing_limit <= number_of_tcp_connections_pacing))
 		return (0);
 
 	atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
 	shadow_num_connections = number_of_tcp_connections_pacing;
 	return (1);
 }
 
 /* Set once the "invalid decrement" warning has been printed. */
 static uint8_t tcp_pacing_warning = 0;
 
 /*
  * Drop one connection from the pacing count.  If the count was already
  * zero before the decrement, the KASSERT fires on INVARIANTS kernels;
  * otherwise a warning is printed and, when a limit is configured, the limit
  * is forced to 0.
  */
 void
 tcp_decrement_paced_conn(void)
 {
 	uint32_t prev;
 
 	prev = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
 	shadow_num_connections = number_of_tcp_connections_pacing;
 	KASSERT(prev != 0, ("tcp_paced_connection_exits -1 would cause wrap?"));
 	if (prev != 0)
 		return;
 
 	/* The counter was already zero: the decrement was bogus. */
 	if (tcp_pacing_limit != -1) {
 		printf("Warning all pacing is now disabled, count decrements invalidly!\n");
 		tcp_pacing_limit = 0;
 		return;
 	}
 	if (tcp_pacing_warning == 0) {
 		printf("Warning pacing count is invalid, invalid decrement\n");
 		tcp_pacing_warning = 1;
 	}
 }
diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c
index 39423a425b36..c6af4a00c6c3 100644
--- a/sys/netinet/toecore.c
+++ b/sys/netinet/toecore.c
@@ -1,605 +1,606 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/types.h>
 #include <sys/sockopt.h>
 #include <sys/sysctl.h>
 #include <sys/socket.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/route.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/nd6.h>
 #define TCPSTATES
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_offload.h>
 #include <netinet/toecore.h>
 
 /* Protects toedev_list (taken around every walk and update below). */
 static struct mtx toedev_lock;
 /* Registered TOE devices; see register_toedev()/unregister_toedev(). */
 static TAILQ_HEAD(, toedev) toedev_list;
 /* Event handler tags saved by toecore_load() for toecore_unload(). */
 static eventhandler_tag listen_start_eh;
 static eventhandler_tag listen_stop_eh;
 static eventhandler_tag lle_event_eh;
 
 /* Default tod_connect installed by init_toedev(): always ENOTSUP. */
 static int
 toedev_connect(struct toedev *tod __unused, struct socket *so __unused,
     struct nhop_object *nh __unused, struct sockaddr *nam __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_listen_start installed by init_toedev(): always ENOTSUP. */
 static int
 toedev_listen_start(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_listen_stop installed by init_toedev(): always ENOTSUP. */
 static int
 toedev_listen_stop(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_input installed by init_toedev(): frees the mbuf chain. */
 static void
 toedev_input(struct toedev *tod __unused, struct tcpcb *tp __unused,
     struct mbuf *m)
 {
 	m_freem(m);
 }
 
 /* Default tod_rcvd installed by init_toedev(): does nothing. */
 static void
 toedev_rcvd(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	/* Nothing to do. */
 }
 
 /*
  * Default for tod_output, tod_send_rst and tod_send_fin installed by
  * init_toedev(): always ENOTSUP.
  */
 static int
 toedev_output(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	return (ENOTSUP);
 }
 
 /* Default tod_pcb_detach installed by init_toedev(): does nothing. */
 static void
 toedev_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_l2_update installed by init_toedev(): does nothing. */
 static void
 toedev_l2_update(struct toedev *tod __unused, struct ifnet *ifp __unused,
     struct sockaddr *sa __unused, uint8_t *lladdr __unused,
     uint16_t vtag __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_route_redirect installed by init_toedev(): does nothing. */
 static void
 toedev_route_redirect(struct toedev *tod __unused, struct ifnet *ifp __unused,
     struct nhop_object *nh0 __unused, struct nhop_object *nh1 __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_syncache_added installed by init_toedev(): does nothing. */
 static void
 toedev_syncache_added(struct toedev *tod __unused, void *ctx __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_syncache_removed installed by init_toedev(): does nothing. */
 static void
 toedev_syncache_removed(struct toedev *tod __unused, void *ctx __unused)
 {
 	/* Nothing to do. */
 }
 
 /*
  * Default tod_syncache_respond installed by init_toedev(): frees the mbuf
  * chain and returns 0.
  */
 static int
 toedev_syncache_respond(struct toedev *tod __unused, void *ctx __unused,
     struct mbuf *m)
 {
 	m_freem(m);
 
 	return (0);
 }
 
 /* Default tod_offload_socket installed by init_toedev(): does nothing. */
 static void
 toedev_offload_socket(struct toedev *tod __unused, void *ctx __unused,
     struct socket *so __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_ctloutput installed by init_toedev(): does nothing. */
 static void
 toedev_ctloutput(struct toedev *tod __unused, struct tcpcb *tp __unused,
     int sopt_dir __unused, int sopt_name __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_tcp_info installed by init_toedev(): leaves ti untouched. */
 static void
 toedev_tcp_info(struct toedev *tod __unused, struct tcpcb *tp __unused,
     struct tcp_info *ti __unused)
 {
 	/* Nothing to do. */
 }
 
 /* Default tod_alloc_tls_session installed by init_toedev(): always EINVAL. */
 static int
 toedev_alloc_tls_session(struct toedev *tod __unused, struct tcpcb *tp __unused,
     struct ktls_session *tls __unused, int direction __unused)
 {
 	return (EINVAL);
 }
 
 /* Default tod_pmtu_update installed by init_toedev(): does nothing. */
 static void
 toedev_pmtu_update(struct toedev *tod __unused, struct tcpcb *tp __unused,
     tcp_seq seq __unused, int mtu __unused)
 {
 	/* Nothing to do. */
 }
 
 /*
  * Inform one or more TOE devices about a listening socket.
  *
  * arg is either NULL, in which case every device on toedev_list gets its
  * tod_listen_start hook called, or a specific toedev, in which case only
  * that device does.  Nothing happens for an inp flagged INP_DROPPED or a
  * tcpcb that is not in TCPS_LISTEN.  The inp must be write-locked.
  */
 static void
 toe_listen_start(struct inpcb *inp, void *arg)
 {
 	struct toedev *only = arg;
 	struct toedev *tod;
 	struct tcpcb *tp;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(inp->inp_pcbinfo == &V_tcbinfo,
 	    ("%s: inp is not a TCP inp", __func__));
 
 	if ((inp->inp_flags & INP_DROPPED) != 0)
 		return;
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_LISTEN)
 		return;
 
 	mtx_lock(&toedev_lock);
 	TAILQ_FOREACH(tod, &toedev_list, link) {
 		/* Skip devices other than the one explicitly requested. */
 		if (only != NULL && only != tod)
 			continue;
 		tod->tod_listen_start(tod, tp);
 	}
 	mtx_unlock(&toedev_lock);
 }
 
 /*
  * Handler registered for the tcp_offload_listen_start event by
  * toecore_load().  Asserts that the inp is write-locked and the tcpcb is
  * in TCPS_LISTEN, then notifies every registered TOE device via
  * toe_listen_start() with a NULL device filter.
  */
 static void
 toe_listen_start_event(void *arg __unused, struct tcpcb *tp)
 {
 	struct inpcb *inp = tptoinpcb(tp);
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_LISTEN,
 	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
 
 	toe_listen_start(inp, NULL);
 }
 
 /*
  * Handler registered for the tcp_offload_listen_stop event by
  * toecore_load().  Calls the tod_listen_stop hook of every registered TOE
  * device, under toedev_lock.
  */
 static void
 toe_listen_stop_event(void *arg __unused, struct tcpcb *tp)
 {
 	struct toedev *tod;
 	/*
 	 * inp is only declared under INVARIANTS; NOTE(review): this relies on
 	 * INP_WLOCK_ASSERT() not referencing its argument otherwise -- confirm.
 	 */
 #ifdef INVARIANTS
 	struct inpcb *inp = tptoinpcb(tp);
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(tp->t_state == TCPS_LISTEN,
 	    ("%s: t_state %s", __func__, tcpstates[tp->t_state]));
 
 	mtx_lock(&toedev_lock);
 	TAILQ_FOREACH(tod, &toedev_list, link)
 	    tod->tod_listen_stop(tod, tp);
 	mtx_unlock(&toedev_lock);
 }
 
 /*
  * Fill up a freshly allocated toedev struct with reasonable defaults.
  *
  * Clears tod_softc and points every tod_* hook at the matching toedev_*
  * stub defined above.  Hooks the driver wants to implement are presumably
  * overwritten by the driver afterwards -- that happens outside this file.
  */
 void
 init_toedev(struct toedev *tod)
 {
 
 	tod->tod_softc = NULL;
 
 	/*
 	 * Provide no-op defaults so that the kernel can call any toedev
 	 * function without having to check whether the TOE driver supplied one
 	 * or not.
 	 */
 	tod->tod_connect = toedev_connect;
 	tod->tod_listen_start = toedev_listen_start;
 	tod->tod_listen_stop = toedev_listen_stop;
 	tod->tod_input = toedev_input;
 	tod->tod_rcvd = toedev_rcvd;
 	/* One ENOTSUP stub serves all three transmit-side hooks. */
 	tod->tod_output = toedev_output;
 	tod->tod_send_rst = toedev_output;
 	tod->tod_send_fin = toedev_output;
 	tod->tod_pcb_detach = toedev_pcb_detach;
 	tod->tod_l2_update = toedev_l2_update;
 	tod->tod_route_redirect = toedev_route_redirect;
 	tod->tod_syncache_added = toedev_syncache_added;
 	tod->tod_syncache_removed = toedev_syncache_removed;
 	tod->tod_syncache_respond = toedev_syncache_respond;
 	tod->tod_offload_socket = toedev_offload_socket;
 	tod->tod_ctloutput = toedev_ctloutput;
 	tod->tod_tcp_info = toedev_tcp_info;
 	tod->tod_alloc_tls_session = toedev_alloc_tls_session;
 	tod->tod_pmtu_update = toedev_pmtu_update;
 }
 
 /*
  * Register an active TOE device with the system.  This allows it to receive
  * notifications from the kernel.
  *
  * Returns EEXIST if the device is already on toedev_list.  Otherwise the
  * device is appended, registered_toedevs is incremented, toe_listen_start()
  * is applied to every TCP inp with this device as the filter, and 0 is
  * returned.
  */
 int
 register_toedev(struct toedev *tod)
 {
 	struct toedev *cur;
 
 	mtx_lock(&toedev_lock);
 	TAILQ_FOREACH(cur, &toedev_list, link) {
 		if (cur != tod)
 			continue;
 		/* Already registered. */
 		mtx_unlock(&toedev_lock);
 		return (EEXIST);
 	}
 	TAILQ_INSERT_TAIL(&toedev_list, tod, link);
 	registered_toedevs++;
 	mtx_unlock(&toedev_lock);
 
 	/* Done after toedev_lock is dropped; the callback takes it itself. */
 	inp_apply_all(&V_tcbinfo, toe_listen_start, tod);
 
 	return (0);
 }
 
 /*
  * Remove the TOE device from the global list of active TOE devices.  It is the
  * caller's responsibility to ensure that the TOE device is quiesced prior to
  * this call.
  */
 int
 unregister_toedev(struct toedev *tod)
 {
 	struct toedev *t, *t2;
 	int rc = ENODEV;
 
 	mtx_lock(&toedev_lock);
 	TAILQ_FOREACH_SAFE(t, &toedev_list, link, t2) {
 		if (t == tod) {
 			TAILQ_REMOVE(&toedev_list, tod, link);
 			registered_toedevs--;
 			rc = 0;
 			break;
 		}
 	}
 	KASSERT(registered_toedevs >= 0,
 	    ("%s: registered_toedevs (%d) < 0", __func__, registered_toedevs));
 	mtx_unlock(&toedev_lock);
 	return (rc);
 }
 
 /*
  * Wrapper around syncache_add() for TOE drivers.  The inp must be
  * read-locked.  The inp's socket is passed along with the TOE device and
  * its context; the mbuf argument is NULL and the port is htons(0).  The
  * return value of syncache_add() is deliberately ignored.
  */
 void
 toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
     struct inpcb *inp, void *tod, void *todctx, uint8_t iptos)
 {
 	struct socket *lso;
 
 	INP_RLOCK_ASSERT(inp);
 
 	lso = inp->inp_socket;
 	(void)syncache_add(inc, to, th, inp, lso, NULL, tod, todctx, iptos,
 	    htons(0));
 }
 
 /*
  * Wrapper around syncache_expand() for TOE drivers.  Must be called inside
  * the network epoch.  Passes a NULL mbuf and port htons(0), and returns
  * the result of syncache_expand() unchanged.
  */
 int
 toe_syncache_expand(struct in_conninfo *inc, struct tcpopt *to,
     struct tcphdr *th, struct socket **lsop)
 {
 	int rc;
 
 	NET_EPOCH_ASSERT();
 
 	rc = syncache_expand(inc, to, th, lsop, NULL, htons(0));
 	return (rc);
 }
 
 /*
  * General purpose check to see if a 4-tuple is in use by the kernel.  If a TCP
  * header (presumably for an incoming SYN) is also provided, an existing 4-tuple
  * in TIME_WAIT may be assassinated freeing it up for re-use.
  *
  * Note that the TCP header must have been run through tcp_fields_to_host() or
  * equivalent.
  *
  * Returns 0 when no inpcb matches the 4-tuple or when tcp_twcheck() returns
  * non-zero for a TIME_WAIT match; returns EADDRINUSE otherwise.
  */
 int
 toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	/* Look the tuple up in the address family recorded in inc. */
 	if (inc->inc_flags & INC_ISIPV6) {
 		inp = in6_pcblookup(&V_tcbinfo, &inc->inc6_faddr,
 		    inc->inc_fport, &inc->inc6_laddr, inc->inc_lport,
 		    INPLOOKUP_RLOCKPCB, ifp);
 	} else {
 		inp = in_pcblookup(&V_tcbinfo, inc->inc_faddr, inc->inc_fport,
 		    inc->inc_laddr, inc->inc_lport, INPLOOKUP_RLOCKPCB, ifp);
 	}
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 
 		tp = intotcpcb(inp);
 		if (tp->t_state == TCPS_TIME_WAIT && th != NULL) {
 			/*
 			 * NOTE(review): the inp is not unlocked on either
 			 * outcome of this branch; presumably tcp_twcheck()
 			 * releases it -- confirm.
 			 */
 			if (!tcp_twcheck(inp, NULL, th, NULL, 0))
 				return (EADDRINUSE);
 		} else {
 			INP_RUNLOCK(inp);
 			return (EADDRINUSE);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Handler registered for lle_event by toecore_load().  Forwards a link-layer
  * entry change to the TOE device of the entry's interface through its
  * tod_l2_update hook.
  *
  * Entries that are neither AF_INET nor AF_INET6, interfaces without the
  * matching IFCAP_TOE4/IFCAP_TOE6 capability enabled, and interfaces without
  * a TOE device are ignored.  For LLENTRY_RESOLVED the hook receives the
  * entry's link-layer address; for every other event it receives a NULL
  * lladdr.  The lle must be write-locked.
  */
 static void
 toe_lle_event(void *arg __unused, struct llentry *lle, int evt)
 {
 	struct toedev *tod;
 	struct ifnet *ifp;
 	struct sockaddr *sa;
 	uint8_t *lladdr;
 	uint16_t vid, pcp;
 	int family;
 	/* Backing storage for sa, used for both address families. */
 	struct sockaddr_in6 sin6;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	ifp = lltable_get_ifp(lle->lle_tbl);
 	family = lltable_get_af(lle->lle_tbl);
 
 	if (family != AF_INET && family != AF_INET6)
 		return;
 	/*
 	 * Not interested if the interface's TOE capability is not enabled.
 	 */
 	if ((family == AF_INET && !(ifp->if_capenable & IFCAP_TOE4)) ||
 	    (family == AF_INET6 && !(ifp->if_capenable & IFCAP_TOE6)))
 		return;
 
 	tod = TOEDEV(ifp);
 	if (tod == NULL)
 		return;
 
 	sa = (struct sockaddr *)&sin6;
 	lltable_fill_sa_entry(lle, sa);
 
 	/* Defaults used when no VLAN information is obtained below. */
 	vid = 0xfff;
 	pcp = 0;
 	if (evt != LLENTRY_RESOLVED) {
 		/*
 		 * LLENTRY_TIMEDOUT, LLENTRY_DELETED, LLENTRY_EXPIRED all mean
 		 * this entry is going to be deleted.
 		 */
 
 		lladdr = NULL;
 	} else {
 		KASSERT(lle->la_flags & LLE_VALID,
 		    ("%s: %p resolved but not valid?", __func__, lle));
 
 		lladdr = (uint8_t *)lle->ll_addr;
 		/*
 		 * NOTE(review): toe_l2_resolve() falls back to ifp->if_pcp
 		 * (with vid 0) for non-VLAN interfaces; this path does not,
 		 * so the two can report different tags for the same
 		 * interface.  Confirm whether that is intended.
 		 */
 		VLAN_TAG(ifp, &vid);
 		VLAN_PCP(ifp, &pcp);
 	}
 
 	tod->tod_l2_update(tod, ifp, sa, lladdr, EVL_MAKETAG(vid, pcp, 0));
 }
 
 /*
  * Returns 0 or EWOULDBLOCK on success (any other value is an error).  0 means
  * lladdr and vtag are valid on return, EWOULDBLOCK means the TOE driver's
  * tod_l2_update will be called later, when the entry is resolved or times out.
  */
 int
 toe_l2_resolve(struct toedev *tod, struct ifnet *ifp, struct sockaddr *sa,
     uint8_t *lladdr, uint16_t *vtag)
 {
 	uint16_t vid, pcp;
 	int error;
 
 	/* Resolve through ARP or ND6 depending on the address family. */
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		error = arpresolve(ifp, 0, NULL, sa, lladdr, NULL, NULL);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = nd6_resolve(ifp, LLE_SF(AF_INET6, 0), NULL, sa, lladdr,
 		    NULL, NULL);
 		break;
 #endif
 	default:
 		return (EPROTONOSUPPORT);
 	}
 
 	/* *vtag is only written when resolution succeeded immediately. */
 	if (error != 0)
 		return (error);
 
 	/*
 	 * A VLAN interface supplies both tag and priority; any other
 	 * interface with a PCP configured gets vid 0 plus that PCP.
 	 */
 	vid = 0xfff;
 	pcp = 0;
 	if (ifp->if_type == IFT_L2VLAN) {
 		VLAN_TAG(ifp, &vid);
 		VLAN_PCP(ifp, &pcp);
 	} else if (ifp->if_pcp != IFNET_PCP_NONE) {
 		vid = 0;
 		pcp = ifp->if_pcp;
 	}
 	*vtag = EVL_MAKETAG(vid, pcp, 0);
 
 	return (error);
 }
 
 /*
  * Handle a failed connect on an offloaded tcpcb.
  *
  * Must be called inside the network epoch with the inp write-locked; the
  * inp is write-locked again on return.  Nothing is done for an inp flagged
  * INP_DROPPED.  With err == EAGAIN the tcpcb is detached from the TOE
  * device, the keep timer is armed with TP_KEEPINIT() and tcp_output() is
  * called; any other err drops the connection through tcp_drop().
  */
 void
 toe_connect_failed(struct toedev *tod, struct inpcb *inp, int err)
 {
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		struct tcpcb *tp = intotcpcb(inp);
 
 		KASSERT(tp->t_flags & TF_TOE,
 		    ("%s: tp %p not offloaded.", __func__, tp));
 
 		if (err == EAGAIN) {
 			/*
 			 * Temporary failure during offload, take this PCB back.
 			 * Detach from the TOE driver and do the rest of what
 			 * TCP's pru_connect would have done if the connection
 			 * wasn't offloaded.
 			 */
 
 			tod->tod_pcb_detach(tod, tp);
 			KASSERT(!(tp->t_flags & TF_TOE),
 			    ("%s: tp %p still offloaded.", __func__, tp));
 			tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 			/*
 			 * NOTE(review): a negative return presumably means
 			 * tcp_output() dropped the inp lock -- confirm.
 			 */
 			if (tcp_output(tp) < 0)
 				INP_WLOCK(inp);	/* re-acquire */
 		} else {
 			/*
 			 * NOTE(review): a NULL return presumably means
 			 * tcp_drop() released the inp lock -- confirm.
 			 */
 			tp = tcp_drop(tp, err);
 			if (tp == NULL)
 				INP_WLOCK(inp);	/* re-acquire */
 		}
 	}
 	INP_WLOCK_ASSERT(inp);
 }
 
 /*
  * MOD_LOAD handler: initializes toedev_lock and toedev_list, then registers
  * the listen start/stop and lle_event handlers, saving their tags for
  * toecore_unload().  Always returns 0.
  */
 static int
 toecore_load(void)
 {
 
 	mtx_init(&toedev_lock, "toedev lock", NULL, MTX_DEF);
 	TAILQ_INIT(&toedev_list);
 
 	listen_start_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
 	    toe_listen_start_event, NULL, EVENTHANDLER_PRI_ANY);
 	listen_stop_eh = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
 	    toe_listen_stop_event, NULL, EVENTHANDLER_PRI_ANY);
 	lle_event_eh = EVENTHANDLER_REGISTER(lle_event, toe_lle_event, NULL,
 	    EVENTHANDLER_PRI_ANY);
 
 	return (0);
 }
 
 /*
  * MOD_UNLOAD handler: returns EBUSY while any TOE device is still on
  * toedev_list.  Otherwise deregisters the three event handlers, destroys
  * toedev_lock and returns 0.
  */
 static int
 toecore_unload(void)
 {
 
 	mtx_lock(&toedev_lock);
 	if (!TAILQ_EMPTY(&toedev_list)) {
 		mtx_unlock(&toedev_lock);
 		return (EBUSY);
 	}
 
 	/* Deregistration happens while toedev_lock is still held. */
 	EVENTHANDLER_DEREGISTER(tcp_offload_listen_start, listen_start_eh);
 	EVENTHANDLER_DEREGISTER(tcp_offload_listen_stop, listen_stop_eh);
 	EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
 
 	mtx_unlock(&toedev_lock);
 	mtx_destroy(&toedev_lock);
 
 	return (0);
 }
 
 /*
  * Module event handler: dispatches MOD_LOAD and MOD_UNLOAD to
  * toecore_load() and toecore_unload(); every other command yields
  * EOPNOTSUPP.
  */
 static int
 toecore_mod_handler(module_t mod, int cmd, void *arg)
 {
 
 	switch (cmd) {
 	case MOD_LOAD:
 		return (toecore_load());
 	case MOD_UNLOAD:
 		return (toecore_unload());
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /* Module description: name, event handler, and a zero third member. */
 static moduledata_t mod_data= {
 	"toecore",
 	toecore_mod_handler,
 	0
 };
 
 /* Declare the "toecore" module at version 1 and hook it into SI_SUB_EXEC. */
 MODULE_VERSION(toecore, 1);
 DECLARE_MODULE(toecore, mod_data, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c
index 9f12d4d691b6..d634f869acd5 100644
--- a/sys/netinet6/frag6.c
+++ b/sys/netinet6/frag6.c
@@ -1,1067 +1,1068 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  * Copyright (c) 2019 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: frag6.c,v 1.33 2002/01/07 11:34:48 kjc Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/icmp6.h>
 #include <netinet/in_systm.h>	/* For ECN definitions. */
 #include <netinet/ip.h>		/* For ECN definitions. */
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * A "big picture" of how IPv6 fragment queues are all linked together.
  *
  * struct ip6qbucket ip6qb[...];			hashed buckets
  * ||||||||
  * |
  * +--- TAILQ(struct ip6q, packets) *q6;		tailq entries holding
  *      ||||||||					fragmented packets
  *      |						(1 per original packet)
  *      |
  *      +--- TAILQ(struct ip6asfrag, ip6q_frags) *af6;	tailq entries of IPv6
  *           |                                   *ip6af;fragment packets
  *           |						for one original packet
  *           + *mbuf
  */
 
 /*
  * Reassembly headers are stored in hash buckets.  The bucket count is a
  * power of two so that a hash value can be reduced with IP6REASS_HMASK.
  */
 #define	IP6REASS_NHASH_LOG2	10
 #define	IP6REASS_NHASH		(1 << IP6REASS_NHASH_LOG2)
 #define	IP6REASS_HMASK		(IP6REASS_NHASH - 1)
 
 /* One hash bucket: a list of reassembly queues plus its own lock. */
 TAILQ_HEAD(ip6qhead, ip6q);
 struct ip6qbucket {
 	struct ip6qhead	packets;	/* Reassembly queues in this bucket. */
 	struct mtx	lock;		/* Taken via the IP6QB_*() macros. */
 	int		count;		/* Number of queues on 'packets'. */
 };
 
 /*
  * One received fragment, linked on the ip6q_frags list of the reassembly
  * queue (struct ip6q) it belongs to.
  */
 struct ip6asfrag {
 	TAILQ_ENTRY(ip6asfrag) ip6af_tq;
 	struct mbuf	*ip6af_m;	/* The mbuf chain holding the fragment. */
 	int		ip6af_offset;	/* Offset in ip6af_m to next header. */
 	int		ip6af_frglen;	/* Fragmentable part length. */
 	int		ip6af_off;	/* Fragment offset. */
 	bool		ip6af_mff;	/* More fragment bit in frag off. */
 };
 
 /* Malloc type used for both struct ip6q and struct ip6asfrag allocations. */
 static MALLOC_DEFINE(M_FRAG6, "frag6", "IPv6 fragment reassembly header");
 
 #ifdef VIMAGE
 /* A flag to indicate if IPv6 fragmentation is initialized. */
 VNET_DEFINE_STATIC(bool,		frag6_on);
 #define	V_frag6_on			VNET(frag6_on)
 #endif
 
 /*
  * System wide (global) maximum and current count of fragments held in
  * reassembly queues.
  */
 static int ip6_maxfrags;
 static u_int __exclusive_cache_line frag6_nfrags;
 
 /*
  * Maximum and current number of packets under reassembly (that is, of
  * reassembly queues) per VNET.
  */
 VNET_DEFINE_STATIC(int,			ip6_maxfragpackets);
 VNET_DEFINE_STATIC(volatile u_int,	frag6_nfragpackets);
 #define	V_ip6_maxfragpackets		VNET(ip6_maxfragpackets)
 #define	V_frag6_nfragpackets		VNET(frag6_nfragpackets)
 
 /* Maximum per-VNET reassembly queues per bucket and fragments per packet. */
 VNET_DEFINE_STATIC(int,			ip6_maxfragbucketsize);
 VNET_DEFINE_STATIC(int,			ip6_maxfragsperpacket);
 #define	V_ip6_maxfragbucketsize		VNET(ip6_maxfragbucketsize)
 #define	V_ip6_maxfragsperpacket		VNET(ip6_maxfragsperpacket)
 
 /* Per-VNET reassembly queue buckets and the seed used to hash into them. */
 VNET_DEFINE_STATIC(struct ip6qbucket,	ip6qb[IP6REASS_NHASH]);
 VNET_DEFINE_STATIC(uint32_t,		ip6qb_hashseed);
 #define	V_ip6qb				VNET(ip6qb)
 #define	V_ip6qb_hashseed		VNET(ip6qb_hashseed)
 
 /* Bucket lock helpers; '_b' is a bucket index into V_ip6qb[]. */
 #define	IP6QB_LOCK(_b)		mtx_lock(&V_ip6qb[(_b)].lock)
 #define	IP6QB_TRYLOCK(_b)	mtx_trylock(&V_ip6qb[(_b)].lock)
 #define	IP6QB_LOCK_ASSERT(_b)	mtx_assert(&V_ip6qb[(_b)].lock, MA_OWNED)
 #define	IP6QB_UNLOCK(_b)	mtx_unlock(&V_ip6qb[(_b)].lock)
 #define	IP6QB_HEAD(_b)		(&V_ip6qb[(_b)].packets)
 
 /*
  * By default, limit the number of IP6 fragments across all reassembly
  * queues to 1/32 of the total number of mbuf clusters.
  *
  * Limit the total number of reassembly queues per VNET to the
  * IP6 fragment limit, but ensure the limit will not allow any bucket
  * to grow above 100 items. (The bucket limit is
  * IP6_MAXFRAGPACKETS / (IP6REASS_NHASH / 2), so the 50 is the correct
  * multiplier to reach a 100-item limit.)
  * The 100-item limit was chosen as brief testing seems to show that
  * this produces "reasonable" performance on some subset of systems
  * under DoS attack.
  */
 #define	IP6_MAXFRAGS		(nmbclusters / 32)
 #define	IP6_MAXFRAGPACKETS	(imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))
 
 /*
  * Sysctls and helper function.
  */
 SYSCTL_DECL(_net_inet6_ip6);
 
 /* Read-only view of the global fragment counter. */
 SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfrags,
 	CTLFLAG_RD, &frag6_nfrags, 0,
 	"Global number of IPv6 fragments across all reassembly queues.");
 
 /*
  * Derive the per-bucket queue limit from the per-VNET packet limit.
  * Nothing is changed while the packet limit is zero or negative; otherwise
  * the bucket limit is the packet limit spread over half the buckets, with
  * a floor of one.
  */
 static void
 frag6_set_bucketsize(void)
 {
 	int maxpackets;
 
 	maxpackets = V_ip6_maxfragpackets;
 	if (maxpackets <= 0)
 		return;
 
 	V_ip6_maxfragbucketsize = imax(maxpackets / (IP6REASS_NHASH / 2), 1);
 }
 
 /* Global (not per-VNET) tunable bounding the total number of fragments. */
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
 	CTLFLAG_RW, &ip6_maxfrags, 0,
 	"Maximum allowed number of outstanding IPv6 packet fragments. "
 	"A value of 0 means no fragmented packets will be accepted, while "
 	"a value of -1 means no limit");
 
 /*
  * Sysctl handler for the per-VNET packet limit.  Reads report the current
  * value; a successful write stores the new value and recomputes the
  * per-bucket limit that depends on it.
  */
 static int
 sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
 {
 	int limit, rc;
 
 	limit = V_ip6_maxfragpackets;
 	rc = sysctl_handle_int(oidp, &limit, 0, req);
 	if (rc == 0 && req->newptr) {
 		V_ip6_maxfragpackets = limit;
 		frag6_set_bucketsize();
 	}
 	return (rc);
 }
 /*
  * Per-VNET limits and counters.  maxfragpackets goes through a handler so
  * that the per-bucket limit is recomputed whenever it is written.
  */
 SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	NULL, 0, sysctl_ip6_maxfragpackets, "I",
 	"Default maximum number of outstanding fragmented IPv6 packets. "
 	"A value of 0 means no fragmented packets will be accepted, while "
 	"a value of -1 means no limit");
 /*
  * NOTE(review): this counter is bumped once per reassembly queue created in
  * frag6_input(), so it counts packets under reassembly rather than
  * individual fragments; the description string is kept as it was.
  */
 SYSCTL_UINT(_net_inet6_ip6, OID_AUTO, frag6_nfragpackets,
 	CTLFLAG_VNET | CTLFLAG_RD,
 	__DEVOLATILE(u_int *, &VNET_NAME(frag6_nfragpackets)), 0,
 	"Per-VNET number of IPv6 fragments across all reassembly queues.");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
 	"Maximum allowed number of fragments per packet");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
 	"Maximum number of reassembly queues per hash bucket");
 
 /*
  * Strip the fragment header that sits 'offset' bytes into the first mbuf.
  * The bytes in front of the header are slid forward over it, the mbuf data
  * pointer and length are adjusted to match, and the mbuf is flagged
  * M_FRAGMENTED.  Always returns 0; 'wait' is unused.
  */
 int
 ip6_deletefraghdr(struct mbuf *m, int offset, int wait __unused)
 {
 	char *front;
 
 	KASSERT(m->m_len >= offset + sizeof(struct ip6_frag),
 	    ("%s: ext headers not contigous in mbuf %p m_len %d >= "
 	    "offset %d + %zu\n", __func__, m, m->m_len, offset,
 	    sizeof(struct ip6_frag)));
 
 	/* Source and destination overlap; bcopy() copes with that. */
 	front = (char *)mtod(m, struct ip6_hdr *);
 	bcopy(front, front + sizeof(struct ip6_frag), offset);
 
 	/* Drop the now unused leading bytes from the mbuf. */
 	m->m_data += sizeof(struct ip6_frag);
 	m->m_len -= sizeof(struct ip6_frag);
 	m->m_flags |= M_FRAGMENTED;
 
 	return (0);
 }
 
 /*
  * Tear down one reassembly queue: release every fragment queued on it,
  * unlink the queue from its bucket and free it, keeping the fragment and
  * packet counters in step.  The caller must hold the bucket lock.
  */
 static void
 frag6_freef(struct ip6q *q6, uint32_t bucket)
 {
 	struct ip6asfrag *frag;
 	struct ip6_hdr *hdr;
 	struct mbuf *pkt;
 
 	IP6QB_LOCK_ASSERT(bucket);
 
 	for (;;) {
 		frag = TAILQ_FIRST(&q6->ip6q_frags);
 		if (frag == NULL)
 			break;
 		pkt = frag->ip6af_m;
 		TAILQ_REMOVE(&q6->ip6q_frags, frag, ip6af_tq);
 
 		if (frag->ip6af_off != 0 || pkt->m_pkthdr.rcvif == NULL) {
 			/* Not the offset-zero fragment, or no interface. */
 			m_freem(pkt);
 		} else {
 			/*
 			 * Offset-zero fragment: put the saved addresses
 			 * back into the header and answer with an ICMPv6
 			 * "time exceeded in reassembly" error.
 			 */
 			hdr = mtod(pkt, struct ip6_hdr *);
 			hdr->ip6_src = q6->ip6q_src;
 			hdr->ip6_dst = q6->ip6q_dst;
 
 			icmp6_error(pkt, ICMP6_TIME_EXCEEDED,
 			    ICMP6_TIME_EXCEED_REASSEMBLY, 0);
 		}
 
 		free(frag, M_FRAG6);
 	}
 
 	/* Unlink the now empty queue and settle the accounting. */
 	TAILQ_REMOVE(IP6QB_HEAD(bucket), q6, ip6q_tq);
 	V_ip6qb[bucket].count--;
 	atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
 	mac_ip6q_destroy(q6);
 #endif
 	free(q6, M_FRAG6);
 	atomic_subtract_int(&V_frag6_nfragpackets, 1);
 }
 
 /*
  * ifnet departure hook.  Walks every reassembly queue of the interface's
  * VNET and clears the receive interface pointer of each queued fragment
  * that still refers to the departing interface.  The fragments themselves
  * stay queued.
  */
 static void
 frag6_cleanup(void *arg __unused, struct ifnet *ifp)
 {
 	struct ip6q *queue;
 	struct ip6asfrag *frag;
 	uint32_t b;
 
 	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 #ifdef VIMAGE
 	/*
 	 * Nothing to do when reassembly was never set up for this VNET or
 	 * has already been torn down by frag6_destroy().
 	 */
 	if (!V_frag6_on) {
 		CURVNET_RESTORE();
 		return;
 	}
 #endif
 
 	for (b = 0; b < IP6REASS_NHASH; b++) {
 		IP6QB_LOCK(b);
 		TAILQ_FOREACH(queue, IP6QB_HEAD(b), ip6q_tq) {
 			TAILQ_FOREACH(frag, &queue->ip6q_frags, ip6af_tq) {
 				/* Forget the pointer that is about to dangle. */
 				if (frag->ip6af_m->m_pkthdr.rcvif != ifp)
 					continue;
 				frag->ip6af_m->m_pkthdr.rcvif = NULL;
 			}
 		}
 		IP6QB_UNLOCK(b);
 	}
 	CURVNET_RESTORE();
 }
 EVENTHANDLER_DEFINE(ifnet_departure_event, frag6_cleanup, NULL, 0);
 
 /*
  * Like in RFC2460, in RFC8200, fragment and reassembly rules do not agree with
  * each other, in terms of next header field handling in fragment header.
  * While the sender will use the same value for all of the fragmented packets,
  * receiver is suggested not to check for consistency.
  *
  * Fragment rules (p18,p19):
  *	(2)  A Fragment header containing:
  *	The Next Header value that identifies the first header
  *	after the Per-Fragment headers of the original packet.
  *		-> next header field is same for all fragments
  *
  * Reassembly rule (p20):
  *	The Next Header field of the last header of the Per-Fragment
  *	headers is obtained from the Next Header field of the first
  *	fragment's Fragment header.
  *		-> should grab it from the first fragment only
  *
  * The following note also contradicts with fragment rule - no one is going to
  * send different fragment with different next header field.
  *
  * Additional note (p22) [not an error]:
  *	The Next Header values in the Fragment headers of different
  *	fragments of the same original packet may differ.  Only the value
  *	from the Offset zero fragment packet is used for reassembly.
  *		-> should grab it from the first fragment only
  *
  * There is no explicit reason given in the RFC.  Historical reason maybe?
  */
 /*
  * Fragment input.
  *
  * Called with *mp pointing at the packet and *offp at the offset of the
  * fragment header.  The fragment is validated and either handled as an
  * "atomic" fragment, queued on a (possibly new) reassembly queue, or
  * dropped.
  *
  * Returns IPPROTO_DONE with *mp set to NULL whenever the mbuf was consumed
  * (queued, dropped, answered with an ICMPv6 error, or, with RSS, handed to
  * netisr).  Otherwise returns the next header value with *mp set to the
  * packet to continue processing; after a completed reassembly *offp is
  * updated as well.
  */
 int
 frag6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m, *t;
 	struct ip6_hdr *ip6;
 	struct ip6_frag *ip6f;
 	struct ip6qhead *head;
 	struct ip6q *q6;
 	struct ip6asfrag *af6, *ip6af, *af6tmp;
 	struct in6_ifaddr *ia6;
 	struct ifnet *dstifp, *srcifp;
 	uint32_t hashkey[(sizeof(struct in6_addr) * 2 +
 		    sizeof(ip6f->ip6f_ident)) / sizeof(uint32_t)];
 	uint32_t bucket, *hashkeyp;
 	int fragoff, frgpartlen;	/* Must be larger than uint16_t. */
 	int nxt, offset, plen;
 	uint8_t ecn, ecn0;
 	bool only_frag;
 #ifdef RSS
 	struct ip6_direct_ctx *ip6dc;
 	struct m_tag *mtag;
 #endif
 
 	m = *mp;
 	offset = *offp;
 
 	M_ASSERTPKTHDR(m);
 
 	/* Make sure the fragment header is contiguous in the first mbuf. */
 	if (m->m_len < offset + sizeof(struct ip6_frag)) {
 		m = m_pullup(m, offset + sizeof(struct ip6_frag));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	dstifp = NULL;
 	/* Find the destination interface of the packet. */
 	ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
 	if (ia6 != NULL)
 		dstifp = ia6->ia_ifp;
 
 	/* Jumbo payload cannot contain a fragment header. */
 	if (ip6->ip6_plen == 0) {
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset);
 		in6_ifstat_inc(dstifp, ifs6_reass_fail);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Check whether fragment packet's fragment length is a
 	 * multiple of 8 octets (unless it is the last one).
 	 * sizeof(struct ip6_frag) == 8
 	 * sizeof(struct ip6_hdr) = 40
 	 */
 	ip6f = (struct ip6_frag *)((caddr_t)ip6 + offset);
 	if ((ip6f->ip6f_offlg & IP6F_MORE_FRAG) &&
 	    (((ntohs(ip6->ip6_plen) - offset) & 0x7) != 0)) {
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 		    offsetof(struct ip6_hdr, ip6_plen));
 		in6_ifstat_inc(dstifp, ifs6_reass_fail);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	IP6STAT_INC(ip6s_fragments);
 	in6_ifstat_inc(dstifp, ifs6_reass_reqd);
 
 	/*
 	 * Handle "atomic" fragments (offset and m bit set to 0) upfront,
 	 * unrelated to any reassembly.  We need to remove the frag hdr
 	 * which is ugly.
 	 * See RFC 6946 and section 4.5 of RFC 8200.
 	 */
 	if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
 		IP6STAT_INC(ip6s_atomicfrags);
 		nxt = ip6f->ip6f_nxt;
 		/*
 		 * Set nxt(-hdr field value) to the original value.
 		 * We cannot just set ip6->ip6_nxt as there might be
 		 * an unfragmentable part with extension headers and
 		 * we must update the last one.
 		 */
 		m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
 		    (caddr_t)&nxt);
 		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) -
 		    sizeof(struct ip6_frag));
 		if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0)
 			goto dropfrag2;
 		m->m_pkthdr.len -= sizeof(struct ip6_frag);
 		in6_ifstat_inc(dstifp, ifs6_reass_ok);
 		*mp = m;
 		return (nxt);
 	}
 
 	/* Offset now points to data portion. */
 	offset += sizeof(struct ip6_frag);
 
 	/* Get fragment length and discard 0-byte fragments. */
 	frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
 	if (frgpartlen == 0) {
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 		    offsetof(struct ip6_hdr, ip6_plen));
 		in6_ifstat_inc(dstifp, ifs6_reass_fail);
 		IP6STAT_INC(ip6s_fragdropped);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Enforce upper bound on number of fragments for the entire system.
 	 * If maxfrag is 0, never accept fragments.
 	 * If maxfrag is -1, accept all fragments without limitation.
 	 */
 	if (ip6_maxfrags < 0)
 		;
 	else if (atomic_load_int(&frag6_nfrags) >= (u_int)ip6_maxfrags)
 		goto dropfrag2;
 
 	/*
 	 * Validate that a full header chain to the ULP is present in the
 	 * packet containing the first fragment as per RFC 7112 and
 	 * RFC 8200 pages 18,19:
 	 * The first fragment packet is composed of:
 	 * (3)  Extension headers, if any, and the Upper-Layer header.  These
 	 *      headers must be in the first fragment.  ...
 	 */
 	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
 	/* XXX TODO.  thj has D16851 open for this. */
 	/* Send ICMPv6 4,3 in case of violation. */
 
 	/* Store receive network interface pointer for later. */
 	srcifp = m->m_pkthdr.rcvif;
 
 	/*
 	 * Generate a hash value for fragment bucket selection.  The key is
 	 * source address, destination address and fragment identification.
 	 */
 	hashkeyp = hashkey;
 	memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
 	hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
 	memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
 	hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
 	*hashkeyp = ip6f->ip6f_ident;
 	bucket = jenkins_hash32(hashkey, nitems(hashkey), V_ip6qb_hashseed);
 	bucket &= IP6REASS_HMASK;
 	IP6QB_LOCK(bucket);
 	head = IP6QB_HEAD(bucket);
 
 	/* Look for an existing reassembly queue for this packet. */
 	TAILQ_FOREACH(q6, head, ip6q_tq)
 		if (ip6f->ip6f_ident == q6->ip6q_ident &&
 		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
 		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
 #ifdef MAC
 		    && mac_ip6q_match(m, q6)
 #endif
 		    )
 			break;
 
 	only_frag = false;
 	if (q6 == NULL) {
 		/* A first fragment to arrive creates a reassembly queue. */
 		only_frag = true;
 
 		/*
 		 * Enforce upper bound on number of fragmented packets
 		 * for which we attempt reassembly;
 		 * If maxfragpackets is 0, never accept fragments.
 		 * If maxfragpackets is -1, accept all fragments without
 		 * limitation.
 		 */
 		if (V_ip6_maxfragpackets < 0)
 			;
 		else if (V_ip6qb[bucket].count >= V_ip6_maxfragbucketsize ||
 		    atomic_load_int(&V_frag6_nfragpackets) >=
 		    (u_int)V_ip6_maxfragpackets)
 			goto dropfrag;
 
 		/* Allocate IPv6 fragment packet queue entry. */
 		q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FRAG6,
 		    M_NOWAIT | M_ZERO);
 		if (q6 == NULL)
 			goto dropfrag;
 #ifdef MAC
 		if (mac_ip6q_init(q6, M_NOWAIT) != 0) {
 			free(q6, M_FRAG6);
 			goto dropfrag;
 		}
 		mac_ip6q_create(m, q6);
 #endif
 		atomic_add_int(&V_frag6_nfragpackets, 1);
 
 		/* ip6q_nxt will be filled afterwards, from 1st fragment. */
 		TAILQ_INIT(&q6->ip6q_frags);
 		q6->ip6q_ident	= ip6f->ip6f_ident;
 		q6->ip6q_ttl	= IPV6_FRAGTTL;
 		q6->ip6q_src	= ip6->ip6_src;
 		q6->ip6q_dst	= ip6->ip6_dst;
 		q6->ip6q_ecn	= IPV6_ECN(ip6);
 		q6->ip6q_unfrglen = -1;	/* The 1st fragment has not arrived. */
 
 		/* Add the fragmented packet to the bucket. */
 		TAILQ_INSERT_HEAD(head, q6, ip6q_tq);
 		V_ip6qb[bucket].count++;
 	}
 
 	/*
 	 * If it is the 1st fragment, record the length of the
 	 * unfragmentable part and the next header of the fragment header.
 	 * Assume the first 1st fragment to arrive will be correct.
 	 * Only the first offset-zero fragment is recorded: once
 	 * ip6q_unfrglen has been set, the test below keeps a later fragment
 	 * with fragoff == 0 from overwriting ip6q_unfrglen or the next
 	 * header.
 	 */
 	if (fragoff == 0 && q6->ip6q_unfrglen == -1) {
 		q6->ip6q_unfrglen = offset - sizeof(struct ip6_hdr) -
 		    sizeof(struct ip6_frag);
 		q6->ip6q_nxt = ip6f->ip6f_nxt;
 		/* XXX ECN? */
 	}
 
 	/*
 	 * Check that the reassembled packet would not exceed 65535 bytes
 	 * in size.
 	 * If it would exceed, discard the fragment and return an ICMP error.
 	 * A queue that was created just for this fragment is released again.
 	 */
 	if (q6->ip6q_unfrglen >= 0) {
 		/* The 1st fragment has already arrived. */
 		if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
 			if (only_frag) {
 				TAILQ_REMOVE(head, q6, ip6q_tq);
 				V_ip6qb[bucket].count--;
 				atomic_subtract_int(&V_frag6_nfragpackets, 1);
 #ifdef MAC
 				mac_ip6q_destroy(q6);
 #endif
 				free(q6, M_FRAG6);
 			}
 			IP6QB_UNLOCK(bucket);
 			icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 			    offset - sizeof(struct ip6_frag) +
 			    offsetof(struct ip6_frag, ip6f_offlg));
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 	} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
 		if (only_frag) {
 			TAILQ_REMOVE(head, q6, ip6q_tq);
 			V_ip6qb[bucket].count--;
 			atomic_subtract_int(&V_frag6_nfragpackets, 1);
 #ifdef MAC
 			mac_ip6q_destroy(q6);
 #endif
 			free(q6, M_FRAG6);
 		}
 		IP6QB_UNLOCK(bucket);
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 		    offset - sizeof(struct ip6_frag) +
 		    offsetof(struct ip6_frag, ip6f_offlg));
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * If it is the first fragment, do the above check for each
 	 * fragment already stored in the reassembly queue.
 	 */
 	if (fragoff == 0 && !only_frag) {
 		TAILQ_FOREACH_SAFE(af6, &q6->ip6q_frags, ip6af_tq, af6tmp) {
 			if (q6->ip6q_unfrglen + af6->ip6af_off +
 			    af6->ip6af_frglen > IPV6_MAXPACKET) {
 				struct ip6_hdr *ip6err;
 				struct mbuf *merr;
 				int erroff;
 
 				merr = af6->ip6af_m;
 				erroff = af6->ip6af_offset;
 
 				/* Dequeue the fragment. */
 				TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq);
 				q6->ip6q_nfrag--;
 				atomic_subtract_int(&frag6_nfrags, 1);
 				free(af6, M_FRAG6);
 
 				/* Set a valid receive interface pointer. */
 				merr->m_pkthdr.rcvif = srcifp;
 
 				/* Adjust pointer. */
 				ip6err = mtod(merr, struct ip6_hdr *);
 
 				/*
 				 * Restore source and destination addresses
 				 * in the erroneous IPv6 header.
 				 */
 				ip6err->ip6_src = q6->ip6q_src;
 				ip6err->ip6_dst = q6->ip6q_dst;
 
 				icmp6_error(merr, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    erroff - sizeof(struct ip6_frag) +
 				    offsetof(struct ip6_frag, ip6f_offlg));
 			}
 		}
 	}
 
 	/* Allocate an IPv6 fragment queue entry for this fragmented part. */
 	ip6af = (struct ip6asfrag *)malloc(sizeof(struct ip6asfrag), M_FRAG6,
 	    M_NOWAIT | M_ZERO);
 	if (ip6af == NULL)
 		goto dropfrag;
 	ip6af->ip6af_mff = (ip6f->ip6f_offlg & IP6F_MORE_FRAG) ? true : false;
 	ip6af->ip6af_off = fragoff;
 	ip6af->ip6af_frglen = frgpartlen;
 	ip6af->ip6af_offset = offset;
 	ip6af->ip6af_m = m;
 
 	if (only_frag) {
 		/*
 		 * Do a manual insert rather than a hard-to-understand cast
 		 * to a different type relying on data structure order to work.
 		 */
 		TAILQ_INSERT_HEAD(&q6->ip6q_frags, ip6af, ip6af_tq);
 		goto postinsert;
 	}
 
 	/* Do duplicate, condition, and boundary checks. */
 	/*
 	 * Handle ECN by comparing this segment with the first one;
 	 * if CE is set, do not lose CE.
 	 * Drop if CE and not-ECT are mixed for the same packet.
 	 */
 	ecn = IPV6_ECN(ip6);
 	ecn0 = q6->ip6q_ecn;
 	if (ecn == IPTOS_ECN_CE) {
 		if (ecn0 == IPTOS_ECN_NOTECT) {
 			free(ip6af, M_FRAG6);
 			goto dropfrag;
 		}
 		if (ecn0 != IPTOS_ECN_CE)
 			q6->ip6q_ecn = IPTOS_ECN_CE;
 	}
 	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
 		free(ip6af, M_FRAG6);
 		goto dropfrag;
 	}
 
 	/* Find a fragmented part which begins after this one does. */
 	TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq)
 		if (af6->ip6af_off > ip6af->ip6af_off)
 			break;
 
 	/*
 	 * If the incoming fragment overlaps some existing fragments in
 	 * the reassembly queue, drop both the new fragment and the
 	 * entire reassembly queue.  However, if the new fragment
 	 * is an exact duplicate of an existing fragment, only silently
 	 * drop the new fragment and leave the fragmentation queue
 	 * unchanged, as allowed by the RFC.  (RFC 8200, 4.5)
 	 */
 	if (af6 != NULL)
 		af6tmp = TAILQ_PREV(af6, ip6fraghead, ip6af_tq);
 	else
 		af6tmp = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead);
 	/* Overlap with the fragment that starts at or before this one. */
 	if (af6tmp != NULL) {
 		if (af6tmp->ip6af_off + af6tmp->ip6af_frglen -
 		    ip6af->ip6af_off > 0) {
 			if (af6tmp->ip6af_off != ip6af->ip6af_off ||
 			    af6tmp->ip6af_frglen != ip6af->ip6af_frglen)
 				frag6_freef(q6, bucket);
 			free(ip6af, M_FRAG6);
 			goto dropfrag;
 		}
 	}
 	/* Overlap with the fragment that starts after this one. */
 	if (af6 != NULL) {
 		if (ip6af->ip6af_off + ip6af->ip6af_frglen -
 		    af6->ip6af_off > 0) {
 			if (af6->ip6af_off != ip6af->ip6af_off ||
 			    af6->ip6af_frglen != ip6af->ip6af_frglen)
 				frag6_freef(q6, bucket);
 			free(ip6af, M_FRAG6);
 			goto dropfrag;
 		}
 	}
 
 #ifdef MAC
 	mac_ip6q_update(m, q6);
 #endif
 
 	/*
 	 * Stick new segment in its place; check for complete reassembly.
 	 * If not complete, check fragment limit.  Move to front of packet
 	 * queue, as we are the most recently active fragmented packet.
 	 */
 	if (af6 != NULL)
 		TAILQ_INSERT_BEFORE(af6, ip6af, ip6af_tq);
 	else
 		TAILQ_INSERT_TAIL(&q6->ip6q_frags, ip6af, ip6af_tq);
 postinsert:
 	atomic_add_int(&frag6_nfrags, 1);
 	q6->ip6q_nfrag++;
 
 	/*
 	 * Walk the fragments in offset order.  A fragment that does not
 	 * start where the previous one ended means there is still a hole,
 	 * so the packet is not complete yet.
 	 */
 	plen = 0;
 	TAILQ_FOREACH(af6, &q6->ip6q_frags, ip6af_tq) {
 		if (af6->ip6af_off != plen) {
 			if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
 				IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag);
 				frag6_freef(q6, bucket);
 			}
 			IP6QB_UNLOCK(bucket);
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 		plen += af6->ip6af_frglen;
 	}
 	/* No holes; still incomplete if the last fragment has "more" set. */
 	af6 = TAILQ_LAST(&q6->ip6q_frags, ip6fraghead);
 	if (af6->ip6af_mff) {
 		if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
 			IP6STAT_ADD(ip6s_fragdropped, q6->ip6q_nfrag);
 			frag6_freef(q6, bucket);
 		}
 		IP6QB_UNLOCK(bucket);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	/* Reassembly is complete; concatenate fragments. */
 	ip6af = TAILQ_FIRST(&q6->ip6q_frags);
 	t = m = ip6af->ip6af_m;
 	TAILQ_REMOVE(&q6->ip6q_frags, ip6af, ip6af_tq);
 	while ((af6 = TAILQ_FIRST(&q6->ip6q_frags)) != NULL) {
 		/* Keep only checksum flags common to all fragments. */
 		m->m_pkthdr.csum_flags &=
 		    af6->ip6af_m->m_pkthdr.csum_flags;
 		m->m_pkthdr.csum_data +=
 		    af6->ip6af_m->m_pkthdr.csum_data;
 
 		TAILQ_REMOVE(&q6->ip6q_frags, af6, ip6af_tq);
 		t = m_last(t);
 		/* Trim this fragment's headers before appending its data. */
 		m_adj(af6->ip6af_m, af6->ip6af_offset);
 		m_demote_pkthdr(af6->ip6af_m);
 		m_cat(t, af6->ip6af_m);
 		free(af6, M_FRAG6);
 	}
 
 	/* Fold the accumulated checksum data back into 16 bits. */
 	while (m->m_pkthdr.csum_data & 0xffff0000)
 		m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
 		    (m->m_pkthdr.csum_data >> 16);
 
 	/* Adjust offset to point where the original next header starts. */
 	offset = ip6af->ip6af_offset - sizeof(struct ip6_frag);
 	free(ip6af, M_FRAG6);
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_plen = htons((u_short)plen + offset - sizeof(struct ip6_hdr));
 	if (q6->ip6q_ecn == IPTOS_ECN_CE)
 		ip6->ip6_flow |= htonl(IPTOS_ECN_CE << 20);
 	nxt = q6->ip6q_nxt;
 
 	/* The queue is empty now; unlink it and settle the accounting. */
 	TAILQ_REMOVE(head, q6, ip6q_tq);
 	V_ip6qb[bucket].count--;
 	atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 
 	ip6_deletefraghdr(m, offset, M_NOWAIT);
 
 	/* Set nxt(-hdr field value) to the original value. */
 	m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
 	    (caddr_t)&nxt);
 
 #ifdef MAC
 	mac_ip6q_reassemble(q6, m);
 	mac_ip6q_destroy(q6);
 #endif
 	free(q6, M_FRAG6);
 	atomic_subtract_int(&V_frag6_nfragpackets, 1);
 
 	if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
 
 		/* Recompute the total packet length over the whole chain. */
 		plen = 0;
 		for (t = m; t; t = t->m_next)
 			plen += t->m_len;
 		m->m_pkthdr.len = plen;
 		/* Set a valid receive interface pointer. */
 		m->m_pkthdr.rcvif = srcifp;
 	}
 
 #ifdef RSS
 	/*
 	 * Attach the next header value and offset as an mbuf tag for the
 	 * direct-dispatch path.  On allocation failure the reassembled
 	 * packet is dropped; the bucket lock is still held here.
 	 */
 	mtag = m_tag_alloc(MTAG_ABI_IPV6, IPV6_TAG_DIRECT, sizeof(*ip6dc),
 	    M_NOWAIT);
 	if (mtag == NULL)
 		goto dropfrag;
 
 	ip6dc = (struct ip6_direct_ctx *)(mtag + 1);
 	ip6dc->ip6dc_nxt = nxt;
 	ip6dc->ip6dc_off = offset;
 
 	m_tag_prepend(m, mtag);
 #endif
 
 	IP6QB_UNLOCK(bucket);
 	IP6STAT_INC(ip6s_reassembled);
 	in6_ifstat_inc(dstifp, ifs6_reass_ok);
 
 #ifdef RSS
 	/* Queue/dispatch for reprocessing. */
 	netisr_dispatch(NETISR_IPV6_DIRECT, m);
 	*mp = NULL;
 	return (IPPROTO_DONE);
 #endif
 
 	/* Tell launch routine the next header. */
 	*mp = m;
 	*offp = offset;
 
 	return (nxt);
 
 	/* dropfrag: bucket lock held.  dropfrag2: no bucket lock held. */
 dropfrag:
 	IP6QB_UNLOCK(bucket);
 dropfrag2:
 	in6_ifstat_inc(dstifp, ifs6_reass_fail);
 	IP6STAT_INC(ip6s_fragdropped);
 	m_freem(m);
 	*mp = NULL;
 	return (IPPROTO_DONE);
 }
 
 /*
  * IPv6 reassembling timer processing;
  * if a timer expires on a reassembly queue, discard it.
  *
  * Runs from frag6_callout and re-arms itself on every invocation.  For
  * each VNET it ages all reassembly queues and then trims queues until the
  * per-bucket and per-VNET limits are respected again.
  */
 static struct callout frag6_callout;
 static void
 frag6_slowtimo(void *arg __unused)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 	struct ip6qhead *head;
 	struct ip6q *q6, *q6tmp;
 	uint32_t bucket;
 
 	/* Nothing is queued anywhere; just re-arm the callout. */
 	if (atomic_load_int(&frag6_nfrags) == 0)
 		goto done;
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
 			/* Unlocked peek; empty buckets are skipped. */
 			if (V_ip6qb[bucket].count == 0)
 				continue;
 			IP6QB_LOCK(bucket);
 			head = IP6QB_HEAD(bucket);
 			/* Age every queue and free those that expired. */
 			TAILQ_FOREACH_SAFE(q6, head, ip6q_tq, q6tmp)
 				if (--q6->ip6q_ttl == 0) {
 					IP6STAT_ADD(ip6s_fragtimeout,
 						q6->ip6q_nfrag);
 					/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 					frag6_freef(q6, bucket);
 				}
 			/*
 			 * If we are over the maximum number of fragments
 			 * (due to the limit being lowered), drain off
 			 * enough to get down to the new limit.
 			 * Note that we drain all reassembly queues if
 			 * maxfragpackets is 0 (fragmentation is disabled),
 			 * and do not enforce a limit when maxfragpackets
 			 * is negative.
 			 */
 			while ((V_ip6_maxfragpackets == 0 ||
 			    (V_ip6_maxfragpackets > 0 &&
 			    V_ip6qb[bucket].count > V_ip6_maxfragbucketsize)) &&
 			    (q6 = TAILQ_LAST(head, ip6qhead)) != NULL) {
 				IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag);
 				/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 				frag6_freef(q6, bucket);
 			}
 			IP6QB_UNLOCK(bucket);
 		}
 		/*
 		 * If we are still over the maximum number of fragmented
 		 * packets, drain off enough to get down to the new limit.
 		 * Buckets are visited round-robin, freeing the queue at the
 		 * tail of each.
 		 */
 		bucket = 0;
 		while (V_ip6_maxfragpackets >= 0 &&
 		    atomic_load_int(&V_frag6_nfragpackets) >
 		    (u_int)V_ip6_maxfragpackets) {
 			IP6QB_LOCK(bucket);
 			q6 = TAILQ_LAST(IP6QB_HEAD(bucket), ip6qhead);
 			if (q6 != NULL) {
 				IP6STAT_ADD(ip6s_fragoverflow, q6->ip6q_nfrag);
 				/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 				frag6_freef(q6, bucket);
 			}
 			IP6QB_UNLOCK(bucket);
 			bucket = (bucket + 1) % IP6REASS_NHASH;
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 done:
 	/* Schedule the next run (SBT_1MS * 500 from now). */
 	callout_reset_sbt(&frag6_callout, SBT_1MS * 500, SBT_1MS * 10,
 	    frag6_slowtimo, NULL, 0);
 }
 
 /*
  * SYSINIT hook: initialise the reassembly callout and schedule the first
  * frag6_slowtimo() run.
  */
 static void
 frag6_slowtimo_init(void *arg __unused)
 {
 
 	callout_init(&frag6_callout, 1);
 	callout_reset_sbt(&frag6_callout, SBT_1MS * 500, SBT_1MS * 10,
 	    frag6_slowtimo, NULL, 0);
 }
 SYSINIT(frag6, SI_SUB_VNET_DONE, SI_ORDER_ANY, frag6_slowtimo_init, NULL);
 
 /*
  * Eventhandler to adjust limits in case nmbclusters change.
  *
  * Recomputes the global fragment limit and, for every VNET, the packet
  * limit and the per-bucket limit derived from it.  'tag' is not used.
  */
 static void
 frag6_change(void *tag)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	ip6_maxfrags = IP6_MAXFRAGS;
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
 		frag6_set_bucketsize();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
  * Per-VNET setup of the reassembly state: default limits, the hash
  * buckets with their locks, and the hash seed.  The default VNET also
  * sets the global fragment limit and subscribes to nmbclusters changes.
  */
 void
 frag6_init(void)
 {
 	uint32_t b;
 
 	V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
 	frag6_set_bucketsize();
 
 	/* Start with every bucket empty and its lock initialised. */
 	for (b = 0; b < IP6REASS_NHASH; b++) {
 		TAILQ_INIT(IP6QB_HEAD(b));
 		mtx_init(&V_ip6qb[b].lock, "ip6qb", NULL, MTX_DEF);
 		V_ip6qb[b].count = 0;
 	}
 	V_ip6qb_hashseed = arc4random();
 	V_ip6_maxfragsperpacket = 64;
 #ifdef VIMAGE
 	V_frag6_on = true;
 #endif
 
 	/* Global state is set up exactly once, by the default VNET. */
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ip6_maxfrags = IP6_MAXFRAGS;
 		EVENTHANDLER_REGISTER(nmbclusters_change,
 		    frag6_change, NULL, EVENTHANDLER_PRI_ANY);
 	}
 }
 
 /*
  * Free every reassembly queue of the current VNET, one bucket at a time,
  * counting each freed queue as a dropped fragment in the statistics.
  */
 static void
 frag6_drain_one(void)
 {
 	struct ip6q *victim;
 	uint32_t b;
 
 	for (b = 0; b < IP6REASS_NHASH; b++) {
 		IP6QB_LOCK(b);
 		for (;;) {
 			victim = TAILQ_FIRST(IP6QB_HEAD(b));
 			if (victim == NULL)
 				break;
 			IP6STAT_INC(ip6s_fragdropped);
 			/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
 			frag6_freef(victim, b);
 		}
 		IP6QB_UNLOCK(b);
 	}
 }
 
 /*
  * Drain the reassembly queues of every VNET by running frag6_drain_one()
  * in each VNET's context.
  */
 void
 frag6_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		frag6_drain_one();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 #ifdef VIMAGE
 /*
  * Clear up IPv6 reassembly structures.
  *
  * Drains the current VNET's queues, clears V_frag6_on so that
  * frag6_cleanup() skips this VNET from now on, and destroys the bucket
  * locks.
  */
 void
 frag6_destroy(void)
 {
 	uint32_t bucket;
 
 	frag6_drain_one();
 	V_frag6_on = false;
 	for (bucket = 0; bucket < IP6REASS_NHASH; bucket++) {
 		/* Every bucket must be empty after the drain above. */
 		KASSERT(V_ip6qb[bucket].count == 0,
 		    ("%s: V_ip6qb[%d] (%p) count not 0 (%d)", __func__,
 		    bucket, &V_ip6qb[bucket], V_ip6qb[bucket].count));
 		mtx_destroy(&V_ip6qb[bucket].lock);
 	}
 }
 #endif
diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c
index 2cd54abbb76e..5c94a0c56be1 100644
--- a/sys/netinet6/icmp6.c
+++ b/sys/netinet6/icmp6.c
@@ -1,2745 +1,2746 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: icmp6.c,v 1.211 2001/04/04 05:56:20 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_icmp.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define	MBUF_PRIVATE	/* XXXRW: Optimisation tries to avoid M_EXT mbufs */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/send.h>
 
 extern ip6proto_ctlinput_t	*ip6_ctlprotox[];
 
 VNET_PCPUSTAT_DEFINE(struct icmp6stat, icmp6stat);
 VNET_PCPUSTAT_SYSINIT(icmp6stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(icmp6stat);
 #endif /* VIMAGE */
 
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 VNET_DECLARE(int, icmp6errppslim);
 VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0;
 VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last);
 VNET_DECLARE(int, icmp6_nodeinfo);
 
 #define	V_ripcbinfo			VNET(ripcbinfo)
 #define	V_icmp6errppslim		VNET(icmp6errppslim)
 #define	V_icmp6errpps_count		VNET(icmp6errpps_count)
 #define	V_icmp6errppslim_last		VNET(icmp6errppslim_last)
 #define	V_icmp6_nodeinfo		VNET(icmp6_nodeinfo)
 
 static void icmp6_errcount(int, int);
 static int icmp6_rip6_input(struct mbuf **, int);
 static void icmp6_reflect(struct mbuf *, size_t);
 static const char *icmp6_redirect_diag(struct in6_addr *,
 	struct in6_addr *, struct in6_addr *);
 static struct mbuf *ni6_input(struct mbuf *, int, struct prison *);
 static struct mbuf *ni6_nametodns(const char *, int, int);
 static int ni6_dnsmatch(const char *, int, const char *, int);
 static int ni6_addrs(struct icmp6_nodeinfo *, struct mbuf *,
 			  struct ifnet **, struct in6_addr *);
 static int ni6_store_addrs(struct icmp6_nodeinfo *, struct icmp6_nodeinfo *,
 				struct ifnet *, int);
 static int icmp6_notify_error(struct mbuf **, int, int);
 
 /*
  * Kernel module interface for updating icmp6stat.  The argument is an index
  * into icmp6stat treated as an array of u_quad_t.  While this encodes the
  * general layout of icmp6stat into the caller, it doesn't encode its
  * location, so that future changes to add, for example, per-CPU stats
  * support won't cause binary compatibility problems for kernel modules.
  *
  * The selected counter is incremented by one.  statnum is used as an
  * array index as-is; no range check is performed here.
  */
 void
 kmod_icmp6stat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(icmp6stat)[statnum], 1);
 }
 
 /*
  * Bump the output-error counter that matches an ICMPv6 type/code pair.
  * A pair with a dedicated counter increments it and returns; anything
  * else (unknown type, or unknown code for a known type) is charged to
  * icp6s_ounknown.
  */
 static void
 icmp6_errcount(int type, int code)
 {
 	if (type == ICMP6_DST_UNREACH) {
 		if (code == ICMP6_DST_UNREACH_NOROUTE) {
 			ICMP6STAT_INC(icp6s_odst_unreach_noroute);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_ADMIN) {
 			ICMP6STAT_INC(icp6s_odst_unreach_admin);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_BEYONDSCOPE) {
 			ICMP6STAT_INC(icp6s_odst_unreach_beyondscope);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_ADDR) {
 			ICMP6STAT_INC(icp6s_odst_unreach_addr);
 			return;
 		}
 		if (code == ICMP6_DST_UNREACH_NOPORT) {
 			ICMP6STAT_INC(icp6s_odst_unreach_noport);
 			return;
 		}
 	} else if (type == ICMP6_PACKET_TOO_BIG) {
 		ICMP6STAT_INC(icp6s_opacket_too_big);
 		return;
 	} else if (type == ICMP6_TIME_EXCEEDED) {
 		if (code == ICMP6_TIME_EXCEED_TRANSIT) {
 			ICMP6STAT_INC(icp6s_otime_exceed_transit);
 			return;
 		}
 		if (code == ICMP6_TIME_EXCEED_REASSEMBLY) {
 			ICMP6STAT_INC(icp6s_otime_exceed_reassembly);
 			return;
 		}
 	} else if (type == ICMP6_PARAM_PROB) {
 		if (code == ICMP6_PARAMPROB_HEADER) {
 			ICMP6STAT_INC(icp6s_oparamprob_header);
 			return;
 		}
 		if (code == ICMP6_PARAMPROB_NEXTHEADER) {
 			ICMP6STAT_INC(icp6s_oparamprob_nextheader);
 			return;
 		}
 		if (code == ICMP6_PARAMPROB_OPTION) {
 			ICMP6STAT_INC(icp6s_oparamprob_option);
 			return;
 		}
 	} else if (type == ND_REDIRECT) {
 		ICMP6STAT_INC(icp6s_oredirect);
 		return;
 	}
 
 	/* No dedicated counter matched. */
 	ICMP6STAT_INC(icp6s_ounknown);
 }
 
 /*
  * Wrapper for icmp6_error() used when the offending packet may not carry
  * enough scope zone information.  in6_setscope() is applied, with ifp, to
  * the source and destination addresses of the packet's IPv6 header, and
  * the packet is then handed to icmp6_error().
  *
  * Nothing is generated when ifp is NULL, when the IPv6 header cannot be
  * pulled up (counted in ip6s_exthdrtoolong), or when either
  * in6_setscope() call fails.
  * NOTE(review): on the ifp == NULL and in6_setscope() failure paths m is
  * returned from without being freed here -- confirm mbuf ownership with
  * the callers.
  */
 void
 icmp6_error2(struct mbuf *m, int type, int code, int param,
     struct ifnet *ifp)
 {
 	struct ip6_hdr *hdr;
 
 	if (ifp == NULL)
 		return;
 
 	/* The IPv6 header must be contiguous before it is edited. */
 	if (m->m_len < sizeof(struct ip6_hdr) &&
 	    (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 		IP6STAT_INC(ip6s_exthdrtoolong);
 		return;
 	}
 	hdr = mtod(m, struct ip6_hdr *);
 
 	if (in6_setscope(&hdr->ip6_src, ifp, NULL) != 0 ||
 	    in6_setscope(&hdr->ip6_dst, ifp, NULL) != 0)
 		return;
 
 	icmp6_error(m, type, code, param);
 }
 
 /*
  * Generate an error packet of type error in response to bad IP6 packet.
  *
  * m is the offending packet.  On every path it is either freed (freeit),
  * lost to a failed m_pullup()/M_PREPEND, or handed to icmp6_reflect().
  * type and code become the ICMPv6 type/code of the error; param is stored
  * in the icmp6_pptr field in network byte order.
  *
  * The error is suppressed (and the packet freed) when the packet is
  * flagged M_DECRYPTED, when it was multicast/broadcast (with the two
  * exceptions described below), when its source is unspecified or
  * multicast, when it is itself an ICMPv6 error or redirect, or when
  * icmp6_ratelimit() says so.
  */
 void
 icmp6_error(struct mbuf *m, int type, int code, int param)
 {
 	struct ip6_hdr *oip6, *nip6;
 	struct icmp6_hdr *icmp6;
 	struct epoch_tracker et;
 	u_int preplen;
 	int off;
 	int nxt;
 
 	ICMP6STAT_INC(icp6s_error);
 
 	/* count per-type-code statistics */
 	icmp6_errcount(type, code);
 
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m->m_flags & M_DECRYPTED) {
 		ICMP6STAT_INC(icp6s_canterror);
 		goto freeit;
 	}
 #endif
 
 	/* Make the IPv6 header contiguous so it can be read via mtod(). */
 	if (m->m_len < sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	oip6 = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * If the destination address of the erroneous packet is a multicast
 	 * address, or the packet was sent using link-layer multicast,
 	 * we should basically suppress sending an error (RFC 2463, Section
 	 * 2.4).
 	 * We have two exceptions (the item e.2 in that section):
 	 * - the Packet Too Big message can be sent for path MTU discovery.
 	 * - the Parameter Problem Message that can be allowed an icmp6 error
 	 *   in the option type field.  This check has been done in
 	 *   ip6_unknown_opt(), so we can just check the type and code.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST) ||
 	     IN6_IS_ADDR_MULTICAST(&oip6->ip6_dst)) &&
 	    (type != ICMP6_PACKET_TOO_BIG &&
 	     (type != ICMP6_PARAM_PROB ||
 	      code != ICMP6_PARAMPROB_OPTION)))
 		goto freeit;
 
 	/*
 	 * RFC 2463, 2.4 (e.5): source address check.
 	 * XXX: the case of anycast source?
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&oip6->ip6_src) ||
 	    IN6_IS_ADDR_MULTICAST(&oip6->ip6_src))
 		goto freeit;
 
 	/*
 	 * If we are about to send ICMPv6 against ICMPv6 error/redirect,
 	 * don't do it.
 	 */
 	nxt = -1;
 	off = ip6_lasthdr(m, 0, IPPROTO_IPV6, &nxt);
 	if (off >= 0 && nxt == IPPROTO_ICMPV6) {
 		struct icmp6_hdr *icp;
 
 		if (m->m_len < off + sizeof(struct icmp6_hdr)) {
 			m = m_pullup(m, off + sizeof(struct icmp6_hdr));
 			if (m == NULL) {
 				IP6STAT_INC(ip6s_exthdrtoolong);
 				return;
 			}
 		}
 		/* Re-derive both header pointers after the possible pullup. */
 		oip6 = mtod(m, struct ip6_hdr *);
 		icp = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 
 		if (icp->icmp6_type < ICMP6_ECHO_REQUEST ||
 		    icp->icmp6_type == ND_REDIRECT) {
 			/*
 			 * ICMPv6 error
 			 * Special case: for redirect (which is
 			 * informational) we must not send icmp6 error.
 			 */
 			ICMP6STAT_INC(icp6s_canterror);
 			goto freeit;
 		} else {
 			/* ICMPv6 informational - send the error */
 		}
 	} else {
 		/* non-ICMPv6 - send the error */
 	}
 
 	/* Finally, do rate limitation check. */
 	if (icmp6_ratelimit(&oip6->ip6_src, type, code)) {
 		ICMP6STAT_INC(icp6s_toofreq);
 		goto freeit;
 	}
 
 	/*
 	 * OK, ICMP6 can be generated.
 	 */
 
 	/*
 	 * Cap the quoted packet at ICMPV6_PLD_MAXLEN.  The length passed to
 	 * m_adj() is zero or negative here (a negative length trims from
 	 * the tail, see mbuf(9)).
 	 */
 	if (m->m_pkthdr.len >= ICMPV6_PLD_MAXLEN)
 		m_adj(m, ICMPV6_PLD_MAXLEN - m->m_pkthdr.len);
 
 	/* Make room in front for the new IPv6 and ICMPv6 headers. */
 	preplen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 	M_PREPEND(m, preplen, M_NOWAIT);	/* FIB is also copied over. */
 	if (m == NULL) {
 		nd6log((LOG_DEBUG, "ENOBUFS in icmp6_error %d\n", __LINE__));
 		return;
 	}
 
 	/*
 	 * Seed the new header's addresses from the offending header; oip6
 	 * still holds the pointer computed before the prepend.
 	 */
 	nip6 = mtod(m, struct ip6_hdr *);
 	nip6->ip6_src  = oip6->ip6_src;
 	nip6->ip6_dst  = oip6->ip6_dst;
 
 	/* in6_clearscope() is applied to the quoted header's addresses. */
 	in6_clearscope(&oip6->ip6_src);
 	in6_clearscope(&oip6->ip6_dst);
 
 	/* The ICMPv6 header sits directly after the new IPv6 header. */
 	icmp6 = (struct icmp6_hdr *)(nip6 + 1);
 	icmp6->icmp6_type = type;
 	icmp6->icmp6_code = code;
 	icmp6->icmp6_pptr = htonl((u_int32_t)param);
 
 	ICMP6STAT_INC(icp6s_outhist[type]);
 	/* icmp6_reflect() is invoked inside a network epoch section. */
 	NET_EPOCH_ENTER(et);
 	icmp6_reflect(m, sizeof(struct ip6_hdr)); /* header order: IPv6 - ICMPv6 */
 	NET_EPOCH_EXIT(et);
 
 	return;
 
   freeit:
 	/*
 	 * If we can't tell whether or not we can generate ICMP6, free it.
 	 */
 	m_freem(m);
 }
 
 /*
  * Translate the type/code of an ICMPv6 error header into an errno value.
  * Returns 0 for any type or code that has no mapping, which includes
  * ICMP6_TIME_EXCEED_REASSEMBLY.
  */
 int
 icmp6_errmap(const struct icmp6_hdr *icmp6)
 {
 	const int type = icmp6->icmp6_type;
 	const int code = icmp6->icmp6_code;
 	int error = 0;
 
 	if (type == ICMP6_DST_UNREACH) {
 		if (code == ICMP6_DST_UNREACH_NOROUTE ||
 		    code == ICMP6_DST_UNREACH_ADDR)
 			error = EHOSTUNREACH;
 		else if (code == ICMP6_DST_UNREACH_NOPORT ||
 		    code == ICMP6_DST_UNREACH_ADMIN)
 			error = ECONNREFUSED;
 		else if (code == ICMP6_DST_UNREACH_BEYONDSCOPE)
 			error = ENOPROTOOPT;
 		/* Any other code shouldn't happen; it maps to 0. */
 	} else if (type == ICMP6_PACKET_TOO_BIG) {
 		error = EMSGSIZE;
 	} else if (type == ICMP6_TIME_EXCEEDED) {
 		/* Reassembly timeouts and unknown codes stay at 0. */
 		if (code == ICMP6_TIME_EXCEED_TRANSIT)
 			error = EHOSTUNREACH;
 	} else if (type == ICMP6_PARAM_PROB) {
 		if (code == ICMP6_PARAMPROB_NEXTHEADER)
 			error = ECONNREFUSED;
 		else if (code == ICMP6_PARAMPROB_HEADER ||
 		    code == ICMP6_PARAMPROB_OPTION)
 			error = ENOPROTOOPT;
 		/* Any other code shouldn't happen; it maps to 0. */
 	}
 
 	return (error);
 }
 
 /*
  * Process a received ICMP6 message.
  *
  * *mp is the packet and *offp the offset of the ICMPv6 header within it;
  * proto is not referenced in this function.  The function always returns
  * IPPROTO_DONE.  On return *mp holds whatever icmp6_rip6_input() left in
  * m, or NULL on the paths that drop the packet.
  *
  * After the length, multicast-membership and checksum checks, the message
  * is dispatched on its type.  Error messages go through
  * icmp6_notify_error() (label "deliver"); echo and node-information
  * requests are answered through icmp6_reflect(); MLD and neighbor
  * discovery messages are passed to their input routines.  Unless a case
  * jumps to freeit or returns early, the packet is finally offered to
  * icmp6_rip6_input().
  *
  * The caller must be inside the network epoch (NET_EPOCH_ASSERT()).
  */
 int
 icmp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m, *n;
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6, *nip6;
 	struct icmp6_hdr *icmp6, *nicmp6;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	int code, error, icmp6len, ip6len, noff, off, sum;
 
 	NET_EPOCH_ASSERT();
 
 	m = *mp;
 	off = *offp;
 
 	/* Make everything up to the end of the ICMPv6 header contiguous. */
 	if (m->m_len < off + sizeof(struct icmp6_hdr)) {
 		m = m_pullup(m, off + sizeof(struct icmp6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			*mp = m;
 			return (IPPROTO_DONE);
 		}
 	}
 
 	/*
 	 * Locate icmp6 structure in mbuf, and check
 	 * that not corrupted and of at least minimum length
 	 */
 
 	icmp6len = m->m_pkthdr.len - off;
 	if (icmp6len < sizeof(struct icmp6_hdr)) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		goto freeit;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ifp = m->m_pkthdr.rcvif;
 	/*
 	 * Check multicast group membership.
 	 * Note: SSM filters are not applied for ICMPv6 traffic.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		struct in6_multi	*inm;
 
 		inm = in6m_lookup(ifp, &ip6->ip6_dst);
 		if (inm == NULL) {
 			IP6STAT_INC(ip6s_notmember);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 			goto freeit;
 		}
 	}
 
 	/* Calculate the checksum. */
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 	code = icmp6->icmp6_code;
 	if ((sum = in6_cksum(m, IPPROTO_ICMPV6, off, icmp6len)) != 0) {
 		nd6log((LOG_ERR,
 		    "ICMP6 checksum error(%d|%x) %s\n",
 		    icmp6->icmp6_type, sum,
 		    ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 		ICMP6STAT_INC(icp6s_checksum);
 		goto freeit;
 	}
 
 	/* Per-type input histogram plus per-interface message counters. */
 	ICMP6STAT_INC(icp6s_inhist[icmp6->icmp6_type]);
 	icmp6_ifstat_inc(ifp, ifs6_in_msg);
 	if (icmp6->icmp6_type < ICMP6_INFOMSG_MASK)
 		icmp6_ifstat_inc(ifp, ifs6_in_error);
 
 	/* Length of the whole IPv6 packet, as passed to the SEND hook. */
 	ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 	switch (icmp6->icmp6_type) {
 	case ICMP6_DST_UNREACH:
 		icmp6_ifstat_inc(ifp, ifs6_in_dstunreach);
 		switch (code) {
 		case ICMP6_DST_UNREACH_ADMIN:
 			icmp6_ifstat_inc(ifp, ifs6_in_adminprohib);
 			/* FALLTHROUGH */
 		case ICMP6_DST_UNREACH_NOROUTE:
 		case ICMP6_DST_UNREACH_ADDR:
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 		case ICMP6_DST_UNREACH_NOPORT:
 			goto deliver;
 		default:
 			goto badcode;
 		}
 	case ICMP6_PACKET_TOO_BIG:
 		icmp6_ifstat_inc(ifp, ifs6_in_pkttoobig);
 		/*
 		 * Validation is made in icmp6_mtudisc_update.
 		 * Updating the path MTU will be done after examining
 		 * intermediate extension headers.
 		 */
 		goto deliver;
 	case ICMP6_TIME_EXCEEDED:
 		icmp6_ifstat_inc(ifp, ifs6_in_timeexceed);
 		switch (code) {
 		case ICMP6_TIME_EXCEED_TRANSIT:
 		case ICMP6_TIME_EXCEED_REASSEMBLY:
 			goto deliver;
 		default:
 			goto badcode;
 		}
 	case ICMP6_PARAM_PROB:
 		icmp6_ifstat_inc(ifp, ifs6_in_paramprob);
 		switch (code) {
 		case ICMP6_PARAMPROB_NEXTHEADER:
 		case ICMP6_PARAMPROB_HEADER:
 		case ICMP6_PARAMPROB_OPTION:
 			goto deliver;
 		default:
 			goto badcode;
 		}
 	case ICMP6_ECHO_REQUEST:
 		icmp6_ifstat_inc(ifp, ifs6_in_echo);
 		if (code != 0)
 			goto badcode;
 		/*
 		 * The reply is built from a copy (n); the original m still
 		 * goes to icmp6_rip6_input() after the switch.
 		 */
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 			/* Give up remote */
 			break;
 		}
 		if (!M_WRITABLE(n)
 		 || n->m_len < off + sizeof(struct icmp6_hdr)) {
 			/*
 			 * The headers of the copy cannot be edited in place:
 			 * put fresh IPv6 + ICMPv6 headers into a new header
 			 * mbuf and chain the rest of the copy behind it.
 			 */
 			struct mbuf *n0 = n;
 			int n0len;
 
 			CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) <= MHLEN);
 			n = m_gethdr(M_NOWAIT, n0->m_type);
 			if (n == NULL) {
 				/* Give up remote */
 				m_freem(n0);
 				break;
 			}
 
 			m_move_pkthdr(n, n0);	/* FIB copied. */
 			n0len = n0->m_pkthdr.len;	/* save for use below */
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			noff = sizeof(struct ip6_hdr);
 			/* new mbuf contains only ipv6+icmpv6 headers */
 			n->m_len = noff + sizeof(struct icmp6_hdr);
 			/*
 			 * Adjust mbuf.  ip6_plen will be adjusted in
 			 * ip6_output().
 			 */
 			m_adj(n0, off + sizeof(struct icmp6_hdr));
 			/* recalculate complete packet size */
 			n->m_pkthdr.len = n0len + (noff - off);
 			n->m_next = n0;
 		} else {
 			/*
 			 * The enclosing condition already guarantees
 			 * n->m_len >= off + sizeof(struct icmp6_hdr), so
 			 * this pullup is not reached.
 			 */
 			if (n->m_len < off + sizeof(*nicmp6)) {
 				n = m_pullup(n, off + sizeof(*nicmp6));
 				if (n == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					break;
 				}
 			}
 			nicmp6 = (struct icmp6_hdr *)(mtod(n, caddr_t) + off);
 			noff = off;
 		}
 		if (n) {
 			/* Turn the copied request into a reply and send it. */
 			nicmp6->icmp6_type = ICMP6_ECHO_REPLY;
 			nicmp6->icmp6_code = 0;
 			ICMP6STAT_INC(icp6s_reflect);
 			ICMP6STAT_INC(icp6s_outhist[ICMP6_ECHO_REPLY]);
 			icmp6_reflect(n, noff);
 		}
 		break;
 
 	case ICMP6_ECHO_REPLY:
 		icmp6_ifstat_inc(ifp, ifs6_in_echoreply);
 		if (code != 0)
 			goto badcode;
 		break;
 
 	case MLD_LISTENER_QUERY:
 	case MLD_LISTENER_REPORT:
 	case MLD_LISTENER_DONE:
 	case MLDV2_LISTENER_REPORT:
 		/*
 		 * Drop MLD traffic which is not link-local, has a hop limit
 		 * of greater than 1 hop, or which does not have the
 		 * IPv6 HBH Router Alert option.
 		 * As IPv6 HBH options are stripped in ip6_input() we must
 		 * check an mbuf header flag.
 		 * XXX Should we also sanity check that these messages
 		 * were directed to a link-local multicast prefix?
 		 */
 		if ((ip6->ip6_hlim != 1) || (m->m_flags & M_RTALERT_MLD) == 0)
 			goto freeit;
 		/* A non-zero return ends processing without freeing m here. */
 		if (mld_input(&m, off, icmp6len) != 0) {
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 		/* m stays. */
 		break;
 
 	case ICMP6_WRUREQUEST:	/* ICMP6_FQDN_QUERY */
 	    {
 		enum { WRU, FQDN } mode;
 		struct prison *pr;
 
 		if (!V_icmp6_nodeinfo)
 			break;
 
 		/* The query flavour is told apart by the message length. */
 		if (icmp6len == sizeof(struct icmp6_hdr) + 4)
 			mode = WRU;
 		else if (icmp6len >= sizeof(struct icmp6_nodeinfo))
 			mode = FQDN;
 		else
 			goto badlen;
 
 		/*
 		 * Find the prison whose vnet matches the receiving
 		 * interface's; fall back to the current thread's prison.
 		 */
 		pr = NULL;
 		sx_slock(&allprison_lock);
 		TAILQ_FOREACH(pr, &allprison, pr_list)
 			if (pr->pr_vnet == ifp->if_vnet)
 				break; 
 		sx_sunlock(&allprison_lock);
 		if (pr == NULL)
 			pr = curthread->td_ucred->cr_prison;
 		if (mode == FQDN) {
 			if (m->m_len < off + sizeof(struct icmp6_nodeinfo)) {
 				m = m_pullup(m, off +
 				    sizeof(struct icmp6_nodeinfo));
 				if (m == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					*mp = m;
 					return (IPPROTO_DONE);
 				}
 			}
 			/* ni6_input() works on a copy and returns the reply. */
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (n)
 				n = ni6_input(n, off, pr);
 			/* XXX meaningless if n == NULL */
 			noff = sizeof(struct ip6_hdr);
 		} else {
 			u_char *p;
 			int maxhlen, hlen;
 
 			/*
 			 * XXX: this combination of flags is pointless,
 			 * but should we keep this for compatibility?
 			 */
 			if ((V_icmp6_nodeinfo & (ICMP6_NODEINFO_FQDNOK |
 			    ICMP6_NODEINFO_TMPADDROK)) !=
 			    (ICMP6_NODEINFO_FQDNOK | ICMP6_NODEINFO_TMPADDROK))
 				break;
 
 			if (code != 0)
 				goto badcode;
 
 			CTASSERT(sizeof(*nip6) + sizeof(*nicmp6) + 4 <= MHLEN);
 			n = m_gethdr(M_NOWAIT, m->m_type);
 			if (n == NULL) {
 				/* Give up remote */
 				break;
 			}
 			if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
 				/*
 				 * Previous code did a blind M_COPY_PKTHDR
 				 * and said "just for rcvif".  If true, then
 				 * we could tolerate the dup failing (due to
 				 * the deep copy of the tag chain).  For now
 				 * be conservative and just fail.
 				 */
 				m_free(n);
 				n = NULL;
 				break;
 			}
 			/*
 			 * Copy IPv6 and ICMPv6 only.
 			 */
 			nip6 = mtod(n, struct ip6_hdr *);
 			bcopy(ip6, nip6, sizeof(struct ip6_hdr));
 			nicmp6 = (struct icmp6_hdr *)(nip6 + 1);
 			bcopy(icmp6, nicmp6, sizeof(struct icmp6_hdr));
 			/* Four zeroed bytes follow the ICMPv6 header. */
 			p = (u_char *)(nicmp6 + 1);
 			bzero(p, 4);
 
 			/*
 			 * Copy as much of the prison's hostname as fits
 			 * after the headers, under the prison mutex.
 			 */
 			maxhlen = M_TRAILINGSPACE(n) -
 			    (sizeof(*nip6) + sizeof(*nicmp6) + 4);
 			mtx_lock(&pr->pr_mtx);
 			hlen = strlen(pr->pr_hostname);
 			if (maxhlen > hlen)
 				maxhlen = hlen;
 			/* meaningless TTL */
 			bcopy(pr->pr_hostname, p + 4, maxhlen);
 			mtx_unlock(&pr->pr_mtx);
 			noff = sizeof(struct ip6_hdr);
 			n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 				sizeof(struct icmp6_hdr) + 4 + maxhlen;
 			nicmp6->icmp6_type = ICMP6_WRUREPLY;
 			nicmp6->icmp6_code = 0;
 		}
 		if (n) {
 			ICMP6STAT_INC(icp6s_reflect);
 			ICMP6STAT_INC(icp6s_outhist[ICMP6_WRUREPLY]);
 			icmp6_reflect(n, noff);
 		}
 		break;
 	    }
 
 	case ICMP6_WRUREPLY:
 		if (code != 0)
 			goto badcode;
 		break;
 
 	/*
 	 * The five neighbor discovery cases below share one pattern: after
 	 * the code and length checks the packet is first offered to the
 	 * SEND hook, if one is installed.  A zero return from the hook ends
 	 * processing here (m is set to NULL before freeit; presumably the
 	 * hook has taken the mbuf -- verify against the hook).  Otherwise a
 	 * copy n is made, the original goes to the nd6/redirect input
 	 * routine, and the copy becomes m for icmp6_rip6_input().  If the
 	 * copy failed, m is NULL and freeit only clears *mp.
 	 */
 	case ND_ROUTER_SOLICIT:
 		icmp6_ifstat_inc(ifp, ifs6_in_routersolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_solicit))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			if (m->m_len < off + icmp6len) {
 				m = m_pullup(m, off + icmp6len);
 				if (m == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					*mp = NULL;
 					return (IPPROTO_DONE);
 				}
 			}
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_rs_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_ROUTER_ADVERT:
 		icmp6_ifstat_inc(ifp, ifs6_in_routeradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_router_advert))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_ra_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_NEIGHBOR_SOLICIT:
 		icmp6_ifstat_inc(ifp, ifs6_in_neighborsolicit);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_solicit))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_ns_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_NEIGHBOR_ADVERT:
 		icmp6_ifstat_inc(ifp, ifs6_in_neighboradvert);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_neighbor_advert))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		nd6_na_input(m, off, icmp6len);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ND_REDIRECT:
 		icmp6_ifstat_inc(ifp, ifs6_in_redirect);
 		if (code != 0)
 			goto badcode;
 		if (icmp6len < sizeof(struct nd_redirect))
 			goto badlen;
 		if (send_sendso_input_hook != NULL) {
 			error = send_sendso_input_hook(m, ifp, SND_IN, ip6len);
 			if (error == 0) {
 				m = NULL;
 				goto freeit;
 			}
 		}
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		icmp6_redirect_input(m, off);
 		m = n;
 		if (m == NULL)
 			goto freeit;
 		break;
 
 	case ICMP6_ROUTER_RENUMBERING:
 		/* Only validated here; no further processing in this case. */
 		if (code != ICMP6_ROUTER_RENUMBERING_COMMAND &&
 		    code != ICMP6_ROUTER_RENUMBERING_RESULT)
 			goto badcode;
 		if (icmp6len < sizeof(struct icmp6_router_renum))
 			goto badlen;
 		break;
 
 	default:
 		nd6log((LOG_DEBUG,
 		    "icmp6_input: unknown type %d(src=%s, dst=%s, ifid=%d)\n",
 		    icmp6->icmp6_type, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 		    ifp ? ifp->if_index : 0));
 		if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST) {
 			/* ICMPv6 error: MUST deliver it by spec... */
 			goto deliver;
 		} else {
 			/* ICMPv6 informational: MUST not deliver */
 			break;
 		}
 	/*
 	 * The three labels below live inside the switch and are reached
 	 * only through goto from the cases above.
 	 */
 	deliver:
 		if (icmp6_notify_error(&m, off, icmp6len) != 0) {
 			/* In this case, m should've been freed. */
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 		break;
 
 	badcode:
 		ICMP6STAT_INC(icp6s_badcode);
 		break;
 
 	badlen:
 		ICMP6STAT_INC(icp6s_badlen);
 		break;
 	}
 
 	/* deliver the packet to appropriate sockets */
 	icmp6_rip6_input(&m, *offp);
 
 	*mp = m;
 	return (IPPROTO_DONE);
 
  freeit:
 	/* m may be NULL here; the drop paths above rely on that. */
 	m_freem(m);
 	*mp = NULL;
 	return (IPPROTO_DONE);
 }
 
 /*
  * Handle a received ICMPv6 error message: walk the extension headers of
  * the packet quoted inside the error to find the upper-layer protocol and
  * the final destination, then notify that protocol through
  * ip6_ctlprotox[] (and, for Packet Too Big, call icmp6_mtudisc_update()).
  *
  * mp       in/out; the packet.  On return *mp is the (possibly pulled-up)
  *          chain, or NULL when it is gone.
  * off      offset of the ICMPv6 header within the packet.
  * icmp6len length of the ICMPv6 message starting at off.
  *
  * Returns 0 on success and -1 when the packet was too short, an
  * m_pullup() failed or in6_setscope() failed; in all -1 cases *mp is
  * NULL.
  *
  * Every m_pullup() may replace the leading mbuf, so pointers into the
  * packet are re-derived with mtod() after each one and are never used
  * across a pullup.
  */
 static int
 icmp6_notify_error(struct mbuf **mp, int off, int icmp6len)
 {
 	struct mbuf *m;
 	struct icmp6_hdr *icmp6;
 	struct ip6_hdr *eip6;
 	u_int32_t notifymtu;
 	struct sockaddr_in6 icmp6src, icmp6dst;
 
 	m = *mp;
 
 	/* The error must quote at least a full IPv6 header. */
 	if (icmp6len < sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr)) {
 		ICMP6STAT_INC(icp6s_tooshort);
 		goto freeit;
 	}
 
 	if (m->m_len < off + sizeof(*icmp6) + sizeof(struct ip6_hdr)) {
 		m = m_pullup(m, off + sizeof(*icmp6) + sizeof(struct ip6_hdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			*mp = m;
 			return (-1);
 		}
 	}
 	icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 	eip6 = (struct ip6_hdr *)(icmp6 + 1);
 	/* Stays unspecified unless a type-0 routing header overrides it. */
 	bzero(&icmp6dst, sizeof(icmp6dst));
 
 	/* Detect the upper level protocol */
 	{
 		u_int8_t nxt = eip6->ip6_nxt;
 		/* eoff: offset of the header currently being examined. */
 		int eoff = off + sizeof(struct icmp6_hdr) +
 		    sizeof(struct ip6_hdr);
 		struct ip6ctlparam ip6cp;
 		int icmp6type = icmp6->icmp6_type;
 		struct ip6_frag *fh;
 		struct ip6_rthdr *rth;
 		struct ip6_rthdr0 *rth0;
 		int rthlen;
 
 		while (1) { /* XXX: should avoid infinite loop explicitly? */
 			struct ip6_ext *eh;
 
 			switch (nxt) {
 			case IPPROTO_HOPOPTS:
 			case IPPROTO_DSTOPTS:
 			case IPPROTO_AH:
 				if (m->m_len < eoff + sizeof(struct ip6_ext)) {
 					m = m_pullup(m, eoff +
 					    sizeof(struct ip6_ext));
 					if (m == NULL) {
 						IP6STAT_INC(ip6s_exthdrtoolong);
 						*mp = m;
 						return (-1);
 					}
 				}
 				eh = (struct ip6_ext *)
 				    (mtod(m, caddr_t) + eoff);
 				/* AH counts in 4-byte units, the others in 8. */
 				if (nxt == IPPROTO_AH)
 					eoff += (eh->ip6e_len + 2) << 2;
 				else
 					eoff += (eh->ip6e_len + 1) << 3;
 				nxt = eh->ip6e_nxt;
 				break;
 			case IPPROTO_ROUTING:
 				/*
 				 * When the erroneous packet contains a
 				 * routing header, we should examine the
 				 * header to determine the final destination.
 				 * Otherwise, we can't properly update
 				 * information that depends on the final
 				 * destination (e.g. path MTU).
 				 */
 				if (m->m_len < eoff + sizeof(*rth)) {
 					m = m_pullup(m, eoff + sizeof(*rth));
 					if (m == NULL) {
 						IP6STAT_INC(ip6s_exthdrtoolong);
 						*mp = m;
 						return (-1);
 					}
 				}
 				rth = (struct ip6_rthdr *)
 				    (mtod(m, caddr_t) + eoff);
 				rthlen = (rth->ip6r_len + 1) << 3;
 				/*
 				 * Read the next-header value now.  The
 				 * m_pullup() below may replace the leading
 				 * mbuf, after which rth would point into
 				 * freed memory.
 				 */
 				nxt = rth->ip6r_nxt;
 				/*
 				 * XXX: currently there is no
 				 * officially defined type other
 				 * than type-0.
 				 * Note that if the segment left field
 				 * is 0, all intermediate hops must
 				 * have been passed.
 				 */
 				if (rth->ip6r_segleft &&
 				    rth->ip6r_type == IPV6_RTHDR_TYPE_0) {
 					int hops;
 
 					if (m->m_len < eoff + rthlen) {
 						m = m_pullup(m, eoff + rthlen);
 						if (m == NULL) {
 							IP6STAT_INC(
 							    ip6s_exthdrtoolong);
 							*mp = m;
 							return (-1);
 						}
 					}
 					/* rth is not used past this point. */
 					rth0 = (struct ip6_rthdr0 *)
 					    (mtod(m, caddr_t) + eoff);
 
 					/*
 					 * just ignore a bogus header;
 					 * otherwise the last listed address
 					 * is the final destination.
 					 */
 					if ((rth0->ip6r0_len % 2) == 0 &&
 					    (hops = rth0->ip6r0_len/2))
 						icmp6dst.sin6_addr = *((struct in6_addr *)(rth0 + 1) + (hops - 1));
 				}
 				eoff += rthlen;
 				break;
 			case IPPROTO_FRAGMENT:
 				if (m->m_len < eoff + sizeof(struct ip6_frag)) {
 					m = m_pullup(m, eoff +
 					    sizeof(struct ip6_frag));
 					if (m == NULL) {
 						IP6STAT_INC(ip6s_exthdrtoolong);
 						*mp = m;
 						return (-1);
 					}
 				}
 				fh = (struct ip6_frag *)(mtod(m, caddr_t) +
 				    eoff);
 				/*
 				 * Data after a fragment header is meaningless
 				 * unless it is the first fragment, but
 				 * we'll go to the notify label for path MTU
 				 * discovery.
 				 */
 				if (fh->ip6f_offlg & IP6F_OFF_MASK)
 					goto notify;
 
 				eoff += sizeof(struct ip6_frag);
 				nxt = fh->ip6f_nxt;
 				break;
 			default:
 				/*
 				 * This case includes ESP and the No Next
 				 * Header.  In such cases going to the notify
 				 * label does not have any meaning
 				 * (i.e. ctlfunc will be NULL), but we go
 				 * anyway since we might have to update
 				 * path MTU information.
 				 */
 				goto notify;
 			}
 		}
 	  notify:
 		/* Re-derive icmp6: the pullups above may have moved the data. */
 		icmp6 = (struct icmp6_hdr *)(mtod(m, caddr_t) + off);
 
 		/*
 		 * retrieve parameters from the inner IPv6 header, and convert
 		 * them into sockaddr structures.
 		 * XXX: there is no guarantee that the source or destination
 		 * addresses of the inner packet are in the same scope as
 		 * the addresses of the icmp packet.  But there is no other
 		 * way to determine the zone.
 		 */
 		eip6 = (struct ip6_hdr *)(icmp6 + 1);
 
 		icmp6dst.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6dst.sin6_family = AF_INET6;
 		/* No routing header supplied one: use the quoted dst. */
 		if (IN6_IS_ADDR_UNSPECIFIED(&icmp6dst.sin6_addr))
 			icmp6dst.sin6_addr = eip6->ip6_dst;
 		if (in6_setscope(&icmp6dst.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		bzero(&icmp6src, sizeof(icmp6src));
 		icmp6src.sin6_len = sizeof(struct sockaddr_in6);
 		icmp6src.sin6_family = AF_INET6;
 		icmp6src.sin6_addr = eip6->ip6_src;
 		if (in6_setscope(&icmp6src.sin6_addr, m->m_pkthdr.rcvif, NULL))
 			goto freeit;
 		icmp6src.sin6_flowinfo =
 		    (eip6->ip6_flow & IPV6_FLOWLABEL_MASK);
 
 		/* Bundle everything the protocol control handlers receive. */
 		ip6cp.ip6c_m = m;
 		ip6cp.ip6c_icmp6 = icmp6;
 		ip6cp.ip6c_ip6 = (struct ip6_hdr *)(icmp6 + 1);
 		ip6cp.ip6c_off = eoff;
 		ip6cp.ip6c_finaldst = &icmp6dst;
 		ip6cp.ip6c_src = &icmp6src;
 		ip6cp.ip6c_nxt = nxt;
 
 		if (icmp6type == ICMP6_PACKET_TOO_BIG) {
 			notifymtu = ntohl(icmp6->icmp6_mtu);
 			ip6cp.ip6c_cmdarg = (void *)&notifymtu;
 			icmp6_mtudisc_update(&ip6cp, 1);	/*XXX*/
 		}
 
 		/* Notify the upper-layer protocol if it has a handler. */
 		if (ip6_ctlprotox[nxt] != NULL)
 			ip6_ctlprotox[nxt](&ip6cp);
 	}
 	*mp = m;
 	return (0);
 
   freeit:
 	m_freem(m);
 	*mp = NULL;
 	return (-1);
 }
 
 /*
  * Record the MTU announced by an ICMPv6 Packet Too Big message in the TCP
  * host cache for the final destination carried in ip6cp.
  *
  * Nothing is done when the announced MTU is smaller than an IPv6 header
  * plus a fragment header plus 8 bytes, when validated is zero, or when
  * in6_setscope() fails on the destination.  An MTU below IPV6_MMTU is
  * stored as IPV6_MMTU - 8.  The cache is updated, and icp6s_pmtuchg
  * counted, only when the new value is lower than the MTU currently known
  * (tcp_hc_getmtu(), falling back to tcp_maxmtu6()).
  */
 void
 icmp6_mtudisc_update(struct ip6ctlparam *ip6cp, int validated)
 {
 	struct in_conninfo inc;
 	struct mbuf *m;
 	uint32_t known_mtu;
 	u_int mtu;
 
 	m = ip6cp->ip6c_m;	/* will be necessary for scope issue */
 	mtu = ntohl(ip6cp->ip6c_icmp6->icmp6_mtu);
 
 #if 0
 	/*
 	 * RFC2460 section 5, last paragraph.
 	 * even though minimum link MTU for IPv6 is IPV6_MMTU,
 	 * we may see ICMPv6 too big with mtu < IPV6_MMTU
 	 * due to packet translator in the middle.
 	 * see ip6_output() and ip6_getpmtu() "alwaysfrag" case for
 	 * special handling.
 	 */
 	if (mtu < IPV6_MMTU)
 		return;
 #endif
 
 	/*
 	 * Reject abnormally small values (XXX what is a good definition of
 	 * "abnormally small"?) and messages that were not validated.
 	 */
 	if (mtu < sizeof(struct ip6_hdr) + sizeof(struct ip6_frag) + 8 ||
 	    !validated)
 		return;
 
 	/*
 	 * A suggested mtu below IPV6_MMTU only needs to be remembered for
 	 * the "alwaysfrag" case mentioned above.  Try to be as close to the
 	 * spec as possible.
 	 */
 	if (mtu < IPV6_MMTU)
 		mtu = IPV6_MMTU - 8;
 
 	/* Describe the destination for the host cache lookup. */
 	bzero(&inc, sizeof(inc));
 	inc.inc_fibnum = M_GETFIB(m);
 	inc.inc_flags |= INC_ISIPV6;
 	inc.inc6_faddr = ip6cp->ip6c_finaldst->sin6_addr;
 	if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL))
 		return;
 
 	known_mtu = tcp_hc_getmtu(&inc);
 	if (known_mtu == 0)
 		known_mtu = tcp_maxmtu6(&inc, NULL);
 
 	/* Only ever lower the recorded MTU. */
 	if (mtu >= known_mtu)
 		return;
 
 	tcp_hc_updatemtu(&inc, mtu);
 	ICMP6STAT_INC(icp6s_pmtuchg);
 }
 
 /*
  * Process a Node Information Query packet, based on
  * draft-ietf-ipngwg-icmp-name-lookups-07.
  *
  * m is the received query and off is the offset of the Node Information
  * header within it.  pr supplies the hostname used both to validate FQDN
  * subjects and to build FQDN replies; pr_hostname is read with pr_mtx held.
  *
  * Returns a newly allocated reply mbuf whose ni_type is ICMP6_NI_REPLY and
  * which begins with a copy of the query's IPv6 header, or NULL when the
  * query is dropped or an allocation fails.  The query mbuf is never handed
  * back to the caller: every return path frees it (after a failed
  * m_pullup() m is NULL; presumably m_pullup() has already released it).
  *
  * Spec incompatibilities:
  * - IPv6 Subject address handling
  * - IPv4 Subject address handling support missing
  * - Proxy reply (answer even if it's not for me)
  * - joins NI group address at in6_ifattach() time only, does not cope
  *   with hostname changes by sethostname(3)
  */
 static struct mbuf *
 ni6_input(struct mbuf *m, int off, struct prison *pr)
 {
 	struct icmp6_nodeinfo *ni6, *nni6;
 	struct mbuf *n = NULL;
 	u_int16_t qtype;
 	int subjlen;
 	int replylen = sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 	struct ni_reply_fqdn *fqdn;
 	int addrs;		/* for NI_QTYPE_NODEADDR */
 	struct ifnet *ifp = NULL; /* for NI_QTYPE_NODEADDR */
 	struct in6_addr in6_subj; /* subject address */
 	struct ip6_hdr *ip6;
 	int oldfqdn = 0;	/* if 1, return pascal string (03 draft) */
 	char *subj = NULL;
 	struct in6_ifaddr *ia6 = NULL;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
 
 	/*
 	 * Validate IPv6 source address.
 	 * The default configuration MUST be to refuse answering queries from
 	 * global-scope addresses according to RFC4602.
 	 * Notes:
 	 *  - it's not very clear what "refuse" means; this implementation
 	 *    simply drops it.
 	 *  - it's not very easy to identify global-scope (unicast) addresses
 	 *    since there are many prefixes for them.  It should be safer
 	 *    and in practice sufficient to check "all" but loopback and
 	 *    link-local (note that site-local unicast was deprecated and
 	 *    ULA is defined as global scope-wise)
 	 */
 	if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_GLOBALOK) == 0 &&
 	    !IN6_IS_ADDR_LOOPBACK(&ip6->ip6_src) &&
 	    !IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src))
 		goto bad;
 
 	/*
 	 * Validate IPv6 destination address.
 	 *
 	 * The Responder must discard the Query without further processing
 	 * unless it is one of the Responder's unicast or anycast addresses, or
 	 * a link-local scope multicast address which the Responder has joined.
 	 * [RFC4602, Section 5.]
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (!IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
 			goto bad;
 		/* else it's a link-local multicast, fine */
 	} else {		/* unicast or anycast */
 		ia6 = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
 		if (ia6 == NULL)
 			goto bad; /* XXX impossible */
 
 		/* Queries to temporary addresses are answered only if allowed. */
 		if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    !(V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK)) {
 			nd6log((LOG_DEBUG, "ni6_input: ignore node info to "
 				"a temporary address in %s:%d",
 			       __FILE__, __LINE__));
 			goto bad;
 		}
 	}
 
 	/* validate query Subject field. */
 	qtype = ntohs(ni6->ni_qtype);
 	/* Subject length is whatever follows the fixed NI header. */
 	subjlen = m->m_pkthdr.len - off - sizeof(struct icmp6_nodeinfo);
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 	case NI_QTYPE_SUPTYPES:
 		/* 07 draft */
 		if (ni6->ni_code == ICMP6_NI_SUBJ_FQDN && subjlen == 0)
 			break;
 		/* FALLTHROUGH */
 	case NI_QTYPE_FQDN:
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		switch (ni6->ni_code) {
 		case ICMP6_NI_SUBJ_IPV6:
 #if ICMP6_NI_SUBJ_IPV6 != 0
 		case 0:
 #endif
 			/*
 			 * backward compatibility - try to accept 03 draft
 			 * format, where no Subject is present.
 			 */
 			if (qtype == NI_QTYPE_FQDN && ni6->ni_code == 0 &&
 			    subjlen == 0) {
 				oldfqdn++;
 				break;
 			}
 #if ICMP6_NI_SUBJ_IPV6 != 0
 			if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
 				goto bad;
 #endif
 
 			if (subjlen != sizeof(struct in6_addr))
 				goto bad;
 
 			/*
 			 * Validate Subject address.
 			 *
 			 * Not sure what exactly "address belongs to the node"
 			 * means in the spec, is it just unicast, or what?
 			 *
 			 * At this moment we consider Subject address as
 			 * "belong to the node" if the Subject address equals
 			 * to the IPv6 destination address; validation for
 			 * IPv6 destination address should have done enough
 			 * check for us.
 			 *
 			 * We do not do proxy at this moment.
 			 */
 			m_copydata(m, off + sizeof(struct icmp6_nodeinfo),
 			    subjlen, (caddr_t)&in6_subj);
 			if (in6_setscope(&in6_subj, m->m_pkthdr.rcvif, NULL))
 				goto bad;
 
 			subj = (char *)&in6_subj;
 			if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &in6_subj))
 				break;
 
 			/*
 			 * XXX if we are to allow other cases, we should really
 			 * be careful about scope here.
 			 * basically, we should disallow queries toward IPv6
 			 * destination X with subject Y,
 			 * if scope(X) > scope(Y).
 			 * if we allow scope(X) > scope(Y), it will result in
 			 * information leakage across scope boundary.
 			 */
 			goto bad;
 
 		case ICMP6_NI_SUBJ_FQDN:
 			/*
 			 * Validate Subject name with gethostname(3).
 			 *
 			 * The behavior may need some debate, since:
 			 * - we are not sure if the node has FQDN as
 			 *   hostname (returned by gethostname(3)).
 			 * - the code does wildcard match for truncated names.
 			 *   however, we are not sure if we want to perform
 			 *   wildcard match, if gethostname(3) side has
 			 *   truncated hostname.
 			 */
 			/* Encode our own hostname; n is a scratch mbuf here. */
 			mtx_lock(&pr->pr_mtx);
 			n = ni6_nametodns(pr->pr_hostname,
 			    strlen(pr->pr_hostname), 0);
 			mtx_unlock(&pr->pr_mtx);
 			if (!n || n->m_next || n->m_len == 0)
 				goto bad;
 			/* Make the whole Subject contiguous before matching. */
 			if (m->m_len < off + sizeof(struct icmp6_nodeinfo) +
 			    subjlen) {
 				m = m_pullup(m, off +
 				    sizeof(struct icmp6_nodeinfo) + subjlen);
 				if (m == NULL) {
 					IP6STAT_INC(ip6s_exthdrtoolong);
 					goto bad;
 				}
 			}
 			/* ip6 possibly invalid but not used after. */
 			ni6 = (struct icmp6_nodeinfo *)(mtod(m, caddr_t) + off);
 			subj = (char *)(mtod(m, caddr_t) + off +
 			    sizeof(struct icmp6_nodeinfo));
 			if (!ni6_dnsmatch(subj, subjlen, mtod(n, const char *),
 			    n->m_len)) {
 				goto bad;
 			}
 			/* Scratch name no longer needed; n is reused below. */
 			m_freem(n);
 			n = NULL;
 			break;
 
 		case ICMP6_NI_SUBJ_IPV4:	/* XXX: to be implemented? */
 		default:
 			goto bad;
 		}
 		break;
 	}
 
 	/* refuse based on configuration.  XXX ICMP6_NI_REFUSED? */
 	switch (qtype) {
 	case NI_QTYPE_FQDN:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_FQDNOK) == 0)
 			goto bad;
 		break;
 	case NI_QTYPE_NODEADDR:
 	case NI_QTYPE_IPV4ADDR:
 		if ((V_icmp6_nodeinfo & ICMP6_NODEINFO_NODEADDROK) == 0)
 			goto bad;
 		break;
 	}
 
 	/* guess reply length */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		break;		/* no reply data */
 	case NI_QTYPE_SUPTYPES:
 		replylen += sizeof(u_int32_t);
 		break;
 	case NI_QTYPE_FQDN:
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		break;
 	case NI_QTYPE_NODEADDR:
 		/* Each address costs one in6_addr plus a 32-bit TTL. */
 		addrs = ni6_addrs(ni6, m, &ifp, (struct in6_addr *)subj);
 		if ((replylen += addrs * (sizeof(struct in6_addr) +
 		    sizeof(u_int32_t))) > MCLBYTES)
 			replylen = MCLBYTES; /* XXX: will truncate pkt later */
 		break;
 	case NI_QTYPE_IPV4ADDR:
 		/* unsupported - should respond with unknown Qtype? */
 		break;
 	default:
 		/*
 		 * XXX: We must return a reply with the ICMP6 code
 		 * `unknown Qtype' in this case.  However we regard the case
 		 * as an FQDN query for backward compatibility.
 		 * Older versions set a random value to this field,
 		 * so it rarely varies in the defined qtypes.
 		 * But the mechanism is not reliable...
 		 * maybe we should obsolete older versions.
 		 */
 		qtype = NI_QTYPE_FQDN;
 		/* XXX will append an mbuf */
 		replylen += offsetof(struct ni_reply_fqdn, ni_fqdn_namelen);
 		oldfqdn++;
 		break;
 	}
 
 	/* Allocate an mbuf to reply. */
 	if (replylen > MCLBYTES) {
 		/*
 		 * XXX: should we try to allocate more? But MCLBYTES
 		 * is probably much larger than IPV6_MMTU...
 		 */
 		goto bad;
 	}
 	if (replylen > MHLEN)
 		n = m_getcl(M_NOWAIT, m->m_type, M_PKTHDR);
 	else
 		n = m_gethdr(M_NOWAIT, m->m_type);
 	if (n == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	m_move_pkthdr(n, m); /* just for recvif and FIB */
 	n->m_pkthdr.len = n->m_len = replylen;
 
 	/* copy mbuf header and IPv6 + Node Information base headers */
 	bcopy(mtod(m, caddr_t), mtod(n, caddr_t), sizeof(struct ip6_hdr));
 	nni6 = (struct icmp6_nodeinfo *)(mtod(n, struct ip6_hdr *) + 1);
 	bcopy((caddr_t)ni6, (caddr_t)nni6, sizeof(struct icmp6_nodeinfo));
 
 	/* qtype dependent procedure */
 	switch (qtype) {
 	case NI_QTYPE_NOOP:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = 0;
 		break;
 	case NI_QTYPE_SUPTYPES:
 	{
 		u_int32_t v;
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		nni6->ni_flags = htons(0x0000);	/* raw bitmap */
 		/* supports NOOP, SUPTYPES, FQDN, and NODEADDR */
 		v = (u_int32_t)htonl(0x0000000f);
 		bcopy(&v, nni6 + 1, sizeof(u_int32_t));
 		break;
 	}
 	case NI_QTYPE_FQDN:
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		fqdn = (struct ni_reply_fqdn *)(mtod(n, caddr_t) +
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo));
 		nni6->ni_flags = 0; /* XXX: meaningless TTL */
 		fqdn->ni_fqdn_ttl = 0;	/* ditto. */
 		/*
 		 * XXX do we really have FQDN in hostname?
 		 */
 		/* The encoded name is chained on as a second mbuf. */
 		mtx_lock(&pr->pr_mtx);
 		n->m_next = ni6_nametodns(pr->pr_hostname,
 		    strlen(pr->pr_hostname), oldfqdn);
 		mtx_unlock(&pr->pr_mtx);
 		if (n->m_next == NULL)
 			goto bad;
 		/* XXX we assume that n->m_next is not a chain */
 		if (n->m_next->m_next != NULL)
 			goto bad;
 		n->m_pkthdr.len += n->m_next->m_len;
 		break;
 	case NI_QTYPE_NODEADDR:
 	{
 		int lenlim, copied;
 
 		nni6->ni_code = ICMP6_NI_SUCCESS;
 		/*
 		 * Shrink the mbuf to just the headers so that
 		 * M_TRAILINGSPACE() reports the room left for addresses.
 		 */
 		n->m_pkthdr.len = n->m_len =
 		    sizeof(struct ip6_hdr) + sizeof(struct icmp6_nodeinfo);
 		lenlim = M_TRAILINGSPACE(n);
 		copied = ni6_store_addrs(ni6, nni6, ifp, lenlim);
 		/* XXX: reset mbuf length */
 		n->m_pkthdr.len = n->m_len = sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_nodeinfo) + copied;
 		break;
 	}
 	default:
 		break;		/* XXX impossible! */
 	}
 
 	nni6->ni_type = ICMP6_NI_REPLY;
 	m_freem(m);
 	return (n);
 
   bad:
 	/* Drop the query and any partially built reply or scratch mbuf. */
 	m_freem(m);
 	if (n)
 		m_freem(n);
 	return (NULL);
 }
 
 /*
  * Build an mbuf holding a DNS wire-format encoding of a host name.  No
  * compression support.
  *
  * name/namelen give the dotted host name.  When old is non-zero the result
  * is instead a pascal string: one length byte followed by the raw name.
  *
  * XXX names with fewer than 2 dots (like "foo" or "foo.section") are
  * treated as truncated names and get two trailing \0 bytes.  This is a
  * wild guess.
  *
  * Returns the new mbuf, or NULL if allocation fails, a label is empty or
  * longer than 63 bytes, or the encoding does not fit in the mbuf.  An empty
  * name yields an mbuf with m_len == 0.
  */
 static struct mbuf *
 ni6_nametodns(const char *name, int namelen, int old)
 {
 	const char *const name_end = name + namelen;
 	const char *label, *scan;
 	struct mbuf *m;
 	char *out, *limit;
 	int dots, label_len, need, nterm;
 
 	need = old ? namelen + 1 : MCLBYTES;
 
 	/* Because MAXHOSTNAMELEN is usually 256, we use cluster mbuf. */
 	if (need > MLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, 0);
 	else
 		m = m_get(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	/* 03 draft format: length byte followed by the name as-is. */
 	if (old) {
 		m->m_len = need;
 		*mtod(m, char *) = namelen;
 		bcopy(name, mtod(m, char *) + 1, namelen);
 		return (m);
 	}
 
 	m->m_len = 0;
 	out = mtod(m, char *);
 	limit = mtod(m, char *) + M_TRAILINGSPACE(m);
 
 	/* if not certain about my name, return empty buffer */
 	if (namelen == 0)
 		return (m);
 
 	/*
 	 * guess if it looks like shortened hostname, or FQDN.
 	 * shortened hostname needs two trailing "\0".
 	 */
 	dots = 0;
 	for (scan = name; scan < name_end; scan++) {
 		if (*scan == '.')
 			dots++;
 	}
 	nterm = (dots < 2) ? 2 : 1;
 
 	/* Emit one <length><bytes> pair per dot-separated label. */
 	label = name;
 	while (out < limit && label < name_end) {
 		scan = label;
 		while (scan < name_end && *scan != '\0' && *scan != '.')
 			scan++;
 		label_len = scan - label;
 
 		/* result does not fit into mbuf */
 		if (out + label_len + 1 >= limit)
 			goto fail;
 		/*
 		 * DNS label length restriction, RFC1035 page 8.
 		 * The zero-length case is rejected too, so that "foo..bar"
 		 * never produces a 0-length label.
 		 */
 		if (label_len <= 0 || label_len >= 64)
 			goto fail;
 
 		*out++ = label_len;
 		bcopy(label, out, label_len);
 		out += label_len;
 
 		/* Step over the separating dot, if any. */
 		label = scan;
 		if (label < name_end && *label == '.')
 			label++;
 	}
 
 	/* termination */
 	if (out + nterm >= limit)
 		goto fail;
 	while (nterm-- > 0)
 		*out++ = '\0';
 	m->m_len = out - mtod(m, char *);
 	return (m);
 
  fail:
 	m_freem(m);
 	return (NULL);
 }
 
 /*
  * Compare two DNS wire-format names.  Returns 1 when they match and 0
  * otherwise.
  *
  * A name in truncated form (ending in \0\0) matches any name it is a
  * label-wise prefix of.  Both names must end in a \0 byte unless they are
  * byte-for-byte identical.  No compression support: any length byte of 64
  * or more fails the match.
  * XXX upper/lowercase match (see RFC2065)
  */
 static int
 ni6_dnsmatch(const char *a, int alen, const char *b, int blen)
 {
 	int apos, bpos, lab;
 
 	/* simplest case - need validation? */
 	if (alen == blen && bcmp(a, b, alen) == 0)
 		return (1);
 
 	/* termination is mandatory */
 	if (alen < 2 || blen < 2)
 		return (0);
 	if (a[alen - 1] != '\0' || b[blen - 1] != '\0')
 		return (0);
 
 	/* From here on the lengths exclude the final terminator. */
 	alen--;
 	blen--;
 
 	apos = 0;
 	bpos = 0;
 	while (apos < alen && bpos < blen) {
 		const signed char la = (signed char)a[apos];
 		const signed char lb = (signed char)b[bpos];
 
 		/* we don't support compression yet */
 		if (la < 0 || lb < 0 || la >= 64 || lb >= 64)
 			return (0);
 
 		/* truncated case: a zero label in the last position */
 		if (la == 0 && apos == alen - 1)
 			return (1);
 		if (lb == 0 && bpos == blen - 1)
 			return (1);
 		if (la == 0 || lb == 0)
 			return (0);
 
 		/* Labels must agree in length, fit, and have equal bytes. */
 		if (la != lb)
 			return (0);
 		lab = la;
 		if (apos + 1 + lab > alen || bpos + 1 + lab > blen)
 			return (0);
 		if (bcmp(a + apos + 1, b + bpos + 1, lab) != 0)
 			return (0);
 
 		apos += 1 + lab;
 		bpos += 1 + lab;
 	}
 
 	/* A full match consumes both names exactly. */
 	return (apos == alen && bpos == blen);
 }
 
 /*
  * Calculate the number of addresses to be returned in the node info reply.
  *
  * Unless the query sets NI_NODEADDR_FLAG_ALL, only an IPv6 subject address
  * is supported: the interface owning subj is looked up, stored in *ifpp,
  * and only that interface's eligible addresses are counted.  If no
  * interface owns subj, *ifpp is left untouched and the total over all
  * interfaces is returned.  The m argument is not used.
  */
 static int
 ni6_addrs(struct icmp6_nodeinfo *ni6, struct mbuf *m, struct ifnet **ifpp,
     struct in6_addr *subj)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia6;
 	const int niflags = ni6->ni_flags;
 	const int want_all = (niflags & NI_NODEADDR_FLAG_ALL) != 0;
 	int total = 0, per_if, subj_found = 0, scope_flag;
 
 	NET_EPOCH_ASSERT();
 
 	if (!want_all) {
 		/*
 		 * XXX: we only support IPv6 subject address for
 		 * this Qtype.
 		 */
 		if (ni6->ni_code != ICMP6_NI_SUBJ_IPV6)
 			return (0);
 		if (subj == NULL)	/* must be impossible... */
 			return (0);
 	}
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		per_if = 0;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ia6 = (struct in6_ifaddr *)ifa;
 
 			/* Remember that this interface owns the subject. */
 			if (!want_all &&
 			    IN6_ARE_ADDR_EQUAL(subj, &ia6->ia_addr.sin6_addr))
 				subj_found = 1;
 
 			/*
 			 * IPv4-mapped addresses can only be returned by a
 			 * Node Information proxy, since they represent
 			 * addresses of IPv4-only nodes, which perforce do
 			 * not implement this protocol.
 			 * [icmp-name-lookups-07, Section 5.4]
 			 * So we don't support NI_NODEADDR_FLAG_COMPAT in
 			 * this function at this moment.
 			 */
 
 			/*
 			 * Map the address scope to the query flag that must
 			 * be set for it to be reported; other scopes are
 			 * never reported.
 			 * What do we have to do about ::1?
 			 */
 			switch (in6_addrscope(&ia6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				scope_flag = NI_NODEADDR_FLAG_LINKLOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				scope_flag = NI_NODEADDR_FLAG_SITELOCAL;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				scope_flag = NI_NODEADDR_FLAG_GLOBAL;
 				break;
 			default:
 				scope_flag = 0;
 				break;
 			}
 			if ((niflags & scope_flag) == 0)
 				continue;
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ia6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue; /* we need only unicast addresses */
 
 			/* Temporary addresses are reported only if allowed. */
 			if ((ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0)
 				continue;
 
 			per_if++;	/* count the address */
 		}
 
 		/* The subject's interface settles the answer on its own. */
 		if (subj_found) {
 			*ifpp = ifp;
 			return (per_if);
 		}
 
 		total += per_if;
 	}
 
 	return (total);
 }
 
 /*
  * Fill in the address list of a Node Addresses reply.
  *
  * ni6 is the query (its ni_flags select which addresses are wanted) and
  * nni6 is the reply header; entries are written immediately after nni6,
  * each as a 32-bit TTL followed by the in6_addr with its embedded scope
  * cleared.  ifp0, when non-NULL, restricts the search to that interface;
  * otherwise all interfaces are walked, which requires
  * NI_NODEADDR_FLAG_ALL.  resid is the space available for entries.
  *
  * Non-deprecated addresses are emitted first, then a second pass emits the
  * deprecated ones.  When the space runs out, NI_NODEADDR_FLAG_TRUNCATE is
  * set in the reply and copying stops.
  *
  * Returns the number of bytes written after nni6.
  */
 static int
 ni6_store_addrs(struct icmp6_nodeinfo *ni6, struct icmp6_nodeinfo *nni6,
     struct ifnet *ifp0, int resid)
 {
 	struct ifnet *ifp;
 	struct in6_ifaddr *ifa6;
 	struct ifaddr *ifa;
 	struct ifnet *ifp_dep = NULL;
 	int copied = 0, allow_deprecated = 0;
 	u_char *cp = (u_char *)(nni6 + 1);
 	int niflags = ni6->ni_flags;
 	u_int32_t ltime;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifp0 == NULL && !(niflags & NI_NODEADDR_FLAG_ALL))
 		return (0);	/* needless to copy */
 
 	ifp = ifp0 ? ifp0 : CK_STAILQ_FIRST(&V_ifnet);
   again:
 
 	for (; ifp; ifp = CK_STAILQ_NEXT(ifp, if_link)) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifa6 = (struct in6_ifaddr *)ifa;
 
 			if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) != 0 &&
 			    allow_deprecated == 0) {
 				/*
 				 * preferred addresses should be put before
 				 * deprecated addresses.
 				 */
 
 				/* record the interface for later search */
 				if (ifp_dep == NULL)
 					ifp_dep = ifp;
 
 				continue;
 			} else if ((ifa6->ia6_flags & IN6_IFF_DEPRECATED) == 0 &&
 			    allow_deprecated != 0)
 				continue; /* we now collect deprecated addrs */
 
 			/* What do we have to do about ::1? */
 			switch (in6_addrscope(&ifa6->ia_addr.sin6_addr)) {
 			case IPV6_ADDR_SCOPE_LINKLOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_LINKLOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_SITELOCAL:
 				if ((niflags & NI_NODEADDR_FLAG_SITELOCAL) == 0)
 					continue;
 				break;
 			case IPV6_ADDR_SCOPE_GLOBAL:
 				if ((niflags & NI_NODEADDR_FLAG_GLOBAL) == 0)
 					continue;
 				break;
 			default:
 				continue;
 			}
 
 			/*
 			 * check if anycast is okay.
 			 * XXX: just experimental.  not in the spec.
 			 */
 			if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0 &&
 			    (niflags & NI_NODEADDR_FLAG_ANYCAST) == 0)
 				continue;
 			if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (V_icmp6_nodeinfo & ICMP6_NODEINFO_TMPADDROK) == 0) {
 				continue;
 			}
 
 			/* now we can copy the address */
 			if (resid < sizeof(struct in6_addr) +
 			    sizeof(u_int32_t)) {
 				/*
 				 * No room for another entry: give up copying.
 				 * Set the truncate flag and return.
 				 */
 				nni6->ni_flags |= NI_NODEADDR_FLAG_TRUNCATE;
 				return (copied);
 			}
 
 			/*
 			 * Set the TTL of the address.
 			 * The TTL value should be one of the following
 			 * according to the specification:
 			 *
 			 * 1. The remaining lifetime of a DHCP lease on the
 			 *    address, or
 			 * 2. The remaining Valid Lifetime of a prefix from
 			 *    which the address was derived through Stateless
 			 *    Autoconfiguration.
 			 *
 			 * Note that we currently do not support stateful
 			 * address configuration by DHCPv6, so the former
 			 * case can't happen.
 			 */
 			/* An expiry of 0 is reported as an infinite lifetime. */
 			if (ifa6->ia6_lifetime.ia6t_expire == 0)
 				ltime = ND6_INFINITE_LIFETIME;
 			else {
 				if (ifa6->ia6_lifetime.ia6t_expire >
 				    time_uptime)
 					ltime = htonl(ifa6->ia6_lifetime.ia6t_expire - time_uptime);
 				else
 					ltime = 0;
 			}
 
 			bcopy(&ltime, cp, sizeof(u_int32_t));
 			cp += sizeof(u_int32_t);
 
 			/* copy the address itself */
 			bcopy(&ifa6->ia_addr.sin6_addr, cp,
 			    sizeof(struct in6_addr));
 			in6_clearscope((struct in6_addr *)cp); /* XXX */
 			cp += sizeof(struct in6_addr);
 
 			resid -= (sizeof(struct in6_addr) + sizeof(u_int32_t));
 			copied += (sizeof(struct in6_addr) + sizeof(u_int32_t));
 		}
 		if (ifp0)	/* we need search only on the specified IF */
 			break;
 	}
 
 	/*
 	 * Second pass: restart from the first interface on which a deprecated
 	 * address was skipped and collect only deprecated addresses.
 	 */
 	if (allow_deprecated == 0 && ifp_dep != NULL) {
 		ifp = ifp_dep;
 		allow_deprecated = 1;
 
 		goto again;
 	}
 
 	return (copied);
 }
 
 /*
  * Match predicate for the inpcb iterator used by icmp6_rip6_input().
  *
  * v points to the IPv6 header of the received packet.  A PCB matches when
  * it is an IPv6 PCB bound to ICMPv6 and each of its local and foreign
  * addresses is either unspecified or equal to the packet's destination and
  * source address respectively.
  */
 static bool
 icmp6_rip6_match(const struct inpcb *inp, void *v)
 {
 	struct ip6_hdr *hdr = v;
 
 	return ((inp->inp_vflag & INP_IPV6) != 0 &&
 	    inp->inp_ip_p == IPPROTO_ICMPV6 &&
 	    (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
 	    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &hdr->ip6_dst)) &&
 	    (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 	    IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &hdr->ip6_src)));
 }
 
 /*
  * Deliver a received ICMPv6 packet to every matching raw socket.
  *
  * *mp is the packet and off is the offset of the ICMPv6 header.  Each PCB
  * selected by icmp6_rip6_match() whose ICMPv6 filter does not block the
  * message type receives its own copy with the leading off bytes stripped.
  * The original mbuf is always freed and *mp set to NULL; the return value
  * is always IPPROTO_DONE.  If no socket accepted a copy, ip6s_delivered is
  * decremented.
  *
  * XXX almost dup'ed code with rip6_input.
  */
 static int
 icmp6_rip6_input(struct mbuf **mp, int off)
 {
 	struct mbuf *n, *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
 	    INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6);
 	struct inpcb *inp;
 	struct sockaddr_in6 fromsa;
 	struct icmp6_hdr *icmp6;
 	struct mbuf *opts = NULL;
 	int delivered = 0;
 
 	/* This is assumed to be safe; icmp6_input() does a pullup. */
 	icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off);
 
 	/*
 	 * XXX: the address may have embedded scope zone ID, which should be
 	 * hidden from applications.
 	 */
 	bzero(&fromsa, sizeof(fromsa));
 	fromsa.sin6_family = AF_INET6;
 	fromsa.sin6_len = sizeof(struct sockaddr_in6);
 	fromsa.sin6_addr = ip6->ip6_src;
 	if (sa6_recoverscope(&fromsa)) {
 		m_freem(m);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		/* Honor the socket's ICMPv6 type filter. */
 		if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type,
 		    inp->in6p_icmp6filt))
 			continue;
 		/*
 		 * Recent network drivers tend to allocate a single
 		 * mbuf cluster, rather than to make a couple of
 		 * mbufs without clusters.  Also, since the IPv6 code
 		 * path tries to avoid m_pullup(), it is highly
 		 * probable that we still have an mbuf cluster here
 		 * even though the necessary length can be stored in an
 		 * mbuf's internal buffer.
 		 * Meanwhile, the default size of the receive socket
 		 * buffer for raw sockets is not so large.  This means
 		 * the possibility of packet loss is relatively higher
 		 * than before.  To avoid this scenario, we copy the
 		 * received data to a separate mbuf that does not use
 		 * a cluster, if possible.
 		 * XXX: it is better to copy the data after stripping
 		 * intermediate headers.
 		 */
 		if ((m->m_flags & M_EXT) && m->m_next == NULL &&
 		    m->m_len <= MHLEN) {
 			n = m_get(M_NOWAIT, m->m_type);
 			if (n != NULL) {
 				if (m_dup_pkthdr(n, m, M_NOWAIT)) {
 					bcopy(m->m_data, n->m_data, m->m_len);
 					n->m_len = m->m_len;
 				} else {
 					m_free(n);
 					n = NULL;
 				}
 			}
 		} else
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		/* A failed copy just skips this socket. */
 		if (n == NULL)
 			continue;
 		if (inp->inp_flags & INP_CONTROLOPTS)
 			ip6_savecontrol(inp, n, &opts);
 		/* strip intermediate headers */
 		m_adj(n, off);
 		SOCKBUF_LOCK(&inp->inp_socket->so_rcv);
 		if (sbappendaddr_locked(&inp->inp_socket->so_rcv,
 		    (struct sockaddr *)&fromsa, n, opts) == 0) {
 			/* Append failed: count the overflow and drop the copy. */
 			soroverflow_locked(inp->inp_socket);
 			m_freem(n);
 			if (opts)
 				m_freem(opts);
 		} else {
 			sorwakeup_locked(inp->inp_socket);
 			delivered++;
 		}
 		/* Control options are built afresh for each socket. */
 		opts = NULL;
 	}
 	m_freem(m);
 	*mp = NULL;
 	if (delivered == 0)
 		IP6STAT_DEC(ip6s_delivered);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Reflect the ip6 packet back to the source.
  * OFF points to the icmp6 header, counted from the top of the mbuf.
  *
  * The packet is rewritten in place: any headers between the IPv6 and
  * ICMPv6 headers are removed, source and destination are swapped (with the
  * new source chosen as described below), the ICMPv6 checksum is
  * recomputed and the result is passed to ip6_output().  The mbuf is
  * consumed on every path: it is handed to ip6_output(), freed at the bad
  * label, or already NULL after a failed m_pullup().
  */
 static void
 icmp6_reflect(struct mbuf *m, size_t off)
 {
 	struct in6_addr src6, *srcp;
 	struct ip6_hdr *ip6;
 	struct icmp6_hdr *icmp6;
 	struct in6_ifaddr *ia = NULL;
 	struct ifnet *outif = NULL;
 	int plen;
 	int type, code, hlim;
 
 	/* too short to reflect */
 	if (off < sizeof(struct ip6_hdr)) {
 		nd6log((LOG_DEBUG,
 		    "sanity fail: off=%lx, sizeof(ip6)=%lx in %s:%d\n",
 		    (u_long)off, (u_long)sizeof(struct ip6_hdr),
 		    __FILE__, __LINE__));
 		goto bad;
 	}
 
 	/*
 	 * If there are extra headers between IPv6 and ICMPv6, strip
 	 * off that header first.
 	 */
 #ifdef DIAGNOSTIC
 	if (sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) > MHLEN)
 		panic("assumption failed in icmp6_reflect");
 #endif
 	if (off > sizeof(struct ip6_hdr)) {
 		size_t l;
 		struct ip6_hdr nip6;
 
 		/*
 		 * Save the IPv6 header, trim the intermediate headers from
 		 * the front, then write the saved header back at the new
 		 * start of the data.
 		 */
 		l = off - sizeof(struct ip6_hdr);
 		m_copydata(m, 0, sizeof(nip6), (caddr_t)&nip6);
 		m_adj(m, l);
 		l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 		if (m->m_len < l) {
 			if ((m = m_pullup(m, l)) == NULL)
 				return;
 		}
 		bcopy((caddr_t)&nip6, mtod(m, caddr_t), sizeof(nip6));
 	} else /* off == sizeof(struct ip6_hdr) */ {
 		size_t l;
 		l = sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr);
 		if (m->m_len < l) {
 			if ((m = m_pullup(m, l)) == NULL)
 				return;
 		}
 	}
 	plen = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 	type = icmp6->icmp6_type; /* keep type for statistics */
 	code = icmp6->icmp6_code; /* ditto. */
 	hlim = 0;
 	srcp = NULL;
 
 	/*
 	 * If the incoming packet was addressed directly to us (i.e. unicast),
 	 * use dst as the src for the reply.
 	 * The IN6_IFF_NOTREADY case should be VERY rare, but is possible
 	 * (for example) when we encounter an error while forwarding procedure
 	 * destined to a duplicated address of ours.
 	 */
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */, false);
 		if (ia != NULL && !(ia->ia6_flags &
 		    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY))) {
 			src6 = ia->ia_addr.sin6_addr;
 			srcp = &src6;
 
 			if (m->m_pkthdr.rcvif != NULL) {
 				/* XXX: This may not be the outgoing interface */
 				hlim = ND_IFINFO(m->m_pkthdr.rcvif)->chlim;
 			} else
 				hlim = V_ip6_defhlim;
 		}
 	}
 
 	if (srcp == NULL) {
 		int error;
 		struct in6_addr dst6;
 		uint32_t scopeid;
 
 		/*
 		 * This case matches to multicasts, our anycast, or unicasts
 		 * that we do not own.  Select a source address based on the
 		 * source address of the erroneous packet.
 		 */
 		in6_splitscope(&ip6->ip6_src, &dst6, &scopeid);
 		error = in6_selectsrc_addr(M_GETFIB(m), &dst6,
 		    scopeid, NULL, &src6, &hlim);
 
 		if (error) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			nd6log((LOG_DEBUG,
 			    "icmp6_reflect: source can't be determined: "
 			    "dst=%s, error=%d\n",
 			    ip6_sprintf(ip6buf, &ip6->ip6_dst), error));
 			goto bad;
 		}
 		srcp = &src6;
 	}
 	/*
 	 * ip6_input() drops a packet if its src is multicast.
 	 * So, the src is never multicast.
 	 */
 	ip6->ip6_dst = ip6->ip6_src;
 	ip6->ip6_src = *srcp;
 	/* Clear the flow word, then restore the version bits it held. */
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = hlim;
 
 	/* The checksum field must be zero while the sum is computed. */
 	icmp6->icmp6_cksum = 0;
 	icmp6->icmp6_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), plen);
 
 	/*
 	 * XXX option handling
 	 */
 
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 	m->m_pkthdr.rcvif = NULL;
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	/* Per-interface output statistics, when the interface is known. */
 	if (outif)
 		icmp6_ifoutstat_inc(outif, type, code);
 
 	return;
 
  bad:
 	m_freem(m);
 	return;
 }
 
 /*
  * Format the source, destination and target addresses of a redirect as
  * "(src=... dst=... tgt=...)" for log messages.
  *
  * The result lives in a function-local static buffer, so it is overwritten
  * by the next call.
  */
 static const char *
 icmp6_redirect_diag(struct in6_addr *src6, struct in6_addr *dst6,
     struct in6_addr *tgt6)
 {
 	static char diag[1024];
 	char srcbuf[INET6_ADDRSTRLEN];
 	char dstbuf[INET6_ADDRSTRLEN];
 	char tgtbuf[INET6_ADDRSTRLEN];
 	const char *src_str, *dst_str, *tgt_str;
 
 	/* Each address is rendered into its own scratch buffer. */
 	src_str = ip6_sprintf(srcbuf, src6);
 	dst_str = ip6_sprintf(dstbuf, dst6);
 	tgt_str = ip6_sprintf(tgtbuf, tgt6);
 
 	snprintf(diag, sizeof(diag), "(src=%s dst=%s tgt=%s)",
 	    src_str, dst_str, tgt_str);
 	return (diag);
 }
 
 /*
  * Process a received ICMPv6 Redirect message.
  *
  * m is the packet (it must carry a packet header with a receive
  * interface) and off is the offset of the redirect header.  Redirects are
  * ignored when this node forwards packets or when accepting them is
  * disabled.  After validation the target's link-layer address is fed to
  * the neighbor cache and a redirect route is added in every FIB.
  *
  * The mbuf is consumed on every path.  Rejections that go through the bad
  * label also bump icp6s_badredirect; those through freeit do not.
  */
 void
 icmp6_redirect_input(struct mbuf *m, int off)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct nd_redirect *nd_rd;
 	struct in6_addr src6, redtgt6, reddst6;
 	union nd_opts ndopts;
 	char ip6buf[INET6_ADDRSTRLEN];
 	char *lladdr;
 	int icmp6len, is_onlink, is_router, lladdrlen;
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.rcvif != NULL, ("%s: no rcvif", __func__));
 
 	/* XXX if we are router, we don't update route by icmp6 redirect */
 	if (V_ip6_forwarding)
 		goto freeit;
 	if (!V_icmp6_rediraccept)
 		goto freeit;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	/* Make the whole message contiguous before parsing it. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	icmp6len = ntohs(ip6->ip6_plen);
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	/* Re-derive the header pointer; the pullup may have moved the data. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_rd = (struct nd_redirect *)((caddr_t)ip6 + off);
 
 	ifp = m->m_pkthdr.rcvif;
 	redtgt6 = nd_rd->nd_rd_target;
 	reddst6 = nd_rd->nd_rd_dst;
 
 	if (in6_setscope(&redtgt6, ifp, NULL) ||
 	    in6_setscope(&reddst6, ifp, NULL)) {
 		goto freeit;
 	}
 
 	/* validation */
 	src6 = ip6->ip6_src;
 	if (!IN6_IS_ADDR_LINKLOCAL(&src6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "must be from linklocal\n",
 		    ip6_sprintf(ip6buf, &src6)));
 		goto bad;
 	}
 	if (__predict_false(ip6->ip6_hlim != 255)) {
 		ICMP6STAT_INC(icp6s_invlhlim);
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect sent from %s rejected; "
 		    "hlim=%d (must be 255)\n",
 		    ip6_sprintf(ip6buf, &src6), ip6->ip6_hlim));
 		goto bad;
 	}
     {
 	/* ip6->ip6_src must be equal to gw for icmp6->icmp6_reddst */
 	struct nhop_object *nh;
 	struct in6_addr kdst;
 	uint32_t scopeid;
 
 	in6_splitscope(&reddst6, &kdst, &scopeid);
 	NET_EPOCH_ASSERT();
 	nh = fib6_lookup(ifp->if_fib, &kdst, scopeid, 0, 0);
 	if (nh != NULL) {
 		struct in6_addr nh_addr;
 		/*
 		 * NOTE(review): this value is overwritten with the gateway
 		 * address below before it is ever read.
 		 */
 		nh_addr = ifatoia6(nh->nh_ifa)->ia_addr.sin6_addr;
 		if ((nh->nh_flags & NHF_GATEWAY) == 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; no route "
 			    "with inet6 gateway found for redirect dst: %s\n",
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			goto bad;
 		}
 
 		/*
 		 * Embed scope zone id into next hop address.
 		 */
 		nh_addr = nh->gw6_sa.sin6_addr;
 
 		/* The sender must be the current gateway for the destination. */
 		if (IN6_ARE_ADDR_EQUAL(&src6, &nh_addr) == 0) {
 			nd6log((LOG_ERR,
 			    "ICMP6 redirect rejected; "
 			    "not equal to gw-for-src=%s (must be same): "
 			    "%s\n",
 			    ip6_sprintf(ip6buf, &nh_addr),
 			    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 			goto bad;
 		}
 	} else {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "no route found for redirect dst: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
     }
 	if (IN6_IS_ADDR_MULTICAST(&reddst6)) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "redirect dst must be unicast: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/*
 	 * Classify the redirect: a link-local target is the router case, a
 	 * target equal to the destination is the on-link case.  Anything
 	 * else is rejected.
 	 */
 	is_router = is_onlink = 0;
 	if (IN6_IS_ADDR_LINKLOCAL(&redtgt6))
 		is_router = 1;	/* router case */
 	if (bcmp(&redtgt6, &reddst6, sizeof(redtgt6)) == 0)
 		is_onlink = 1;	/* on-link destination case */
 	if (!is_router && !is_onlink) {
 		nd6log((LOG_ERR,
 		    "ICMP6 redirect rejected; "
 		    "neither router case nor onlink case: %s\n",
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* Whatever follows the fixed redirect header is ND options. */
 	icmp6len -= sizeof(*nd_rd);
 	nd6_option_init(nd_rd + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO, "%s: invalid ND option, rejected: %s\n",
 		    __func__, icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	lladdr = NULL;
 	lladdrlen = 0;
 	if (ndopts.nd_opts_tgt_lladdr) {
 		lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
 		/* The option length field counts units of 8 bytes. */
 		lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 	}
 
 	/*
 	 * The option must be exactly as long as this interface's link-layer
 	 * address plus 2 bytes, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "%s: lladdrlen mismatch for %s "
 		    "(if %d, icmp6 packet %d): %s\n",
 		    __func__, ip6_sprintf(ip6buf, &redtgt6),
 		    ifp->if_addrlen, lladdrlen - 2,
 		    icmp6_redirect_diag(&src6, &reddst6, &redtgt6)));
 		goto bad;
 	}
 
 	/* Validation passed. */
 
 	/* RFC 2461 8.3 */
 	nd6_cache_lladdr(ifp, &redtgt6, lladdr, lladdrlen, ND_REDIRECT,
 	    is_onlink ? ND_REDIRECT_ONLINK : ND_REDIRECT_ROUTER);
 
 	/*
 	 * Install a gateway route in the better-router case or an interface
 	 * route in the on-link-destination case.
 	 */
 	{
 		struct sockaddr_in6 sdst;
 		struct sockaddr_in6 sgw;
 		struct sockaddr_in6 ssrc;
 		struct sockaddr *gw;
 		int rt_flags;
 		u_int fibnum;
 
 		bzero(&sdst, sizeof(sdst));
 		bzero(&ssrc, sizeof(ssrc));
 		sdst.sin6_family = ssrc.sin6_family = AF_INET6;
 		sdst.sin6_len = ssrc.sin6_len = sizeof(struct sockaddr_in6);
 		bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr));
 		bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr));
 		rt_flags = 0;
 		if (is_router) {
 			bzero(&sgw, sizeof(sgw));
 			sgw.sin6_family = AF_INET6;
 			sgw.sin6_len = sizeof(struct sockaddr_in6);
 			bcopy(&redtgt6, &sgw.sin6_addr,
 				sizeof(struct in6_addr));
 			gw = (struct sockaddr *)&sgw;
 			rt_flags |= RTF_GATEWAY;
 		} else
 			gw = ifp->if_addr->ifa_addr;
 		/* The redirect is recorded in every routing table. */
 		for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
 			rib_add_redirect(fibnum, (struct sockaddr *)&sdst, gw,
 			    (struct sockaddr *)&ssrc, ifp, rt_flags,
 			    V_icmp6_redirtimeout);
 	}
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badredirect);
 	m_freem(m);
 }
 
 /*
  * Build and transmit an ICMPv6 Redirect in response to the packet m0,
  * which is being forwarded through next hop nh.
  *
  * m0 - the packet that triggered the redirect.  It is consumed on every
  *      path: it is either chained behind the reply as the payload of the
  *      Redirected Header option or freed.
  * nh - next hop used for m0; supplies the outgoing interface and, for
  *      gateway routes, the link-local address of the better router.
  *
  * Nothing is sent unless V_ip6_forwarding is set, the source of m0 is a
  * neighbor on the outgoing interface, the destination is not multicast
  * and icmp6_ratelimit() permits it.
  */
 void
 icmp6_redirect_output(struct mbuf *m0, struct nhop_object *nh)
 {
 	struct ifnet *ifp;	/* my outgoing interface */
 	struct in6_addr *ifp_ll6;
 	struct in6_addr *router_ll6;
 	struct ip6_hdr *sip6;	/* m0 as struct ip6_hdr */
 	struct mbuf *m = NULL;	/* newly allocated one */
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;	/* m as struct ip6_hdr */
 	struct nd_redirect *nd_rd;
 	struct llentry *ln = NULL;
 	size_t maxlen;
 	u_char *p;
 	struct ifnet *outif = NULL;
 	struct sockaddr_in6 src_sa;
 
 	icmp6_errcount(ND_REDIRECT, 0);
 
 	/* if we are not router, we don't send icmp6 redirect */
 	if (!V_ip6_forwarding)
 		goto fail;
 
 	/* sanity check */
 	if (!m0 || !nh || !(NH_IS_VALID(nh)) || !(ifp = nh->nh_ifp))
 		goto fail;
 
 	/*
 	 * Address check:
 	 *  the source address must identify a neighbor, and
 	 *  the destination address must not be a multicast address
 	 *  [RFC 2461, sec 8.2]
 	 */
 	sip6 = mtod(m0, struct ip6_hdr *);
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = sip6->ip6_src;
 	if (nd6_is_addr_neighbor(&src_sa, ifp) == 0)
 		goto fail;
 	if (IN6_IS_ADDR_MULTICAST(&sip6->ip6_dst))
 		goto fail;	/* what should we do here? */
 
 	/* rate limit */
 	if (icmp6_ratelimit(&sip6->ip6_src, ND_REDIRECT, 0))
 		goto fail;
 
 	/*
 	 * Since we are going to append up to 1280 bytes (= IPV6_MMTU),
 	 * we almost always ask for an mbuf cluster for simplicity.
 	 * (MHLEN < IPV6_MMTU is almost always true)
 	 */
 #if IPV6_MMTU >= MCLBYTES
 # error assumption failed about IPV6_MMTU and MCLBYTES
 #endif
 	m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		goto fail;
 	M_SETFIB(m, M_GETFIB(m0));
 	maxlen = M_TRAILINGSPACE(m);
 	maxlen = min(IPV6_MMTU, maxlen);
 	/* just for safety */
 	if (maxlen < sizeof(struct ip6_hdr) + sizeof(struct icmp6_hdr) +
 	    ((sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7)) {
 		goto fail;
 	}
 
 	{
 		/* get ip6 linklocal address for ifp(my outgoing interface). */
 		struct in6_ifaddr *ia;
 		if ((ia = in6ifa_ifpforlinklocal(ifp,
 						 IN6_IFF_NOTREADY|
 						 IN6_IFF_ANYCAST)) == NULL)
 			goto fail;
 		ifp_ll6 = &ia->ia_addr.sin6_addr;
 		/* XXXRW: reference released prematurely. */
 		ifa_free(&ia->ia_ifa);
 	}
 
 	/* get ip6 linklocal address for the router. */
 	if (nh->nh_flags & NHF_GATEWAY) {
 		struct sockaddr_in6 *sin6;
 		sin6 = &nh->gw6_sa;
 		router_ll6 = &sin6->sin6_addr;
 		if (!IN6_IS_ADDR_LINKLOCAL(router_ll6))
 			router_ll6 = (struct in6_addr *)NULL;
 	} else
 		router_ll6 = (struct in6_addr *)NULL;
 
 	/* ip6 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	/* ip6->ip6_plen will be set later */
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	/* ip6->ip6_src must be linklocal addr for my outgoing if. */
 	bcopy(ifp_ll6, &ip6->ip6_src, sizeof(struct in6_addr));
 	bcopy(&sip6->ip6_src, &ip6->ip6_dst, sizeof(struct in6_addr));
 
 	/* ND Redirect */
 	nd_rd = (struct nd_redirect *)(ip6 + 1);
 	nd_rd->nd_rd_type = ND_REDIRECT;
 	nd_rd->nd_rd_code = 0;
 	nd_rd->nd_rd_reserved = 0;
 	if (nh->nh_flags & NHF_GATEWAY) {
 		/*
 		 * nd_rd->nd_rd_target must be a link-local address in
 		 * better router cases.
 		 */
 		if (!router_ll6)
 			goto fail;
 		bcopy(router_ll6, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	} else {
 		/* make sure redtgt == reddst */
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_target,
 		    sizeof(nd_rd->nd_rd_target));
 		bcopy(&sip6->ip6_dst, &nd_rd->nd_rd_dst,
 		    sizeof(nd_rd->nd_rd_dst));
 	}
 
 	/* p tracks the end of the message built so far inside m */
 	p = (u_char *)(nd_rd + 1);
 
 	if (!router_ll6)
 		goto nolladdropt;
 
 	{
 		/* target lladdr option */
 		int len;
 		struct nd_opt_hdr *nd_opt;
 		char *lladdr;
 
 		ln = nd6_lookup(router_ll6, LLE_SF(AF_INET6,  0), ifp);
 		if (ln == NULL)
 			goto nolladdropt;
 
 		len = sizeof(*nd_opt) + ifp->if_addrlen;
 		len = (len + 7) & ~7;	/* round by 8 */
 		/* safety check */
 		if (len + (p - (u_char *)ip6) > maxlen)
 			goto nolladdropt;
 
 		if (ln->la_flags & LLE_VALID) {
 			nd_opt = (struct nd_opt_hdr *)p;
 			nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
 			nd_opt->nd_opt_len = len >> 3;
 			lladdr = (char *)(nd_opt + 1);
 			bcopy(ln->ll_addr, lladdr, ifp->if_addrlen);
 			p += len;
 		}
 	}
 nolladdropt:
 	if (ln != NULL)
 		LLE_RUNLOCK(ln);
 
 	m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 	/* just to be safe */
 #ifdef M_DECRYPTED	/*not openbsd*/
 	if (m0->m_flags & M_DECRYPTED)
 		goto noredhdropt;
 #endif
 	if (p - (u_char *)ip6 > maxlen)
 		goto noredhdropt;
 
 	{
 		/* redirected header option */
 		int len;
 		struct nd_opt_rd_hdr *nd_opt_rh;
 
 		/*
 		 * compute the maximum size for icmp6 redirect header option.
 		 * XXX room for auth header?
 		 */
 		len = maxlen - (p - (u_char *)ip6);
 		len &= ~7;
 
 		/* This is just for simplicity. */
 		if (m0->m_pkthdr.len != m0->m_len) {
 			if (m0->m_next) {
 				m_freem(m0->m_next);
 				m0->m_next = NULL;
 			}
 			m0->m_pkthdr.len = m0->m_len;
 		}
 
 		/*
 		 * Redirected header option spec (RFC2461 4.6.3) talks nothing
 		 * about padding/truncate rule for the original IP packet.
 		 * From the discussion on IPv6imp in Feb 1999,
 		 * the consensus was:
 		 * - "attach as much as possible" is the goal
 		 * - pad if not aligned (original size can be guessed by
 		 *   original ip6 header)
 		 * Following code adds the padding if it is simple enough,
 		 * and truncates if not.
 		 */
 		if (m0->m_next || m0->m_pkthdr.len != m0->m_len)
 			panic("assumption failed in %s:%d", __FILE__,
 			    __LINE__);
 
 		if (len - sizeof(*nd_opt_rh) < m0->m_pkthdr.len) {
 			/* not enough room, truncate */
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		} else {
 			/* enough room, pad or truncate */
 			size_t extra;
 
 			extra = m0->m_pkthdr.len % 8;
 			if (extra) {
 				/* pad if easy enough, truncate if not */
 				if (8 - extra <= M_TRAILINGSPACE(m0)) {
 					/* pad */
 					m0->m_len += (8 - extra);
 					m0->m_pkthdr.len += (8 - extra);
 				} else {
 					/* truncate */
 					m0->m_pkthdr.len -= extra;
 					m0->m_len -= extra;
 				}
 			}
 			len = m0->m_pkthdr.len + sizeof(*nd_opt_rh);
 			m0->m_pkthdr.len = m0->m_len = len -
 			    sizeof(*nd_opt_rh);
 		}
 
 		nd_opt_rh = (struct nd_opt_rd_hdr *)p;
 		bzero(nd_opt_rh, sizeof(*nd_opt_rh));
 		nd_opt_rh->nd_opt_rh_type = ND_OPT_REDIRECTED_HEADER;
 		nd_opt_rh->nd_opt_rh_len = len >> 3;
 		p += sizeof(*nd_opt_rh);
 		m->m_pkthdr.len = m->m_len = p - (u_char *)ip6;
 
 		/*
 		 * XXX: clear embedded link IDs in the inner header.
 		 * This must happen here, while the storage sip6 points into
 		 * is known to be alive: on the noredhdropt path m0 is freed
 		 * and sip6 must not be touched afterwards.
 		 */
 		in6_clearscope(&sip6->ip6_src);
 		in6_clearscope(&sip6->ip6_dst);
 
 		/* connect m0 to m */
 		m_tag_delete_chain(m0, NULL);
 		m0->m_flags &= ~M_PKTHDR;
 		m->m_next = m0;
 		m->m_pkthdr.len = m->m_len + m0->m_len;
 		m0 = NULL;	/* ownership moved to the chain headed by m */
 	}
 noredhdropt:;
 	if (m0) {
 		/* the original packet is not attached; sip6 dies with it */
 		m_freem(m0);
 		m0 = NULL;
 	}
 
 	/* XXX: clear embedded link IDs in the redirect message itself */
 	in6_clearscope(&nd_rd->nd_rd_target);
 	in6_clearscope(&nd_rd->nd_rd_dst);
 
 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
 	nd_rd->nd_rd_cksum = 0;
 	nd_rd->nd_rd_cksum = in6_cksum(m, IPPROTO_ICMPV6,
 	    sizeof(*ip6), ntohs(ip6->ip6_plen));
 
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING, sizeof(unsigned short),
 			M_NOWAIT);
 		if (mtag == NULL)
 			goto fail;
 		*(unsigned short *)(mtag + 1) = nd_rd->nd_rd_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	/* send the packet to outside... */
 	ip6_output(m, NULL, NULL, 0, NULL, &outif, NULL);
 	if (outif) {
 		icmp6_ifstat_inc(outif, ifs6_out_msg);
 		icmp6_ifstat_inc(outif, ifs6_out_redirect);
 	}
 	ICMP6STAT_INC(icp6s_outhist[ND_REDIRECT]);
 
 	return;
 
 fail:
 	/* m0 is NULL here once it has been chained behind m */
 	if (m)
 		m_freem(m);
 	if (m0)
 		m_freem(m0);
 }
 
 /*
  * ICMPv6 socket option processing.
  */
 int
 icmp6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0;
 	int optlen;
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	if (sopt) {
 		level = sopt->sopt_level;
 		op = sopt->sopt_dir;
 		optname = sopt->sopt_name;
 		optlen = sopt->sopt_valsize;
 	} else
 		level = op = optname = optlen = 0;
 
 	if (level != IPPROTO_ICMPV6) {
 		return EINVAL;
 	}
 
 	switch (op) {
 	case PRCO_SETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			if (optlen != sizeof(ic6f)) {
 				error = EMSGSIZE;
 				break;
 			}
 			error = sooptcopyin(sopt, &ic6f, optlen, optlen);
 			if (error == 0) {
 				INP_WLOCK(inp);
 				*inp->in6p_icmp6filt = ic6f;
 				INP_WUNLOCK(inp);
 			}
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case PRCO_GETOPT:
 		switch (optname) {
 		case ICMP6_FILTER:
 		    {
 			struct icmp6_filter ic6f;
 
 			INP_RLOCK(inp);
 			ic6f = *inp->in6p_icmp6filt;
 			INP_RUNLOCK(inp);
 			error = sooptcopyout(sopt, &ic6f, sizeof(ic6f));
 			break;
 		    }
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Decide whether an ICMPv6 packet may be sent under the packets-per-second
  * limit.
  *
  * Returns 0 when sending is allowed and 1 when the packet SHOULD NOT be
  * sent because the limit has been reached.
  *
  * XXX per-destination/type check necessary?
  *
  * dst, type and code are accepted for interface completeness but are not
  * consulted at this moment.
  */
 int
 icmp6_ratelimit(const struct in6_addr *dst, const int type,
     const int code)
 {
 
 	/* PPS limit: a zero result means the packet is subject to it. */
 	if (ppsratecheck(&V_icmp6errppslim_last, &V_icmp6errpps_count,
 	    V_icmp6errppslim) == 0)
 		return 1;
 
 	return 0;	/* okay to send */
 }
diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c
index 0a00ea6b8be4..27dc3550177c 100644
--- a/sys/netinet6/in6.c
+++ b/sys/netinet6/in6.c
@@ -1,2729 +1,2730 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in.c	8.2 (Berkeley) 11/15/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/errno.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/nhop.h>
 #include <net/if_dl.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_carp.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 
 /*
  * struct in6_ifreq and struct ifreq must be type punnable for common members
  * of ifr_ifru to allow accessors to be shared.
  */
 _Static_assert(offsetof(struct in6_ifreq, ifr_ifru) ==
     offsetof(struct ifreq, ifr_ifru),
     "struct in6_ifreq and struct ifreq are not type punnable");
 
 VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix);
 #define V_icmp6_nodeinfo_oldmcprefix	VNET(icmp6_nodeinfo_oldmcprefix)
 
 /*
  * Definitions of some constant IP6 addresses.
  */
 const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
 const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
 const struct in6_addr in6addr_nodelocal_allnodes =
 	IN6ADDR_NODELOCAL_ALLNODES_INIT;
 const struct in6_addr in6addr_linklocal_allnodes =
 	IN6ADDR_LINKLOCAL_ALLNODES_INIT;
 const struct in6_addr in6addr_linklocal_allrouters =
 	IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
 const struct in6_addr in6addr_linklocal_allv2routers =
 	IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT;
 
 /* Mask constants, each initialised from the matching IN6MASK* macro. */
 const struct in6_addr in6mask0 = IN6MASK0;
 const struct in6_addr in6mask32 = IN6MASK32;
 const struct in6_addr in6mask64 = IN6MASK64;
 const struct in6_addr in6mask96 = IN6MASK96;
 const struct in6_addr in6mask128 = IN6MASK128;
 
 /* AF_INET6 socket address carrying IN6ADDR_ANY_INIT; other fields zero. */
 const struct sockaddr_in6 sa6_any =
 	{ sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 };
 
 /* Forward declarations of file-local helpers. */
 static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *,
 	struct in6_aliasreq *, int);
 static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
 
 static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int);
 static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *,
     struct in6_aliasreq *, int flags);
 static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int, int);
 static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *,
     struct in6_ifaddr *, int);
 
 static void in6_join_proxy_ndp_mc(struct ifnet *, const struct in6_addr *);
 static void in6_leave_proxy_ndp_mc(struct ifnet *, const struct in6_addr *);
 
 /*
  * Conversions between the generic and the IPv6 interface address views:
  * ifa2ia6() is a plain pointer cast, ia62ifa() takes the address of the
  * ia_ifa member.
  */
 #define ifa2ia6(ifa)	((struct in6_ifaddr *)(ifa))
 #define ia62ifa(ia6)	(&((ia6)->ia_ifa))
 
 void
 in6_newaddrmsg(struct in6_ifaddr *ia, int cmd)
 {
 	struct rt_addrinfo info;
 	struct ifaddr *ifa;
 	struct sockaddr_dl gateway;
 	int fibnum;
 
 	ifa = &ia->ia_ifa;
 
 	/*
 	 * Prepare info data for the host route.
 	 * This code mimics one from ifa_maintain_loopback_route().
 	 */
 	bzero(&info, sizeof(struct rt_addrinfo));
 	info.rti_flags = ifa->ifa_flags | RTF_HOST | RTF_STATIC | RTF_PINNED;
 	info.rti_info[RTAX_DST] = ifa->ifa_addr;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gateway;
 	link_init_sdl(ifa->ifa_ifp, (struct sockaddr *)&gateway, ifa->ifa_ifp->if_type);
 	if (cmd != RTM_DELETE)
 		info.rti_ifp = V_loif;
 
 	fibnum = ia62ifa(ia)->ifa_ifp->if_fib;
 
 	if (cmd == RTM_ADD) {
 		rt_addrmsg(cmd, &ia->ia_ifa, fibnum);
 		rt_routemsg_info(cmd, &info, fibnum);
 	} else if (cmd == RTM_DELETE) {
 		rt_routemsg_info(cmd, &info, fibnum);
 		rt_addrmsg(cmd, &ia->ia_ifa, fibnum);
 	}
 }
 
 /*
  * Convert an IPv6 netmask into a prefix length.
  *
  * mask - the mask to examine.
  * lim0 - one past the last byte of the mask to look at.  When NULL, or
  *        when it lies more than sizeof(*mask) bytes beyond mask, the whole
  *        address is examined.
  *
  * Returns the number of leading one bits, or -1 when the mask is not
  * contiguous, i.e. any one bit follows the first zero bit.
  */
 int
 in6_mask2len(struct in6_addr *mask, u_char *lim0)
 {
 	int x = 0, y;
 	u_char *lim = lim0, *p;
 
 	/* ignore the scope_id part */
 	if (lim0 == NULL || (size_t)(lim0 - (u_char *)mask) > sizeof(*mask))
 		lim = (u_char *)mask + sizeof(*mask);
 
 	/* x counts the leading bytes that are entirely ones */
 	for (p = (u_char *)mask; p < lim; x++, p++) {
 		if (*p != 0xff)
 			break;
 	}
 	if (p >= lim)
 		return (x * 8);
 
 	/* y counts the leading one bits of the first partial byte */
 	for (y = 0; y < 8; y++) {
 		if ((*p & (0x80 >> y)) == 0)
 			break;
 	}
 
 	/*
 	 * Everything after the first zero bit must be zero.  The rest of
 	 * the partial byte is checked even when y == 0, so that a byte such
 	 * as 0x7f is rejected instead of being taken as the end of the
 	 * prefix.
 	 */
 	if ((*p & (0x00ff >> y)) != 0)
 		return (-1);
 	for (p = p + 1; p < lim; p++)
 		if (*p != 0)
 			return (-1);
 
 	return x * 8 + y;
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Argument layout of SIOCGDEFIFACE32_IN6: an interface name buffer
  * followed by a 32-bit interface index.
  * NOTE(review): presumably the counterpart of struct in6_ndifreq for
  * 32-bit callers -- confirm against the native definition.
  */
 struct in6_ndifreq32 {
 	char ifname[IFNAMSIZ];
 	uint32_t ifindex;
 };
 #define	SIOCGDEFIFACE32_IN6	_IOWR('i', 86, struct in6_ndifreq32)
 #endif
 
 /*
  * ioctl entry point for IPv6 interface and address requests.
  *
  * so   - socket the request arrived on (not referenced in this function).
  * cmd  - ioctl command; OSIOCAIFADDR_IN6 is rewritten to SIOCAIFADDR_IN6.
  * data - command argument, viewed as struct in6_ifreq or
  *        struct in6_aliasreq depending on cmd.
  * ifp  - target interface; may be NULL only for the multicast routing
  *        counter and address selection policy commands.
  * td   - calling thread; privilege and jail checks are skipped when NULL.
  *
  * Requests are handed, in this order, to mrt6_ioctl, in6_src_ioctl(),
  * nd6_ioctl() and scope6_ioctl(); the obsolete prefix ioctls are refused.
  * The remaining address commands are handled here and anything unknown
  * is passed to the interface's own if_ioctl routine.
  *
  * Returns 0 on success or an errno value.
  */
 int
 in6_control(struct socket *so, u_long cmd, void *data,
     struct ifnet *ifp, struct thread *td)
 {
 	struct	in6_ifreq *ifr = (struct in6_ifreq *)data;
 	struct	in6_ifaddr *ia = NULL;
 	struct	in6_aliasreq *ifra = (struct in6_aliasreq *)data;
 	struct sockaddr_in6 *sa6;
 	int error;
 
 	/*
 	 * Compat to make pre-10.x ifconfig(8) operable.
 	 */
 	if (cmd == OSIOCAIFADDR_IN6) {
 		cmd = SIOCAIFADDR_IN6;
 		ifra->ifra_vhid = 0;
 	}
 
 	switch (cmd) {
 	case SIOCGETSGCNT_IN6:
 	case SIOCGETMIFCNT_IN6:
 		/*
 		 * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c.
 		 * We cannot see how that would be needed, so do not adjust the
 		 * KPI blindly; more likely should clean up the IPv4 variant.
 		 */
 		return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP);
 	}
 
 	switch (cmd) {
 	case SIOCAADDRCTL_POLICY:
 	case SIOCDADDRCTL_POLICY:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ADDRCTRL6);
 			if (error)
 				return (error);
 		}
 		return (in6_src_ioctl(cmd, data));
 	}
 
 	/* Everything below operates on a specific interface. */
 	if (ifp == NULL)
 		return (EOPNOTSUPP);
 
 	switch (cmd) {
 	case SIOCSNDFLUSH_IN6:
 	case SIOCSPFXFLUSH_IN6:
 	case SIOCSRTRFLUSH_IN6:
 	case SIOCSDEFIFACE_IN6:
 	case SIOCSIFINFO_FLAGS:
 	case SIOCSIFINFO_IN6:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_ND6);
 			if (error)
 				return (error);
 		}
 		/* FALLTHROUGH */
 	case OSIOCGIFINFO_IN6:
 	case SIOCGIFINFO_IN6:
 	case SIOCGNBRINFO_IN6:
 	case SIOCGDEFIFACE_IN6:
 		return (nd6_ioctl(cmd, data, ifp));
 
 #ifdef COMPAT_FREEBSD32
 	case SIOCGDEFIFACE32_IN6:
 		{
 			struct in6_ndifreq ndif;
 			struct in6_ndifreq32 *ndif32;
 
 			/* Run the native request, then narrow the index. */
 			error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif,
 			    ifp);
 			if (error)
 				return (error);
 			ndif32 = (struct in6_ndifreq32 *)data;
 			ndif32->ifindex = ndif.ifindex;
 			return (0);
 		}
 #endif
 	}
 
 	switch (cmd) {
 	case SIOCSIFPREFIX_IN6:
 	case SIOCDIFPREFIX_IN6:
 	case SIOCAIFPREFIX_IN6:
 	case SIOCCIFPREFIX_IN6:
 	case SIOCSGIFPREFIX_IN6:
 	case SIOCGIFPREFIX_IN6:
 		log(LOG_NOTICE,
 		    "prefix ioctls are now invalidated. "
 		    "please use ifconfig.\n");
 		return (EOPNOTSUPP);
 	}
 
 	switch (cmd) {
 	case SIOCSSCOPE6:
 		if (td != NULL) {
 			error = priv_check(td, PRIV_NETINET_SCOPE6);
 			if (error)
 				return (error);
 		}
 		/* FALLTHROUGH */
 	case SIOCGSCOPE6:
 	case SIOCGSCOPE6DEF:
 		return (scope6_ioctl(cmd, data, ifp));
 	}
 
 	/*
 	 * Find address for this interface, if it exists.
 	 *
 	 * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
 	 * only, and used the first interface address as the target of other
 	 * operations (without checking ifra_addr).  This was because netinet
 	 * code/API assumed at most 1 interface address per interface.
 	 * Since IPv6 allows a node to assign multiple addresses
 	 * on a single interface, we almost always look and check the
 	 * presence of ifra_addr, and reject invalid ones here.
 	 * It also decreases duplicated code among SIOC*_IN6 operations.
 	 */
 	switch (cmd) {
 	case SIOCAIFADDR_IN6:
 	case SIOCSIFPHYADDR_IN6:
 		sa6 = &ifra->ifra_addr;
 		break;
 	case SIOCSIFADDR_IN6:
 	case SIOCGIFADDR_IN6:
 	case SIOCSIFDSTADDR_IN6:
 	case SIOCSIFNETMASK_IN6:
 	case SIOCGIFDSTADDR_IN6:
 	case SIOCGIFNETMASK_IN6:
 	case SIOCDIFADDR_IN6:
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 	case SIOCGIFAFLAG_IN6:
 	case SIOCSNDFLUSH_IN6:
 	case SIOCSPFXFLUSH_IN6:
 	case SIOCSRTRFLUSH_IN6:
 	case SIOCGIFALIFETIME_IN6:
 	case SIOCGIFSTAT_IN6:
 	case SIOCGIFSTAT_ICMP6:
 		sa6 = &ifr->ifr_addr;
 		break;
 	case SIOCSIFADDR:
 	case SIOCSIFBRDADDR:
 	case SIOCSIFDSTADDR:
 	case SIOCSIFNETMASK:
 		/*
 		 * Although we should pass any non-INET6 ioctl requests
 		 * down to driver, we filter some legacy INET requests.
 		 * Drivers trust SIOCSIFADDR et al to come from an already
 		 * privileged layer, and do not perform any credentials
 		 * checks or input validation.
 		 */
 		return (EINVAL);
 	default:
 		sa6 = NULL;
 		break;
 	}
 	if (sa6 && sa6->sin6_family == AF_INET6) {
 		if (sa6->sin6_scope_id != 0)
 			error = sa6_embedscope(sa6, 0);
 		else
 			error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
 		if (error != 0)
 			return (error);
 		if (td != NULL && (error = prison_check_ip6(td->td_ucred,
 		    &sa6->sin6_addr)) != 0)
 			return (error);
 		/* From here on, a non-NULL ia is released at "out". */
 		ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr);
 	} else
 		ia = NULL;
 
 	switch (cmd) {
 	case SIOCSIFADDR_IN6:
 	case SIOCSIFDSTADDR_IN6:
 	case SIOCSIFNETMASK_IN6:
 		/*
 		 * Since IPv6 allows a node to assign multiple addresses
 		 * on a single interface, SIOCSIFxxx ioctls are deprecated.
 		 */
 		/* we decided to obsolete this command (20000704) */
 		error = EINVAL;
 		goto out;
 
 	case SIOCDIFADDR_IN6:
 		/*
 		 * for IPv4, we look for existing in_ifaddr here to allow
 		 * "ifconfig if0 delete" to remove the first IPv4 address on
 		 * the interface.  For IPv6, as the spec allows multiple
 		 * interface address from the day one, we consider "remove the
 		 * first one" semantics to be not preferable.
 		 */
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		/* FALLTHROUGH */
 	case SIOCAIFADDR_IN6:
 		/*
 		 * We always require users to specify a valid IPv6 address for
 		 * the corresponding operation.
 		 */
 		if (ifra->ifra_addr.sin6_family != AF_INET6 ||
 		    ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 
 		if (td != NULL) {
 			error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ?
 			    PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR);
 			if (error)
 				goto out;
 		}
 		/* FALLTHROUGH */
 	case SIOCGIFSTAT_IN6:
 	case SIOCGIFSTAT_ICMP6:
 		if (ifp->if_afdata[AF_INET6] == NULL) {
 			error = EPFNOSUPPORT;
 			goto out;
 		}
 		break;
 
 	case SIOCGIFADDR_IN6:
 		/* This interface is basically deprecated. use SIOCGIFCONF. */
 		/* FALLTHROUGH */
 	case SIOCGIFAFLAG_IN6:
 	case SIOCGIFNETMASK_IN6:
 	case SIOCGIFDSTADDR_IN6:
 	case SIOCGIFALIFETIME_IN6:
 		/* must think again about its semantics */
 		if (ia == NULL) {
 			error = EADDRNOTAVAIL;
 			goto out;
 		}
 		break;
 	}
 
 	switch (cmd) {
 	case SIOCGIFADDR_IN6:
 		ifr->ifr_addr = ia->ia_addr;
 		if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0)
 			goto out;
 		break;
 
 	case SIOCGIFDSTADDR_IN6:
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
 			error = EINVAL;
 			goto out;
 		}
 		ifr->ifr_dstaddr = ia->ia_dstaddr;
 		if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0)
 			goto out;
 		break;
 
 	case SIOCGIFNETMASK_IN6:
 		ifr->ifr_addr = ia->ia_prefixmask;
 		break;
 
 	case SIOCGIFAFLAG_IN6:
 		ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
 		break;
 
 	case SIOCGIFSTAT_IN6:
 		COUNTER_ARRAY_COPY(((struct in6_ifextra *)
 		    ifp->if_afdata[AF_INET6])->in6_ifstat,
 		    &ifr->ifr_ifru.ifru_stat,
 		    sizeof(struct in6_ifstat) / sizeof(uint64_t));
 		break;
 
 	case SIOCGIFSTAT_ICMP6:
 		COUNTER_ARRAY_COPY(((struct in6_ifextra *)
 		    ifp->if_afdata[AF_INET6])->icmp6_ifstat,
 		    &ifr->ifr_ifru.ifru_icmp6stat,
 		    sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
 		break;
 
 	case SIOCGIFALIFETIME_IN6:
 		ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
 		if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 			time_t maxexpire;
 			struct in6_addrlifetime *retlt =
 			    &ifr->ifr_ifru.ifru_lifetime;
 
 			/*
 			 * XXX: adjust expiration time assuming time_t is
 			 * signed.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (ia->ia6_lifetime.ia6t_vltime <
 			    maxexpire - ia->ia6_updatetime) {
 				retlt->ia6t_expire = ia->ia6_updatetime +
 				    ia->ia6_lifetime.ia6t_vltime;
 			} else
 				retlt->ia6t_expire = maxexpire;
 		}
 		if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 			time_t maxexpire;
 			struct in6_addrlifetime *retlt =
 			    &ifr->ifr_ifru.ifru_lifetime;
 
 			/*
 			 * XXX: adjust expiration time assuming time_t is
 			 * signed.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			if (ia->ia6_lifetime.ia6t_pltime <
 			    maxexpire - ia->ia6_updatetime) {
 				retlt->ia6t_preferred = ia->ia6_updatetime +
 				    ia->ia6_lifetime.ia6t_pltime;
 			} else
 				retlt->ia6t_preferred = maxexpire;
 		}
 		break;
 
 	case SIOCAIFADDR_IN6:
 		error = in6_addifaddr(ifp, ifra, ia);
 		/*
 		 * ia is cleared so that "out" does not release it.
 		 * NOTE(review): this presumes in6_addifaddr() disposes of
 		 * the reference itself -- confirm.
 		 */
 		ia = NULL;
 		/*
 		 * Leave through "out" directly: breaking out of the switch
 		 * would run into the "error = 0" below and report success
 		 * even when adding the address failed.
 		 */
 		goto out;
 
 	case SIOCDIFADDR_IN6:
 		in6_purgeifaddr(ia);
 		EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa,
 		    IFADDR_EVENT_DEL);
 		break;
 
 	default:
 		/* Not ours: let the interface's own ioctl routine decide. */
 		if (ifp->if_ioctl == NULL) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		goto out;
 	}
 
 	error = 0;
 out:
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 /*
  * Allocate a membership record and join multicast group mcaddr on ifp.
  *
  * delay is scaled by MLD_FASTHZ / hz before it is handed to
  * in6_joingroup().  On success the new record is returned and *errorp is
  * left untouched; on failure NULL is returned and *errorp holds ENOBUFS
  * (no memory for the record) or the error from in6_joingroup().
  */
 static struct in6_multi_mship *
 in6_joingroup_legacy(struct ifnet *ifp, const struct in6_addr *mcaddr,
     int *errorp, int delay)
 {
 	struct in6_multi_mship *mship;
 	int rc;
 
 	mship = malloc(sizeof(*mship), M_IP6MADDR, M_NOWAIT);
 	if (mship == NULL) {
 		*errorp = ENOBUFS;
 		return (NULL);
 	}
 
 	rc = in6_joingroup(ifp, mcaddr, NULL, &mship->i6mm_maddr,
 	    (delay * MLD_FASTHZ) / hz);
 	if (rc == 0)
 		return (mship);
 
 	/* The join failed: report why and drop the unused record. */
 	*errorp = rc;
 	free(mship, M_IP6MADDR);
 	return (NULL);
 }
 
 /*
  * Fill maddr with the multicast address built from the last 32-bit word
  * of base: word 0 is IPV6_ADDR_INT32_MLL, word 2 is htonl(1), word 3 is
  * copied from base with byte 12 forced to 0xff, and the scope of ifp is
  * then applied.  Returns the result of in6_setscope(); a failure is also
  * logged.
  */
 static int
 in6_solicited_node_maddr(struct in6_addr *maddr,
     struct ifnet *ifp, const struct in6_addr *base)
 {
 	int rc;
 
 	bzero(maddr, sizeof(*maddr));
 	maddr->s6_addr32[0] = IPV6_ADDR_INT32_MLL;
 	maddr->s6_addr32[2] = htonl(1);
 	maddr->s6_addr32[3] = base->s6_addr32[3];
 	maddr->s6_addr8[12] = 0xff;
 
 	rc = in6_setscope(maddr, ifp, NULL);
 	if (rc != 0) {
 		/* XXX: should not happen */
 		log(LOG_ERR, "%s: in6_setscope failed\n", __func__);
 	}
 
 	return rc;
 }
 
 /*
  * Join necessary multicast groups.  Factored out from in6_update_ifa().
  * This entire work should only be done once, for the default FIB.
  */
 static int
 in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	struct in6_addr mltaddr;
 	struct in6_multi_mship *imm;
 	int delay, error;
 
 	KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__));
 
 	/* Join solicited multicast addr for new host id. */
 	if ((error = in6_solicited_node_maddr(&mltaddr, ifp,
 	    &ifra->ifra_addr.sin6_addr)) != 0)
 		goto cleanup;
 	delay = error = 0;
 	if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 		/*
 		 * We need a random delay for DAD on the address being
 		 * configured.  It also means delaying transmission of the
 		 * corresponding MLD report to avoid report collision.
 		 * [RFC 4861, Section 6.3.7]
 		 */
 		delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz);
 	}
 	imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr),
 		    if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	*in6m_sol = imm->i6mm_maddr;
 
 	/*
 	 * Join link-local all-nodes address.
 	 */
 	mltaddr = in6addr_linklocal_allnodes;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0)
 		goto cleanup; /* XXX: should not fail */
 
 	imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr),
 		    if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 
 	/*
 	 * Join node information group address.
 	 */
 	delay = 0;
 	if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 		/*
 		 * The spec does not say anything about delay for this group,
 		 * but the same logic should apply.
 		 */
 		delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz);
 	}
 	if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) {
 		/* XXX jinmei */
 		imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay);
 		if (imm == NULL)
 			nd6log((LOG_WARNING,
 			    "%s: in6_joingroup failed for %s on %s "
 			    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 			    &mltaddr), if_name(ifp), error));
 			/* XXX not very fatal, go on... */
 		else
 			LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	}
 	if (V_icmp6_nodeinfo_oldmcprefix &&
 	    in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) {
 		imm = in6_joingroup_legacy(ifp, &mltaddr, &error, delay);
 		if (imm == NULL)
 			nd6log((LOG_WARNING,
 			    "%s: in6_joingroup failed for %s on %s "
 			    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 			    &mltaddr), if_name(ifp), error));
 			/* XXX not very fatal, go on... */
 		else
 			LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 	}
 
 	/*
 	 * Join interface-local all-nodes address.
 	 * (ff01::1%ifN, and ff01::%ifN/32)
 	 */
 	mltaddr = in6addr_nodelocal_allnodes;
 	if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0)
 		goto cleanup; /* XXX: should not fail */
 
 	imm = in6_joingroup_legacy(ifp, &mltaddr, &error, 0);
 	if (imm == NULL) {
 		nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s "
 		    "(errno=%d)\n", __func__, ip6_sprintf(ip6buf,
 		    &mltaddr), if_name(ifp), error));
 		goto cleanup;
 	}
 	LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
 
 cleanup:
 	return (error);
 }
 
 /*
  * Create or refresh the IPv6 address described by @ifra on @ifp.
  *
  * The request is validated first.  When @ia is NULL a fresh in6_ifaddr is
  * allocated and linked; otherwise the existing entry is updated in place.
  * Only newly created addresses go through in6_broadcast_ifa().
  * Split out of in6_control().
  *
  * Returns 0 on success, ENOBUFS if allocation fails, or the error reported
  * by one of the helper stages.
  */
 int
 in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	int created, rc;
 
 	rc = in6_validate_ifra(ifp, ifra, ia, flags);
 	if (rc != 0)
 		return (rc);
 
 	created = (ia == NULL);
 	if (created) {
 		ia = in6_alloc_ifa(ifp, ifra, flags);
 		if (ia == NULL)
 			return (ENOBUFS);
 	}
 
 	rc = in6_update_ifa_internal(ifp, ifra, ia, created, flags);
 	if (rc == 0) {
 		/* Existing addresses are done; new ones get announced. */
 		if (created)
 			rc = in6_broadcast_ifa(ifp, ifra, ia, flags);
 		return (rc);
 	}
 
 	/* Undo the allocation made above; never touch a caller-owned ia. */
 	if (created) {
 		in6_unlink_ifa(ia, ifp);
 		ifa_free(&ia->ia_ifa);
 	}
 	return (rc);
 }
 
 /*
  * Reset @ifra and populate the basic address request fields.
  *
  * The whole request is zeroed, then the address and prefix mask sockaddrs
  * get their family and length set.  @addr and @mask are optional; a NULL
  * pointer leaves the corresponding sin6_addr all-zero.
  */
 void
 in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr,
     const struct in6_addr *mask)
 {
 
 	memset(ifra, 0, sizeof(*ifra));
 
 	/* Sockaddr headers for both the address and its mask. */
 	ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
 	ifra->ifra_addr.sin6_family = AF_INET6;
 	ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
 	ifra->ifra_prefixmask.sin6_family = AF_INET6;
 
 	/* Optional payloads. */
 	if (addr != NULL)
 		ifra->ifra_addr.sin6_addr = *addr;
 	if (mask != NULL)
 		ifra->ifra_prefixmask.sin6_addr = *mask;
 }
 
 /*
  * Sanity-check an address request before in6_update_ifa() applies it.
  *
  * @ifp and @ifra describe the request; @ia is the existing address being
  * updated, or NULL when a new address is to be created.  @flags is not
  * examined here.
  *
  * Returns 0 when the request is acceptable, EAFNOSUPPORT for a bad
  * destination family on a point-to-point interface, and EINVAL for every
  * other rejected request.
  *
  * Side effect: ifra->ifra_dstaddr is rewritten with the scope zone filled
  * in and embedded (internal form) when a destination is given on a p2p or
  * loopback interface.
  */
 static int
 in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	int plen = -1;
 	struct sockaddr_in6 dst6;
 	struct in6_addrlifetime *lt;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/* Validate parameters */
 	if (ifp == NULL || ifra == NULL) /* this may be redundant */
 		return (EINVAL);
 
 	/*
 	 * The destination address for a p2p link must have a family
 	 * of AF_UNSPEC or AF_INET6.
 	 */
 	if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
 	    ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
 	    ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Validate address
 	 */
 	if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) ||
 	    ifra->ifra_addr.sin6_family != AF_INET6)
 		return (EINVAL);
 
 	/*
 	 * Validate ifra_prefixmask.  Don't check sin6_family; the netmask
 	 * does not carry fields other than sin6_len.
 	 */
 	if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
 		return (EINVAL);
 	/*
 	 * Because the IPv6 address architecture is classless, we require
 	 * users to specify a (non 0) prefix length (mask) for a new address.
 	 * We also require that the prefix mask (when specified) is valid,
 	 * and thus reject a non-consecutive mask.
 	 */
 	if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
 		return (EINVAL);
 	if (ifra->ifra_prefixmask.sin6_len != 0) {
 		/* The second argument bounds the scan to sin6_len bytes. */
 		plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
 		    (u_char *)&ifra->ifra_prefixmask +
 		    ifra->ifra_prefixmask.sin6_len);
 		if (plen <= 0)
 			return (EINVAL);
 	} else {
 		/*
 		 * In this case, ia must not be NULL.  We just use its prefix
 		 * length.
 		 */
 		plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
 	}
 	/*
 	 * If the destination address on a p2p interface is specified,
 	 * and the address is a scoped one, validate/set the scope
 	 * zone identifier.
 	 */
 	dst6 = ifra->ifra_dstaddr;
 	if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
 	    (dst6.sin6_family == AF_INET6)) {
 		struct in6_addr in6_tmp;
 		u_int32_t zoneid;
 
 		/* Work on a scratch copy; only the zone id is wanted here. */
 		in6_tmp = dst6.sin6_addr;
 		if (in6_setscope(&in6_tmp, ifp, &zoneid))
 			return (EINVAL); /* XXX: should be impossible */
 
 		if (dst6.sin6_scope_id != 0) {
 			if (dst6.sin6_scope_id != zoneid)
 				return (EINVAL);
 		} else		/* the user omitted the ID. */
 			dst6.sin6_scope_id = zoneid;
 
 		/* convert into the internal form */
 		if (sa6_embedscope(&dst6, 0))
 			return (EINVAL); /* XXX: should be impossible */
 	}
 	/* Modify original ifra_dstaddr to reflect changes */
 	ifra->ifra_dstaddr = dst6;
 
 	/*
 	 * The destination address can be specified only for a p2p or a
 	 * loopback interface.  If specified, the corresponding prefix length
 	 * must be 128.
 	 */
 	if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
 		if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
 			/* XXX: noisy message */
 			nd6log((LOG_INFO, "in6_update_ifa: a destination can "
 			    "be specified for a p2p or a loopback IF only\n"));
 			return (EINVAL);
 		}
 		if (plen != 128) {
 			nd6log((LOG_INFO, "in6_update_ifa: prefixlen should "
 			    "be 128 when dstaddr is specified\n"));
 			return (EINVAL);
 		}
 	}
 	/* lifetime consistency check */
 	lt = &ifra->ifra_lifetime;
 	if (lt->ia6t_pltime > lt->ia6t_vltime)
 		return (EINVAL);
 	if (lt->ia6t_vltime == 0) {
 		/*
 		 * the following log might be noisy, but this is a typical
 		 * configuration mistake or a tool's bug.
 		 */
 		nd6log((LOG_INFO,
 		    "in6_update_ifa: valid lifetime is 0 for %s\n",
 		    ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr)));
 
 		/*
 		 * NOTE(review): this returns success, so the remaining
 		 * prefix-mask check is skipped and the caller carries on
 		 * as for any valid request -- confirm that "nothing to do"
 		 * is really what callers end up doing.
 		 */
 		if (ia == NULL)
 			return (0); /* there's nothing to do */
 	}
 
 	/* Check prefix mask */
 	if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) {
 		/*
 		 * We prohibit changing the prefix length of an existing
 		 * address, because
 		 * + such an operation should be rare in IPv6, and
 		 * + the operation would confuse prefix management.
 		 */
 		if (ia->ia_prefixmask.sin6_len != 0 &&
 		    in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) {
 			nd6log((LOG_INFO, "in6_validate_ifa: the prefix length "
 			    "of an existing %s address should not be changed\n",
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 
 			return (EINVAL);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Allocate a new in6_ifaddr for the address in @ifra and link it into
  * the address chains.
  *
  * The new entry is inserted into the interface's address list, the global
  * IPv6 address list and the address hash; one reference is taken for
  * if_addrhead and one for in6_ifaddrhead.  @flags is not examined here.
  *
  * Returns the new entry, or NULL when ifa_alloc() fails.
  * NOTE(review): the caller presumably also owns the reference handed out
  * by ifa_alloc() itself -- confirm against ifa_alloc().
  */
 static struct in6_ifaddr *
 in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags)
 {
 	struct in6_ifaddr *ia;
 
 	/*
 	 * When in6_alloc_ifa() is called in a process of a received
 	 * RA, it is called under an interrupt context.  So, we should
 	 * call malloc with M_NOWAIT.
 	 */
 	ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT);
 	if (ia == NULL)
 		return (NULL);
 	LIST_INIT(&ia->ia6_memberships);
 	/* Initialize the address and masks, and put time stamp */
 	ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr;
 	ia->ia_addr.sin6_family = AF_INET6;
 	ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
 	/*
 	 * XXX: Can we assign sin6_addr only and skip the rest?  The
 	 * structure copy below overwrites the two fields set just above.
 	 */
 	ia->ia_addr = ifra->ifra_addr;
 	ia->ia6_createtime = time_uptime;
 	if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
 		/*
 		 * Some functions expect that ifa_dstaddr is not
 		 * NULL for p2p interfaces.
 		 */
 		ia->ia_ifa.ifa_dstaddr =
 		    (struct sockaddr *)&ia->ia_dstaddr;
 	} else {
 		ia->ia_ifa.ifa_dstaddr = NULL;
 	}
 
 	/* set prefix mask if any */
 	ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask;
 	if (ifra->ifra_prefixmask.sin6_len != 0) {
 		ia->ia_prefixmask.sin6_family = AF_INET6;
 		ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len;
 		ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr;
 	}
 
 	/* Publish on the per-interface list first ... */
 	ia->ia_ifp = ifp;
 	ifa_ref(&ia->ia_ifa);			/* if_addrhead */
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/* ... then on the global list and hash, under a single lock hold. */
 	ifa_ref(&ia->ia_ifa);			/* in6_ifaddrhead */
 	IN6_IFADDR_WLOCK();
 	CK_STAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link);
 	CK_LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash);
 	IN6_IFADDR_WUNLOCK();
 
 	return (ia);
 }
 
 /*
  * Update/configure interface address parameters:
  *
  * 1) Update lifetime
  * 2) Update interface metric ad flags
  * 3) Notify other subsystems
  */
 static int
 in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int hostIsNew, int flags)
 {
 	int error;
 
 	/* update timestamp */
 	ia->ia6_updatetime = time_uptime;
 
 	/*
 	 * Set lifetimes.  We do not refer to ia6t_expire and ia6t_preferred
 	 * to see if the address is deprecated or invalidated, but initialize
 	 * these members for applications.
 	 */
 	ia->ia6_lifetime = ifra->ifra_lifetime;
 	if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		ia->ia6_lifetime.ia6t_expire =
 		    time_uptime + ia->ia6_lifetime.ia6t_vltime;
 	} else
 		ia->ia6_lifetime.ia6t_expire = 0;
 	if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		ia->ia6_lifetime.ia6t_preferred =
 		    time_uptime + ia->ia6_lifetime.ia6t_pltime;
 	} else
 		ia->ia6_lifetime.ia6t_preferred = 0;
 
 	/*
 	 * backward compatibility - if IN6_IFF_DEPRECATED is set from the
 	 * userland, make it deprecated.
 	 */
 	if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
 		ia->ia6_lifetime.ia6t_pltime = 0;
 		ia->ia6_lifetime.ia6t_preferred = time_uptime;
 	}
 
 	/*
 	 * configure address flags.
 	 */
 	ia->ia6_flags = ifra->ifra_flags;
 
 	/*
 	 * Make the address tentative before joining multicast addresses,
 	 * so that corresponding MLD responses would not have a tentative
 	 * source address.
 	 */
 	ia->ia6_flags &= ~IN6_IFF_DUPLICATED;	/* safety */
 
 	/*
 	 * DAD should be performed for an new address or addresses on
 	 * an interface with ND6_IFF_IFDISABLED.
 	 */
 	if (in6if_do_dad(ifp) &&
 	    (hostIsNew || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)))
 		ia->ia6_flags |= IN6_IFF_TENTATIVE;
 
 	/* notify other subsystems */
 	error = in6_notify_ifa(ifp, ia, ifra, hostIsNew);
 
 	return (error);
 }
 
 /*
  * Do link-level ifa job for a newly added address, in this order:
  * 1) Add lle entry for added address
  * 2) Join appropriate multicast groups (IFF_MULTICAST interfaces only)
  * 3) Start DAD if the address is tentative
  * 4) Notify routing socket users about the new address
  *
  * Every exit path drops one reference on @ia with ifa_free(); on failure
  * the address is additionally torn down with in6_purgeaddr().
  * Returns 0 on success or the error from step 1 or 2.
  */
 static int
 in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra,
     struct in6_ifaddr *ia, int flags)
 {
 	struct in6_multi *in6m_sol;
 	int error = 0;
 
 	/* Add local address to lltable, if necessary (ex. on p2p link). */
 	if ((error = nd6_add_ifa_lle(ia)) != 0) {
 		in6_purgeaddr(&ia->ia_ifa);
 		ifa_free(&ia->ia_ifa);
 		return (error);
 	}
 
 	/* Join necessary multicast groups. */
 	in6m_sol = NULL;
 	if ((ifp->if_flags & IFF_MULTICAST) != 0) {
 		error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol);
 		if (error != 0) {
 			in6_purgeaddr(&ia->ia_ifa);
 			ifa_free(&ia->ia_ifa);
 			return (error);
 		}
 	}
 
 	/* Perform DAD, if the address is TENTATIVE. */
 	if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
 		int delay, mindelay, maxdelay;
 
 		delay = 0;
 		if ((flags & IN6_IFAUPDATE_DADDELAY)) {
 			/*
 			 * We need to impose a delay before sending an NS
 			 * for DAD.  Check if we also needed a delay for the
 			 * corresponding MLD message.  If we did, the delay
 			 * should be larger than the MLD delay (this could be
 			 * relaxed a bit, but this simple logic is at least
 			 * safe).
 			 * XXX: Break data hiding guidelines and look at
 			 * state for the solicited multicast group.
 			 */
 			mindelay = 0;
 			if (in6m_sol != NULL &&
 			    in6m_sol->in6m_state == MLD_REPORTING_MEMBER) {
 				mindelay = in6m_sol->in6m_timer;
 			}
 			maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
 			/*
 			 * Pick a random delay in [mindelay, maxdelay).  Only
 			 * the empty-range case is special-cased, to avoid a
 			 * modulo by zero.
 			 * NOTE(review): mindelay > maxdelay is not handled
 			 * here -- confirm in6m_timer can never exceed
 			 * maxdelay.
 			 */
 			if (maxdelay - mindelay == 0)
 				delay = 0;
 			else {
 				delay =
 				    (arc4random() % (maxdelay - mindelay)) +
 				    mindelay;
 			}
 		}
 		nd6_dad_start((struct ifaddr *)ia, delay);
 	}
 
 	in6_newaddrmsg(ia, RTM_ADD);
 	ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 /*
  * Adds or deletes interface route for p2p ifa.
  *
  * @cmd is passed through to rib_handle_ifaddr_info() (callers in this
  * file use RTM_ADD and RTM_DELETE).  The route is a pinned host route to
  * ia->ia_dstaddr whose gateway is a link-level sockaddr describing the
  * address's interface, in the interface's FIB.
  *
  * Returns 0 on success or errno.
  */
 static int
 in6_handle_dstaddr_rtrequest(int cmd, struct in6_ifaddr *ia)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa = &ia->ia_ifa;
 	int error;
 
 	/* Prepare gateway */
 	struct sockaddr_dl_short sdl = {
 		.sdl_family = AF_LINK,
 		.sdl_len = sizeof(struct sockaddr_dl_short),
 		.sdl_type = ifa->ifa_ifp->if_type,
 		.sdl_index = ifa->ifa_ifp->if_index,
 	};
 
 	/* Destination: only the address is copied from ia_dstaddr. */
 	struct sockaddr_in6 dst = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = ia->ia_dstaddr.sin6_addr,
 	};
 
 	struct rt_addrinfo info = {
 		.rti_ifa = ifa,
 		.rti_ifp = ifa->ifa_ifp,
 		.rti_flags = RTF_PINNED | RTF_HOST,
 		.rti_info = {
 			[RTAX_DST] = (struct sockaddr *)&dst,
 			[RTAX_GATEWAY] = (struct sockaddr *)&sdl,
 		},
 	};
 	/* Don't set additional per-gw filters on removal */
 
 	/* The routing call is made inside a network epoch section. */
 	NET_EPOCH_ENTER(et);
 	error = rib_handle_ifaddr_info(ifa->ifa_ifp->if_fib, cmd, &info);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Tell whether @ia is a point-to-point style address: a /128 prefix with
  * an AF_INET6 destination that differs from the local address.
  */
 static bool
 ifa_is_p2p(struct in6_ifaddr *ia)
 {
 
 	/* XXX: the prefix length is recomputed from the mask each time. */
 	if (in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != 128)
 		return (false);
 	if (ia->ia_dstaddr.sin6_family != AF_INET6)
 		return (false);
 	if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
 	    &ia->ia_dstaddr.sin6_addr))
 		return (false);
 
 	return (true);
 }
 
 /*
  * Add (or update) the address described by @ifra on @ifp and make its
  * prefix on-link.
  *
  * @ia is the existing address entry, or NULL for a new address.  A
  * non-NULL @ia has one reference dropped by this function on every path
  * (either right after the update succeeds or at the "out" label).
  *
  * Steps: update/create the address, optionally attach CARP when
  * ifra_vhid > 0, create or look up the matching prefix and relate the
  * address to it, then clear ND6_IFF_IFDISABLED on the interface if set.
  *
  * Returns 0 on success or an errno value.
  */
 int
 in6_addifaddr(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia)
 {
 	struct nd_prefixctl pr0;
 	struct nd_prefix *pr;
 	int carp_attached = 0;
 	int error;
 
 	/*
 	 * first, make or update the interface address structure,
 	 * and link it to the list.
 	 */
 	if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0)
 		goto out;
 	if (ia != NULL) {
 		if (ia->ia_ifa.ifa_carp)
 			(*carp_detach_p)(&ia->ia_ifa, true);
 		ifa_free(&ia->ia_ifa);
 	}
 	/* Re-lookup; from here on ia holds the reference taken by the lookup. */
 	if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) {
 		/*
 		 * this can happen when the user specifies a 0 valid
 		 * lifetime.
 		 */
 		return (0);
 	}
 
 	if (ifra->ifra_vhid > 0) {
 		if (carp_attach_p != NULL)
 			error = (*carp_attach_p)(&ia->ia_ifa,
 			    ifra->ifra_vhid);
 		else
 			error = EPROTONOSUPPORT;
 		if (error)
 			goto out;
 		else
 			carp_attached = 1;
 	}
 
 	/*
 	 * then, make the prefix on-link on the interface.
 	 * XXX: we'd rather create the prefix before the address, but
 	 * we need at least one address to install the corresponding
 	 * interface route, so we configure the address first.
 	 */
 
 	/*
 	 * convert mask to prefix length (prefixmask has already
 	 * been validated in in6_update_ifa()).
 	 */
 	bzero(&pr0, sizeof(pr0));
 	pr0.ndpr_ifp = ifp;
 	pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
 	    NULL);
 	if (pr0.ndpr_plen == 128) {
 		/* we don't need to install a host route. */
 		goto aifaddr_out;
 	}
 	pr0.ndpr_prefix = ifra->ifra_addr;
 	/* apply the mask for safety. */
 	IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr,
 	    &ifra->ifra_prefixmask.sin6_addr);
 
 	/*
 	 * XXX: since we don't have an API to set prefix (not address)
 	 * lifetimes, we just use the same lifetimes as addresses.
 	 * The (temporarily) installed lifetimes can be overridden by
 	 * later advertised RAs (when accept_rtadv is non 0), which is
 	 * an intended behavior.
 	 */
 	pr0.ndpr_raf_onlink = 1; /* should be configurable? */
 	pr0.ndpr_raf_auto =
 	    ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0);
 	pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime;
 	pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime;
 
 	/* add the prefix if not yet. */
 	if ((pr = nd6_prefix_lookup(&pr0)) == NULL) {
 		/*
 		 * nd6_prelist_add will install the corresponding
 		 * interface route.
 		 */
 		if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) {
 			/* Undo only the CARP attachment made above. */
 			if (carp_attached)
 				(*carp_detach_p)(&ia->ia_ifa, false);
 			goto out;
 		}
 	}
 
 	/* relate the address to the prefix */
 	if (ia->ia6_ndpr == NULL) {
 		ia->ia6_ndpr = pr;
 		pr->ndpr_addrcnt++;
 
 		/*
 		 * If this is the first autoconf address from the
 		 * prefix, create a temporary address as well
 		 * (when required).
 		 */
 		if ((ia->ia6_flags & IN6_IFF_AUTOCONF) &&
 		    V_ip6_use_tempaddr && pr->ndpr_addrcnt == 1) {
 			int e;
 			/* Failure here is logged but not treated as fatal. */
 			if ((e = in6_tmpifadd(ia, 1, 0)) != 0) {
 				log(LOG_NOTICE, "in6_control: failed "
 				    "to create a temporary address, "
 				    "errno=%d\n", e);
 			}
 		}
 	}
 	/* Drop the prefix reference obtained from lookup/add above. */
 	nd6_prefix_rele(pr);
 
 	/*
 	 * this might affect the status of autoconfigured addresses,
 	 * that is, this address might make other addresses detached.
 	 */
 	pfxlist_onlink_check();
 
 aifaddr_out:
 	/*
 	 * Try to clear the flag when a new IPv6 address is added
 	 * onto an IFDISABLED interface and it succeeds.
 	 */
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
 		struct in6_ndireq nd;
 
 		memset(&nd, 0, sizeof(nd));
 		nd.ndi.flags = ND_IFINFO(ifp)->flags;
 		nd.ndi.flags &= ~ND6_IFF_IFDISABLED;
 		if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0)
 			log(LOG_NOTICE, "SIOCAIFADDR_IN6: "
 			    "SIOCSIFINFO_FLAGS for -ifdisabled "
 			    "failed.");
 		/*
 		 * Ignore failure of clearing the flag intentionally.
 		 * The failure means address duplication was detected.
 		 */
 	}
 	error = 0;
 
 out:
 	/* Release whichever reference ia currently represents. */
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (error);
 }
 
 /*
  * Tear down the IPv6 interface address @ifa.
  *
  * In order: detach CARP, remove the loopback route to the address, stop
  * DAD, leave every multicast group recorded in ia6_memberships, remove the
  * p2p destination route if one was installed, announce RTM_DELETE and
  * finally unlink the address from the address lists via in6_unlink_ifa().
  */
 void
 in6_purgeaddr(struct ifaddr *ifa)
 {
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
 	struct in6_multi_mship *imm;
 	int error;
 
 	if (ifa->ifa_carp)
 		(*carp_detach_p)(ifa, false);
 
 	/*
 	 * Remove the loopback route to the interface address.
 	 * The check for the current setting of "nd6_useloopback"
 	 * is not needed.
 	 */
 	if (ia->ia_flags & IFA_RTSELF) {
 		error = ifa_del_loopback_route((struct ifaddr *)ia,
 		    (struct sockaddr *)&ia->ia_addr);
 		/* The flag is kept when the route could not be removed. */
 		if (error == 0)
 			ia->ia_flags &= ~IFA_RTSELF;
 	}
 
 	/* stop DAD processing */
 	nd6_dad_stop(ifa);
 
 	/* Leave multicast groups. */
 	while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) {
 		LIST_REMOVE(imm, i6mm_chain);
 		if (imm->i6mm_maddr != NULL)
 			in6_leavegroup(imm->i6mm_maddr, NULL);
 		free(imm, M_IP6MADDR);
 	}
 	/* Check if we need to remove p2p route */
 	if ((ia->ia_flags & IFA_ROUTE) && ifa_is_p2p(ia)) {
 		error = in6_handle_dstaddr_rtrequest(RTM_DELETE, ia);
 		if (error != 0)
 			log(LOG_INFO, "%s: err=%d, destination address delete "
 			    "failed\n", __func__, error);
 		/* Unlike IFA_RTSELF, this flag is cleared even on failure. */
 		ia->ia_flags &= ~IFA_ROUTE;
 	}
 
 	in6_newaddrmsg(ia, RTM_DELETE);
 	in6_unlink_ifa(ia, ifp);
 }
 
 /*
  * Purge @ia and, when it was the last address attached to its prefix,
  * unlink and delete that prefix too.
  *
  * Strictly speaking address and prefix management are separate, so the
  * prefix would not have to follow the address.  It is expired here anyway
  * to stay as backward compatible as possible with the ioctl operation.
  */
 void
 in6_purgeifaddr(struct in6_ifaddr *ia)
 {
 	struct nd_prefix *prefix;
 
 	/* Remember the prefix now; purging clears the address's link to it. */
 	prefix = ia->ia6_ndpr;
 
 	/* Note that in6_purgeaddr() will decrement ndpr_addrcnt. */
 	in6_purgeaddr(&ia->ia_ifa);
 
 	if (prefix == NULL || prefix->ndpr_addrcnt != 0)
 		return;
 
 	ND6_WLOCK();
 	nd6_prefix_unlink(prefix, NULL);
 	ND6_WUNLOCK();
 	nd6_prefix_del(prefix);
 }
 
 
 /*
  * Unlink @ia from the interface address list, the global IPv6 address
  * list and the address hash, and detach it from its prefix.
  *
  * Two references are released: the if_addrhead one right after removal
  * from the interface list, and the in6_ifaddrhead one at the very end.
  */
 static void
 in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	int remove_lle;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifaddr, ifa_link);
 	IF_ADDR_WUNLOCK(ifp);
 	ifa_free(&ia->ia_ifa);			/* if_addrhead */
 
 	/*
 	 * Defer the release of what might be the last reference to the
 	 * in6_ifaddr so that it can't be freed before the remainder of the
 	 * cleanup.
 	 */
 	IN6_IFADDR_WLOCK();
 	CK_STAILQ_REMOVE(&V_in6_ifaddrhead, ia, in6_ifaddr, ia_link);
 	CK_LIST_REMOVE(ia, ia6_hash);
 	IN6_IFADDR_WUNLOCK();
 
 	/*
 	 * Release the reference to the base prefix.  There should be a
 	 * positive reference.
 	 */
 	remove_lle = 0;
 	if (ia->ia6_ndpr == NULL) {
 		nd6log((LOG_NOTICE,
 		    "in6_unlink_ifa: autoconf'ed address "
 		    "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia))));
 	} else {
 		ia->ia6_ndpr->ndpr_addrcnt--;
 		/* Do not delete lles within prefix if the refcount != 0 */
 		if (ia->ia6_ndpr->ndpr_addrcnt == 0)
 			remove_lle = 1;
 		ia->ia6_ndpr = NULL;
 	}
 
 	/* remove_lle is set only when this was the prefix's last address. */
 	nd6_rem_ifa_lle(ia, remove_lle);
 
 	/*
 	 * Also, if the address being removed is autoconf'ed, call
 	 * pfxlist_onlink_check() since the release might affect the status of
 	 * other (detached) addresses.
 	 */
 	if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) {
 		pfxlist_onlink_check();
 	}
 	ifa_free(&ia->ia_ifa);			/* in6_ifaddrhead */
 }
 
 /*
  * Notifies other subsystems about address change/arrival:
  * 1) Notifies device handler (SIOCSIFADDR) on the first IPv6 address
  *    assignment
  * 2) Handle routing table changes for P2P links and route
  * 3) Handle routing table changes for address host route
  * 4) Invoke the ifaddr_event_ext handlers with IFADDR_EVENT_ADD
  *
  * Step 4 runs on every path, including the error ones that jump to
  * "done".  Returns 0 or the error from the driver ioctl / route
  * installation; a failure to add the loopback route is also returned.
  */
 static int
 in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia,
     struct in6_aliasreq *ifra, int hostIsNew)
 {
 	int	error = 0, ifacount = 0;
 	struct ifaddr *ifa;
 	struct sockaddr_in6 *pdst;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/*
 	 * Give the interface a chance to initialize
 	 * if this is its first address,
 	 */
 	if (hostIsNew != 0) {
 		struct epoch_tracker et;
 
 		/* Count the AF_INET6 addresses currently on the interface. */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifacount++;
 		}
 		NET_EPOCH_EXIT(et);
 	}
 
 	/*
 	 * Note: ifacount stays 0 when hostIsNew is 0, so the driver ioctl
 	 * is also issued on every update of an existing address.
 	 */
 	if (ifacount <= 1 && ifp->if_ioctl) {
 		error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia);
 		if (error)
 			goto done;
 	}
 
 	/*
 	 * If a new destination address is specified, scrub the old one and
 	 * install the new destination.  Note that the interface must be
 	 * p2p or loopback.
 	 */
 	pdst = &ifra->ifra_dstaddr;
 	if (pdst->sin6_family == AF_INET6 &&
 	    !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) {
 		if ((ia->ia_flags & IFA_ROUTE) != 0 &&
 		    (in6_handle_dstaddr_rtrequest(RTM_DELETE, ia) != 0)) {
 			nd6log((LOG_ERR, "in6_update_ifa_internal: failed to "
 			    "remove a route to the old destination: %s\n",
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 			/* proceed anyway... */
 		} else
 			ia->ia_flags &= ~IFA_ROUTE;
 		ia->ia_dstaddr = *pdst;
 	}
 
 	/*
 	 * If a new destination address is specified for a point-to-point
 	 * interface, install a route to the destination as an interface
 	 * direct route.
 	 * XXX: the logic below rejects assigning multiple addresses on a p2p
 	 * interface that share the same destination.
 	 */
 	if (!(ia->ia_flags & IFA_ROUTE) && ifa_is_p2p(ia)) {
 		error = in6_handle_dstaddr_rtrequest(RTM_ADD, ia);
 		if (error)
 			goto done;
 		ia->ia_flags |= IFA_ROUTE;
 	}
 
 	/*
 	 * add a loopback route to self if it does not exist
 	 */
 	if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) {
 		error = ifa_add_loopback_route((struct ifaddr *)ia,
 		    (struct sockaddr *)&ia->ia_addr);
 		if (error == 0)
 			ia->ia_flags |= IFA_RTSELF;
 	}
 done:
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "Invoking IPv6 network device address event may sleep");
 
 	/* Hold a reference across the event handlers. */
 	ifa_ref(&ia->ia_ifa);
 	EVENTHANDLER_INVOKE(ifaddr_event_ext, ifp, &ia->ia_ifa,
 	    IFADDR_EVENT_ADD);
 	ifa_free(&ia->ia_ifa);
 
 	return (error);
 }
 
 /*
  * Look up the first link-local IPv6 address on @ifp whose ia6_flags have
  * none of the bits in @ignoreflags set.
  *
  * Must be called inside a network epoch section.  The address is returned
  * with a reference held; NULL is returned when nothing matches.
  */
 struct in6_ifaddr *
 in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags)
 {
 	struct ifaddr *ifa;
 	struct in6_ifaddr *candidate;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (!IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)))
 			continue;
 		candidate = (struct in6_ifaddr *)ifa;
 		if ((candidate->ia6_flags & ignoreflags) != 0)
 			continue;
 		ifa_ref(ifa);
 		return (candidate);
 	}
 
 	return (NULL);
 }
 
 /*
  * Search the global address hash for the interface address equal to
  * @addr.  A non-zero @zoneid additionally has to match the scope id of
  * the candidate.
  *
  * The result carries a reference only when @referenced is set; NULL is
  * returned when no entry matches.
  */
 struct in6_ifaddr *
 in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid, bool referenced)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) {
 		if (!IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr))
 			continue;
 		/* Same address, wrong zone: keep looking. */
 		if (zoneid != 0 && ia->ia_addr.sin6_scope_id != zoneid)
 			continue;
 		if (referenced)
 			ifa_ref(&ia->ia_ifa);
 		break;
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (ia);
 }
 
 /*
  * Find the IPv6 interface address on @ifp that equals @addr.
  * The entry is returned with a reference held, or NULL if @ifp has no
  * such address.
  */
 struct in6_ifaddr *
 in6ifa_ifpwithaddr(struct ifnet *ifp, const struct in6_addr *addr)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) {
 			/* Take the reference before leaving the epoch. */
 			ifa_ref(ifa);
 			break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	return ((struct in6_ifaddr *)ifa);
 }
 
 /*
  * Return the first address on @ifp that is link-local scoped (or an
  * interface-local / node-local multicast address), if any.
  *
  * NULL is returned when the interface has ND6_IFF_IFDISABLED set or when
  * no address qualifies.  No reference is taken on the returned entry.
  */
 struct in6_ifaddr *
 in6ifa_llaonifp(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in6_addr *in6;
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)
 		return (NULL);
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		in6 = &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
 		if (IN6_IS_SCOPE_LINKLOCAL(in6))
 			break;
 		if (IN6_IS_ADDR_MC_INTFACELOCAL(in6))
 			break;
 		if (IN6_IS_ADDR_MC_NODELOCAL(in6))
 			break;
 	}
 	NET_EPOCH_EXIT(et);
 
 	return ((struct in6_ifaddr *)ifa);
 }
 
 /*
  * Convert IP6 address to printable (loggable) representation. Caller
  * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long.
  *
  * The first longest run of two or more all-zero 16-bit groups is
  * abbreviated as "::"; leading zeros inside a group are suppressed.
  * Returns ip6buf.
  */
 static char digits[] = "0123456789abcdef";
 char *
 ip6_sprintf(char *ip6buf, const struct in6_addr *addr)
 {
 	int i, cnt = 0, maxcnt = 0, idx = 0, index = 0;
 	char *cp;
 	const uint16_t *a = (const uint16_t *)addr;
 	const uint8_t *d;
 	int dcolon = 0, zero = 0;
 
 	cp = ip6buf;
 
 	/*
 	 * Pass 1: find the start (index) and length (maxcnt) of the first
 	 * longest run of zero groups.  The run counter is reset on every
 	 * non-zero group so that separate runs are never added together.
 	 */
 	for (i = 0; i < 8; i++) {
 		if (*(a + i) == 0) {
 			cnt++;
 			if (cnt == 1)
 				idx = i;
 			continue;
 		}
 		if (maxcnt < cnt) {
 			maxcnt = cnt;
 			index = idx;
 		}
 		cnt = 0;
 	}
 	if (maxcnt < cnt) {
 		maxcnt = cnt;
 		index = idx;
 	}
 
 	/*
 	 * Pass 2: emit the groups.  dcolon is 0 before the "::", 1 while
 	 * the abbreviated run is being skipped and 2 once it is over.
 	 */
 	for (i = 0; i < 8; i++) {
 		if (dcolon == 1) {
 			if (*a == 0) {
 				/*
 				 * The run reaches the end of the address:
 				 * emit a filler that the final "*--cp"
 				 * below strips again, leaving "::".
 				 */
 				if (i == 7)
 					*cp++ = ':';
 				a++;
 				continue;
 			} else
 				dcolon = 2;
 		}
 		if (*a == 0) {
 			/*
 			 * Abbreviate only at the start of the longest run and
 			 * only when it spans at least two groups.  Using
 			 * maxcnt avoids peeking at the next group, which does
 			 * not exist for the last one.
 			 */
 			if (dcolon == 0 && i == index && maxcnt > 1) {
 				if (i == 0)
 					*cp++ = ':';
 				*cp++ = ':';
 				dcolon = 1;
 			} else {
 				*cp++ = '0';
 				*cp++ = ':';
 			}
 			a++;
 			continue;
 		}
 		d = (const uint8_t *)a;
 		/* Try to eliminate leading zeros in printout like in :0001. */
 		zero = 1;
 		*cp = digits[*d >> 4];
 		if (*cp != '0') {
 			zero = 0;
 			cp++;
 		}
 		*cp = digits[*d++ & 0xf];
 		if (zero == 0 || (*cp != '0')) {
 			zero = 0;
 			cp++;
 		}
 		*cp = digits[*d >> 4];
 		if (zero == 0 || (*cp != '0')) {
 			zero = 0;
 			cp++;
 		}
 		/* The last nibble is always printed. */
 		*cp++ = digits[*d & 0xf];
 		*cp++ = ':';
 		a++;
 	}
 	/* Replace the trailing ':' with the terminator. */
 	*--cp = '\0';
 	return (ip6buf);
 }
 
 /*
  * Return 1 if @in6 is a loopback or link-local address, or if it falls
  * within the prefix of any configured IPv6 interface address; 0
  * otherwise.
  */
 int
 in6_localaddr(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	int onlink = 0;
 
 	if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
 		return (1);
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
 		    &ia->ia_prefixmask.sin6_addr)) {
 			onlink = 1;
 			break;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (onlink);
 }
 
 /*
  * Return 1 if @in6 is an address configured on one of the local host's
  * interfaces (exact match in the address hash), 0 otherwise.
  */
 int
 in6_localip(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	int found = 0;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) {
 		if (!IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr))
 			continue;
 		found = 1;
 		break;
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (found);
 }
 
 /*
  * FIB-aware variant of in6_localip(): @in6 only counts as local when it
  * is configured on an interface whose if_fib equals @fib.
  */
 bool
 in6_localip_fib(struct in6_addr *in6, uint16_t fib)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	bool found = false;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) {
 		if (!IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr))
 			continue;
 		if (ia->ia_ifa.ifa_ifp->if_fib != fib)
 			continue;
 		found = true;
 		break;
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (found);
 }
 
 /*
  * Return 1 if an internet address is configured on an interface.
  */
 int
 in6_ifhasaddr(struct ifnet *ifp, struct in6_addr *addr)
 {
 	struct in6_addr in6;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia6;
 
 	NET_EPOCH_ASSERT();
 
 	in6 = *addr;
 	if (in6_clearscope(&in6))
 		return (0);
 	in6_setscope(&in6, ifp, NULL);
 
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ia6 = (struct in6_ifaddr *)ifa;
 		if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &in6))
 			return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Return 1 if the first hash entry matching the address in @sa6 has
  * IN6_IFF_DEPRECATED set; return 0 if it does not or if the address is
  * not configured at all.
  */
 int
 in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	int deprecated = 0;
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) {
 		if (!IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr))
 			continue;
 		/* Only the first matching entry is consulted. */
 		if (ia->ia6_flags & IN6_IFF_DEPRECATED)
 			deprecated = 1;
 		break;
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 
 	return (deprecated);
 }
 
 /*
  * Return the number of leading bits (0..128) that @src and @dst have in
  * common.
  */
 int
 in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
 {
 	const u_char *sp = (const u_char *)src;
 	const u_char *dp = (const u_char *)dst;
 	u_char diff;
 	int i, nbits;
 
 	nbits = 0;
 	for (i = 0; i < 16; i++) {
 		diff = sp[i] ^ dp[i];
 		if (diff == 0) {
 			nbits += 8;
 			continue;
 		}
 		/* Count the equal high-order bits of the first differing byte. */
 		for (; (diff & 0x80) == 0; diff <<= 1)
 			nbits++;
 		break;
 	}
 
 	return (nbits);
 }
 
 /*
  * Return 1 if the first @len bits of @p1 and @p2 are identical, 0 if
  * they differ.  A @len outside 0..128 is logged and reported as "not
  * equal".
  *
  * XXX: to be scope conscious
  */
 int
 in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len)
 {
 	int nbytes, nbits, shift;
 
 	/* sanity check */
 	if (len < 0 || len > 128) {
 		log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n",
 		    len);
 		return (0);
 	}
 
 	nbytes = len / 8;
 	nbits = len % 8;
 
 	/* Whole bytes first ... */
 	if (bcmp(&p1->s6_addr, &p2->s6_addr, nbytes) != 0)
 		return (0);
 	if (nbits == 0)
 		return (1);
 
 	/* ... then the high-order bits of the partially covered byte. */
 	shift = 8 - nbits;
 	if ((p1->s6_addr[nbytes] >> shift) != (p2->s6_addr[nbytes] >> shift))
 		return (0);
 
 	return (1);
 }
 
 void
 in6_prefixlen2mask(struct in6_addr *maskp, int len)
 {
 	u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
 	int bytelen, bitlen, i;
 
 	/* sanity check */
 	if (0 > len || len > 128) {
 		log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
 		    len);
 		return;
 	}
 
 	bzero(maskp, sizeof(*maskp));
 	bytelen = len / 8;
 	bitlen = len % 8;
 	for (i = 0; i < bytelen; i++)
 		maskp->s6_addr[i] = 0xff;
 	if (bitlen)
 		maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
 }
 
 /*
  * return the best address out of the same scope. if no address was
  * found, return the first valid address from designated IF.
  *
  * Only AF_INET6 addresses of ifp are considered; anycast, not-ready and
  * detached addresses are always skipped.  Deprecated addresses are used
  * only as a last resort, and only when V_ip6_use_deprecated is set.
  * Must be called inside the network epoch (asserted below).
  * Returns NULL when ifp has no usable address.
  */
 struct in6_ifaddr *
 in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
 {
 	/* blen caches in6_matchlen(besta, dst); -1 means "not computed yet". */
 	int dst_scope =	in6_addrscope(dst), blen = -1, tlen;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *besta = NULL;
 	struct in6_ifaddr *dep[2];	/* last-resort: deprecated */

 	NET_EPOCH_ASSERT();

 	dep[0] = dep[1] = NULL;

 	/*
 	 * We first look for addresses in the same scope.
 	 * If there is one, return it.
 	 * If two or more, return one which matches the dst longest.
 	 * If none, return one of global addresses assigned other ifs.
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
 			continue; /* XXX: is there any case to allow anycast? */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
 			continue; /* don't use this interface */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
 			/* Remember the last deprecated address seen in this pass. */
 			if (V_ip6_use_deprecated)
 				dep[0] = (struct in6_ifaddr *)ifa;
 			continue;
 		}

 		if (dst_scope == in6_addrscope(IFA_IN6(ifa))) {
 			/*
 			 * call in6_matchlen() as few as possible
 			 */
 			if (besta) {
 				if (blen == -1)
 					blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst);
 				tlen = in6_matchlen(IFA_IN6(ifa), dst);
 				/* Strictly longer only: ties keep the earlier address. */
 				if (tlen > blen) {
 					blen = tlen;
 					besta = (struct in6_ifaddr *)ifa;
 				}
 			} else
 				besta = (struct in6_ifaddr *)ifa;
 		}
 	}
 	if (besta)
 		return (besta);

 	/*
 	 * No same-scope candidate: second pass over the same list, returning
 	 * the first address that passes the filters regardless of scope.
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST)
 			continue; /* XXX: is there any case to allow anycast? */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY)
 			continue; /* don't use this interface */
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED)
 			continue;
 		if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) {
 			if (V_ip6_use_deprecated)
 				dep[1] = (struct in6_ifaddr *)ifa;
 			continue;
 		}

 		return (struct in6_ifaddr *)ifa;
 	}

 	/* use the last-resort values, that are, deprecated addresses */
 	if (dep[0])
 		return dep[0];
 	if (dep[1])
 		return dep[1];

 	return NULL;
 }
 
 /*
  * perform DAD when interface becomes IFF_UP.
  *
  * Starts duplicate address detection, with a random delay, for every
  * AF_INET6 address on ifp that is flagged IN6_IFF_TENTATIVE, then calls
  * in6_ifattach() for the interface.
  */
 void
 in6_if_up(struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia;

 	/* The address list is walked inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ia = (struct in6_ifaddr *)ifa;
 		if (ia->ia6_flags & IN6_IFF_TENTATIVE) {
 			/*
 			 * The TENTATIVE flag was likely set by hand
 			 * beforehand, implicitly indicating the need for DAD.
 			 * We may be able to skip the random delay in this
 			 * case, but we impose delays just in case.
 			 */
 			/* Delay is in [0, MAX_RTR_SOLICITATION_DELAY * hz). */
 			nd6_dad_start(ifa,
 			    arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz));
 		}
 	}
 	NET_EPOCH_EXIT(et);

 	/*
 	 * special cases, like 6to4, are handled in in6_ifattach
 	 */
 	in6_ifattach(ifp, NULL);
 }
 
 /*
  * Decide whether duplicate address detection should run on ifp.
  * Returns 0 for loopback interfaces, for interfaces without IFF_MULTICAST,
  * and when the ND ifinfo has ND6_IFF_IFDISABLED or ND6_IFF_NO_DAD set;
  * returns 1 otherwise.
  */
 int
 in6if_do_dad(struct ifnet *ifp)
 {
 	const int nd_off = ND6_IFF_IFDISABLED | ND6_IFF_NO_DAD;

 	if ((ifp->if_flags & IFF_LOOPBACK) != 0 ||
 	    (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (0);

 	/* ND flags are consulted only after the interface flags pass. */
 	if ((ND_IFINFO(ifp)->flags & nd_off) != 0)
 		return (0);

 	return (1);
 }
 
 /*
  * Calculate max IPv6 MTU through all the interfaces and store it
  * to in6_maxmtu.
  *
  * Loopback interfaces and interfaces without AF_INET6 afdata are ignored.
  * V_in6_maxmtu keeps its previous value when no interface contributes.
  */
 void
 in6_setmaxmtu(void)
 {
 	struct epoch_tracker et;
 	unsigned long maxmtu = 0;
 	struct ifnet *ifp;

 	/* The interface list is walked inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* this function can be called during ifnet initialization */
 		if (!ifp->if_afdata[AF_INET6])
 			continue;
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
 		    IN6_LINKMTU(ifp) > maxmtu)
 			maxmtu = IN6_LINKMTU(ifp);
 	}
 	NET_EPOCH_EXIT(et);
 	if (maxmtu)	/* update only when maxmtu is positive */
 		V_in6_maxmtu = maxmtu;
 }
 
 /*
  * Provide the length of interface identifiers to be used for the link attached
  * to the given interface.  The length should be defined in "IPv6 over
  * xxx-link" document.  Note that address architecture might also define
  * the length for a particular set of address prefixes, regardless of the
  * link type.  As clarified in rfc2462bis, those two definitions should be
  * consistent, and those really are as of August 2004.
  */
 int
 in6_if2idlen(struct ifnet *ifp)
 {
 	/*
 	 * Every link type handled here yields a 64-bit interface identifier;
 	 * the known types are listed only so that unknown ones get reported.
 	 */
 	switch (ifp->if_type) {
 	case IFT_ETHER:		/* RFC2464 */
 	case IFT_PROPVIRTUAL:	/* XXX: no RFC. treat it as ether */
 	case IFT_L2VLAN:	/* ditto */
 	case IFT_BRIDGE:	/* bridge(4) only does Ethernet-like links */
 	case IFT_INFINIBAND:
 	case IFT_PPP:		/* RFC2472 */
 	case IFT_FRELAY:	/* RFC2590 */
 	case IFT_IEEE1394:	/* RFC3146 */
 	case IFT_GIF:		/* draft-ietf-v6ops-mech-v2-07 */
 	case IFT_LOOP:		/* XXX: is this really correct? */
 		break;
 	default:
 		/*
 		 * Unknown link type:
 		 * Returning an error would be the strictly compliant choice,
 		 * but a link type we merely do not know about (or a newly
 		 * standardized one) is very likely to use the common constant
 		 * of 64 as well.  As a compromise the constant is always
 		 * used, with an explicit notice for the "unknown" case.
 		 */
 		printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type);
 		break;
 	}

 	return (64);
 }
 
 /*
  * IPv6 link-layer table entry.  It only wraps the generic llentry, which
  * is the sole member; in6_lltable_new() allocates this type and hands out
  * a pointer to base.
  */
 struct in6_llentry {
 	struct llentry		base;
 };
 
 /* Hash size passed to lltable_allocate_htbl() for the IPv6 table. */
 #define	IN6_LLTBL_DEFAULT_HSIZE	32
 /*
  * XOR-fold the bytes of the key k together and reduce the result to a
  * bucket index by masking with (h - 1); the mask only covers every bucket
  * when h is a power of two.  Both arguments are parenthesized so that
  * compound expressions (e.g. "a | b") hash as a single value.  Note that
  * k is evaluated more than once.
  */
 #define	IN6_LLTBL_HASH(k, h) \
 	((((((((k) >> 8) ^ (k)) >> 8) ^ (k)) >> 8) ^ (k)) & ((h) - 1))
 
 /*
  * Do actual deallocation of @lle.
  *
  * Epoch callback (scheduled by in6_lltable_destroy_lle()): @ctx is the
  * lle_epoch_ctx member embedded in the entry being destroyed.
  */
 static void
 in6_lltable_destroy_lle_unlocked(epoch_context_t ctx)
 {
 	struct llentry *lle;

 	/* Recover the enclosing llentry from its embedded epoch context. */
 	lle = __containerof(ctx, struct llentry, lle_epoch_ctx);
 	LLE_LOCK_DESTROY(lle);
 	LLE_REQ_DESTROY(lle);
 	free(lle, M_LLTABLE);
 }
 
 /*
  * Called by LLE_FREE_LOCKED when number of references
  * drops to zero.
  *
  * Drops the entry's write lock and defers the actual deallocation to
  * in6_lltable_destroy_lle_unlocked() through NET_EPOCH_CALL().
  */
 static void
 in6_lltable_destroy_lle(struct llentry *lle)
 {

 	LLE_WUNLOCK(lle);
 	NET_EPOCH_CALL(in6_lltable_destroy_lle_unlocked, &lle->lle_epoch_ctx);
 }
 
 /*
  * Allocate a zero-filled in6_llentry for addr6 (M_NOWAIT) and initialize
  * its address, reference count (1), free callback, locks and timer.
  * Returns the embedded llentry, or NULL when the allocation fails.
  * The flags argument is not used here.
  */
 static struct llentry *
 in6_lltable_new(const struct in6_addr *addr6, u_int flags)
 {
 	struct in6_llentry *ent;
 	struct llentry *lle;

 	ent = malloc(sizeof(*ent), M_LLTABLE, M_NOWAIT | M_ZERO);
 	if (ent == NULL)		/* NB: caller generates msg */
 		return (NULL);

 	lle = &ent->base;
 	lle->r_l3addr.addr6 = *addr6;
 	lle->lle_refcnt = 1;
 	lle->lle_free = in6_lltable_destroy_lle;
 	LLE_LOCK_INIT(lle);
 	LLE_REQ_INIT(lle);
 	callout_init(&lle->lle_timer, 1);

 	return (lle);
 }
 
 /*
  * Tell whether lle is selected by the prefix saddr/smask under the given
  * request flags.  Returns 1 on a match and 0 otherwise.
  */
 static int
 in6_lltable_match_prefix(const struct sockaddr *saddr,
     const struct sockaddr *smask, u_int flags, struct llentry *lle)
 {
 	const struct in6_addr *pfx, *pfxmask, *entry_addr;
 	int want_static;

 	pfx = &((const struct sockaddr_in6 *)saddr)->sin6_addr;
 	pfxmask = &((const struct sockaddr_in6 *)smask)->sin6_addr;
 	entry_addr = &lle->r_l3addr.addr6;
 	want_static = (flags & LLE_STATIC) != 0;

 	/* Entries outside the prefix never match. */
 	if (!IN6_ARE_MASKED_ADDR_EQUAL(entry_addr, pfx, pfxmask))
 		return (0);

 	/*
 	 * Delete LLE_IFADDR records IFF address & flag matches.
 	 * Note that saddr is the interface address within the prefix
 	 * being matched.
 	 */
 	if ((lle->la_flags & LLE_IFADDR) != 0)
 		return (IN6_ARE_ADDR_EQUAL(pfx, entry_addr) && want_static);

 	/* flags & LLE_STATIC means deleting both dynamic and static entries */
 	return (want_static || (lle->la_flags & LLE_STATIC) == 0);
 }
 
 /*
  * Unlink lle from llt if it is still linked, then release it with
  * llentry_free().  The caller must hold the entry's write lock; when the
  * entry is linked, the interface afdata write lock is asserted as well.
  */
 static void
 in6_lltable_free_entry(struct lltable *llt, struct llentry *lle)
 {
 	/* Only read by the lock assertion below, hence __diagused. */
 	struct ifnet *ifp __diagused;

 	LLE_WLOCK_ASSERT(lle);
 	KASSERT(llt != NULL, ("lltable is NULL"));

 	/* Unlink entry from table */
 	if ((lle->la_flags & LLE_LINKED) != 0) {
 		ifp = llt->llt_ifp;
 		IF_AFDATA_WLOCK_ASSERT(ifp);
 		lltable_unlink_entry(llt, lle);
 	}

 	llentry_free(lle);
 }
 
 /*
  * Check that a neighbor entry for l3addr may be created on ifp.
  * Returns 0 when it may, or EINVAL (after logging) when the route for the
  * address is a gateway route or points at another interface and ifp has
  * no address covering it.  The flags argument is not used here.
  * Must be called inside the network epoch.
  */
 static int
 in6_lltable_rtcheck(struct ifnet *ifp,
 		    u_int flags,
 		    const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6;
 	struct nhop_object *nh;
 	struct in6_addr dst;
 	uint32_t scopeid;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int fibnum;

 	NET_EPOCH_ASSERT();
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));

 	sin6 = (const struct sockaddr_in6 *)l3addr;
 	in6_splitscope(&sin6->sin6_addr, &dst, &scopeid);
 	fibnum = V_rt_add_addr_allfibs ? RT_DEFAULT_FIB : ifp->if_fib;
 	nh = fib6_lookup(fibnum, &dst, scopeid, NHR_NONE, 0);

 	/* No route at all, or a direct route through ifp: accept. */
 	if (nh == NULL)
 		return (0);
 	if ((nh->nh_flags & NHF_GATEWAY) == 0 && nh->nh_ifp == ifp)
 		return (0);

 	/*
 	 * Create an ND6 cache for an IPv6 neighbor
 	 * that is not covered by our own prefix.
 	 */
 	if (ifaof_ifpforaddr(l3addr, ifp) != NULL)
 		return (0);

 	log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n",
 	    ip6_sprintf(ip6buf, &sin6->sin6_addr));
 	return (EINVAL);
 }
 
 /*
  * Hash an IPv6 address into a bucket index for a table of hsize buckets.
  * Only the last 32-bit word of the address (s6_addr32[3]) feeds the hash.
  */
 static inline uint32_t
 in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize)
 {

 	return (IN6_LLTBL_HASH(dst->s6_addr32[3], hsize));
 }
 
 /*
  * llt_hash callback: bucket index of lle, computed from its IPv6 address
  * with in6_lltable_hash_dst().
  */
 static uint32_t
 in6_lltable_hash(const struct llentry *lle, uint32_t hsize)
 {

 	return (in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize));
 }
 
 /*
  * llt_fill_sa_entry callback: write lle's IPv6 address into sa as a
  * sockaddr_in6.  Every field other than length, family and address is
  * left zero.
  */
 static void
 in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
 {
 	struct sockaddr_in6 *out = (struct sockaddr_in6 *)sa;

 	/* Start from an all-zero sockaddr_in6. */
 	bzero(out, sizeof(*out));
 	out->sin6_len = sizeof(*out);
 	out->sin6_family = AF_INET6;
 	out->sin6_addr = lle->r_l3addr.addr6;
 }
 
 /*
  * Find the entry for dst in llt's hash bucket, ignoring entries flagged
  * LLE_DELETED.  Returns the entry, or NULL when there is none.
  */
 static inline struct llentry *
 in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst)
 {
 	struct llentries *bucket;
 	struct llentry *cur;

 	bucket = &llt->lle_head[in6_lltable_hash_dst(dst, llt->llt_hsize)];
 	CK_LIST_FOREACH(cur, bucket, lle_next) {
 		if ((cur->la_flags & LLE_DELETED) == 0 &&
 		    IN6_ARE_ADDR_EQUAL(&cur->r_l3addr.addr6, dst))
 			return (cur);
 	}

 	return (NULL);
 }
 
 /*
  * llt_delete_entry callback: flag lle as deleted, leave the proxy-NDP
  * multicast group for published (LLE_PUB) entries, fire the lle_event
  * handlers with LLENTRY_DELETED and finally release the entry with
  * llentry_free().
  */
 static void
 in6_lltable_delete_entry(struct lltable *llt, struct llentry *lle)
 {

 	/* Flag first: in6_lltable_find_dst() skips LLE_DELETED entries. */
 	lle->la_flags |= LLE_DELETED;

 	/* Leave the solicited multicast group. */
 	if ((lle->la_flags & LLE_PUB) != 0)
 		in6_leave_proxy_ndp_mc(llt->llt_ifp, &lle->r_l3addr.addr6);
 	EVENTHANDLER_INVOKE(lle_event, lle, LLENTRY_DELETED);
 #ifdef DIAGNOSTIC
 	log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle);
 #endif
 	llentry_free(lle);
 }
 
 /*
  * llt_alloc_entry callback: create a new entry for l3addr with the given
  * flags.  Returns the entry, or NULL when the route check fails, the
  * allocation fails, or the link header for an LLE_IFADDR entry cannot be
  * computed.
  */
 static struct llentry *
 in6_lltable_alloc(struct lltable *llt, u_int flags,
 	const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
 	struct ifnet *ifp = llt->llt_ifp;
 	struct llentry *lle;
 	char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;

 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));

 	/*
 	 * A route that covers the given address must have
 	 * been installed 1st because we are doing a resolution,
 	 * verify this.
 	 */
 	/* The check is skipped for the interface's own addresses. */
 	if (!(flags & LLE_IFADDR) &&
 	    in6_lltable_rtcheck(ifp, flags, l3addr) != 0)
 		return (NULL);

 	lle = in6_lltable_new(&sin6->sin6_addr, flags);
 	if (lle == NULL) {
 		log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
 		return (NULL);
 	}
 	lle->la_flags = flags;
 	if ((flags & LLE_IFADDR) == LLE_IFADDR) {
 		/*
 		 * Interface address: prebuild the link header from the
 		 * interface's own link-layer address and mark the entry
 		 * static.
 		 */
 		linkhdrsize = LLE_MAX_LINKHDR;
 		if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp),
 		    linkhdr, &linkhdrsize, &lladdr_off) != 0) {
 			in6_lltable_free_entry(llt, lle);
 			return (NULL);
 		}
 		lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize,
 		    lladdr_off);
 		lle->la_flags |= LLE_STATIC;
 	}

 	/* Static entries (requested or just marked above) start REACHABLE. */
 	if ((lle->la_flags & LLE_STATIC) != 0)
 		lle->ln_state = ND6_LLINFO_REACHABLE;

 	return (lle);
 }
 
 /*
  * llt_lookup callback: find the entry for l3addr.
  *
  * flags selects how the result is returned: LLE_UNLOCKED returns it
  * without taking the entry lock, LLE_EXCLUSIVE returns it write-locked,
  * and otherwise it is returned read-locked.  LLE_UNLOCKED and
  * LLE_EXCLUSIVE must not be combined.  The bits of flags above the low
  * 16 carry the address family of the requested entry.
  * Returns NULL when no entry is found or when the entry was unlinked
  * while its lock was being acquired.
  */
 static struct llentry *
 in6_lltable_lookup(struct lltable *llt, u_int flags,
 	const struct sockaddr *l3addr)
 {
 	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
 	int family = flags >> 16;
 	struct llentry *lle;

 	IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
 	KASSERT(l3addr->sa_family == AF_INET6,
 	    ("sin_family %d", l3addr->sa_family));
 	KASSERT((flags & (LLE_UNLOCKED | LLE_EXCLUSIVE)) !=
 	    (LLE_UNLOCKED | LLE_EXCLUSIVE),
 	    ("wrong lle request flags: %#x", flags));

 	lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);

 	/* A family other than AF_INET6 is resolved from the found entry. */
 	if (__predict_false(family != AF_INET6))
 		lle = llentry_lookup_family(lle, family);

 	if (lle == NULL)
 		return (NULL);

 	if (flags & LLE_UNLOCKED)
 		return (lle);

 	if (flags & LLE_EXCLUSIVE)
 		LLE_WLOCK(lle);
 	else
 		LLE_RLOCK(lle);

 	/*
 	 * If the afdata lock is not held, the LLE may have been unlinked while
 	 * we were blocked on the LLE lock.  Check for this case.
 	 */
 	if (__predict_false((lle->la_flags & LLE_LINKED) == 0)) {
 		if (flags & LLE_EXCLUSIVE)
 			LLE_WUNLOCK(lle);
 		else
 			LLE_RUNLOCK(lle);
 		return (NULL);
 	}
 	return (lle);
 }
 
 /*
  * llt_dump_entry callback: emit one routing-socket style record for lle
  * through SYSCTL_OUT().  Entries flagged LLE_DELETED, and entries for
  * which prison_if() returns non-zero for the requesting credential, are
  * skipped silently.  Returns 0 for skipped entries, otherwise the result
  * of SYSCTL_OUT().
  */
 static int
 in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
     struct sysctl_req *wr)
 {
 	struct ifnet *ifp = llt->llt_ifp;
 	/* XXX stack use */
 	struct {
 		struct rt_msghdr	rtm;
 		struct sockaddr_in6	sin6;
 		/*
 		 * ndp.c assumes that sdl is word aligned
 		 */
 #ifdef __LP64__
 		uint32_t		pad;
 #endif
 		struct sockaddr_dl	sdl;
 	} ndpc;
 	struct sockaddr_dl *sdl;
 	int error;

 	/* Zero the whole record, padding included, before it is copied out. */
 	bzero(&ndpc, sizeof(ndpc));
 	/* skip deleted entries */
 	if ((lle->la_flags & LLE_DELETED) == LLE_DELETED)
 		return (0);
 	/* Skip if jailed and not a valid IP of the prison. */
 	lltable_fill_sa_entry(lle, (struct sockaddr *)&ndpc.sin6);
 	if (prison_if(wr->td->td_ucred, (struct sockaddr *)&ndpc.sin6) != 0)
 		return (0);
 	/*
 	 * produce a msg made of:
 	 *  struct rt_msghdr;
 	 *  struct sockaddr_in6 (IPv6)
 	 *  struct sockaddr_dl;
 	 */
 	ndpc.rtm.rtm_msglen = sizeof(ndpc);
 	ndpc.rtm.rtm_version = RTM_VERSION;
 	ndpc.rtm.rtm_type = RTM_GET;
 	ndpc.rtm.rtm_flags = RTF_UP;
 	ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY;
 	sa6_recoverscope(&ndpc.sin6);

 	/* publish */
 	if (lle->la_flags & LLE_PUB)
 		ndpc.rtm.rtm_flags |= RTF_ANNOUNCE;

 	sdl = &ndpc.sdl;
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_len = sizeof(*sdl);
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = ifp->if_type;
 	/* The link-layer address is reported only for LLE_VALID entries. */
 	if ((lle->la_flags & LLE_VALID) == LLE_VALID) {
 		sdl->sdl_alen = ifp->if_addrlen;
 		bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen);
 	} else {
 		sdl->sdl_alen = 0;
 		bzero(LLADDR(sdl), ifp->if_addrlen);
 	}
 	/*
 	 * NOTE(review): the "time_second - time_uptime" term looks like a
 	 * conversion from an uptime-based expiry to wall-clock time --
 	 * confirm against how la_expire is set.
 	 */
 	if (lle->la_expire != 0)
 		ndpc.rtm.rtm_rmx.rmx_expire = lle->la_expire +
 		    lle->lle_remtime / hz + time_second - time_uptime;
 	/* Translate the entry flags into routing flags. */
 	ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA);
 	if (lle->la_flags & LLE_STATIC)
 		ndpc.rtm.rtm_flags |= RTF_STATIC;
 	if (lle->la_flags & LLE_IFADDR)
 		ndpc.rtm.rtm_flags |= RTF_PINNED;
 	if (lle->ln_router != 0)
 		ndpc.rtm.rtm_flags |= RTF_GATEWAY;
 	ndpc.rtm.rtm_rmx.rmx_pksent = lle->la_asked;
 	/* Store state in rmx_weight value */
 	ndpc.rtm.rtm_rmx.rmx_state = lle->ln_state;
 	ndpc.rtm.rtm_index = ifp->if_index;
 	error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc));

 	return (error);
 }
 
 /*
  * llt_post_resolved callback: for published (LLE_PUB) entries, join the
  * solicited-node multicast group of the entry's address on the table's
  * interface.  Other entries need no work.
  */
 static void
 in6_lltable_post_resolved(struct lltable *llt, struct llentry *lle)
 {
 	/* Join the solicited multicast group for dst. */
 	if ((lle->la_flags & LLE_PUB) == LLE_PUB)
 		in6_join_proxy_ndp_mc(llt->llt_ifp, &lle->r_l3addr.addr6);
 }
 
 /*
  * Allocate the IPv6 link-layer table for ifp, install the in6_lltable_*
  * callbacks and register the table with lltable_link().
  * Returns the new table.
  */
 static struct lltable *
 in6_lltattach(struct ifnet *ifp)
 {
 	struct lltable *llt;

 	llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE);
 	llt->llt_af = AF_INET6;
 	llt->llt_ifp = ifp;

 	/* Address-family specific operations of the table. */
 	llt->llt_lookup = in6_lltable_lookup;
 	llt->llt_alloc_entry = in6_lltable_alloc;
 	llt->llt_delete_entry = in6_lltable_delete_entry;
 	llt->llt_dump_entry = in6_lltable_dump_entry;
 	llt->llt_hash = in6_lltable_hash;
 	llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry;
 	llt->llt_free_entry = in6_lltable_free_entry;
 	llt->llt_match_prefix = in6_lltable_match_prefix;
 	llt->llt_mark_used = llentry_mark_used;
 	llt->llt_post_resolved = in6_lltable_post_resolved;
 	/* Link the table only once every callback is in place. */
  	lltable_link(llt);

 	return (llt);
 }
 
 /*
  * Return the IPv6 link-layer table of ifp, or NULL when the interface has
  * no AF_INET6 afdata attached.
  */
 struct lltable *
 in6_lltable_get(struct ifnet *ifp)
 {
 	struct in6_ifextra *ext;

 	ext = (struct in6_ifextra *)ifp->if_afdata[AF_INET6];
 	if (ext == NULL)
 		return (NULL);

 	return (ext->lltable);
 }
 
 /*
  * Allocate and initialize the per-interface IPv6 state (struct
  * in6_ifextra) for ifp: interface and ICMPv6 statistics counters, ND
  * info, scope ids, link-layer table and MLD info.
  * Returns the new state, or NULL for interface types that get none.
  * All allocations use M_WAITOK.
  */
 void *
 in6_domifattach(struct ifnet *ifp)
 {
 	struct in6_ifextra *ext;

 	/* These interface types are not IPv6-capable. */
 	switch (ifp->if_type) {
 	case IFT_PFLOG:
 	case IFT_PFSYNC:
 	case IFT_USB:
 		return (NULL);
 	}
 	ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK);
 	bzero(ext, sizeof(*ext));

 	/* One counter_u64_t per uint64_t-sized slot of struct in6_ifstat. */
 	ext->in6_ifstat = malloc(sizeof(counter_u64_t) *
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK);
 	COUNTER_ARRAY_ALLOC(ext->in6_ifstat,
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK);

 	/* Likewise for struct icmp6_ifstat. */
 	ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) *
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR,
 	    M_WAITOK);
 	COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat,
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK);

 	ext->nd_ifinfo = nd6_ifattach(ifp);
 	ext->scope6_id = scope6_ifattach(ifp);
 	ext->lltable = in6_lltattach(ifp);

 	ext->mld_ifinfo = mld_domifattach(ifp);

 	return ext;
 }
 
 /*
  * Return the MTU to use for IPv6 on ifp: IN6_LINKMTU() when the interface
  * has AF_INET6 afdata, otherwise the plain interface MTU.
  */
 int
 in6_domifmtu(struct ifnet *ifp)
 {
 	if (ifp->if_afdata[AF_INET6] != NULL)
 		return (IN6_LINKMTU(ifp));

 	return ifp->if_mtu;
 }
 
 /*
  * Tear down the per-interface IPv6 state created by in6_domifattach().
  * aux is the struct in6_ifextra pointer to release.
  */
 void
 in6_domifdetach(struct ifnet *ifp, void *aux)
 {
 	struct in6_ifextra *ext = (struct in6_ifextra *)aux;

 	mld_domifdetach(ifp);
 	scope6_ifdetach(ext->scope6_id);
 	nd6_ifdetach(ifp, ext->nd_ifinfo);
 	lltable_free(ext->lltable);
 	/* Release the counters first, then the arrays that hold them. */
 	COUNTER_ARRAY_FREE(ext->in6_ifstat,
 	    sizeof(struct in6_ifstat) / sizeof(uint64_t));
 	free(ext->in6_ifstat, M_IFADDR);
 	COUNTER_ARRAY_FREE(ext->icmp6_ifstat,
 	    sizeof(struct icmp6_ifstat) / sizeof(uint64_t));
 	free(ext->icmp6_ifstat, M_IFADDR);
 	free(ext, M_IFADDR);
 }
 
 /*
  * Build an IPv4 sockaddr in *sin from *sin6.  The IPv4 address is taken
  * from the last 32-bit word of the IPv6 address, so *sin6 must hold a
  * v4-mapped or v4-compatible address.  The port is copied unchanged.
  */
 void
 in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
 {

 	bzero(sin, sizeof(*sin));
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(struct sockaddr_in);
 	sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
 	sin->sin_port = sin6->sin6_port;
 }
 
 /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */
 void
 in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
 {
 	bzero(sin6, sizeof(*sin6));
 	sin6->sin6_len = sizeof(struct sockaddr_in6);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_port = sin->sin_port;
 	sin6->sin6_addr.s6_addr32[0] = 0;
 	sin6->sin6_addr.s6_addr32[1] = 0;
 	sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP;
 	sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr;
 }
 
 /* Convert sockaddr_in6 into sockaddr_in. */
 void
 in6_sin6_2_sin_in_sock(struct sockaddr *nam)
 {
 	struct sockaddr_in *sin_p;
 	struct sockaddr_in6 sin6;
 
 	/*
 	 * Save original sockaddr_in6 addr and convert it
 	 * to sockaddr_in.
 	 */
 	sin6 = *(struct sockaddr_in6 *)nam;
 	sin_p = (struct sockaddr_in *)nam;
 	in6_sin6_2_sin(sin_p, &sin6);
 }
 
 /*
  * Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format.
  *
  * A new sockaddr_in6 is allocated from M_SONAME (M_WAITOK), filled from
  * the sockaddr_in at *nam, and stored in *nam; the old *nam is freed.
  */
 void
 in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
 {
 	struct sockaddr_in *sin_p;
 	struct sockaddr_in6 *sin6_p;

 	sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK);
 	sin_p = (struct sockaddr_in *)*nam;
 	in6_sin_2_v4mapsin6(sin_p, sin6_p);
 	/* The old sockaddr is released only after it has been converted. */
 	free(*nam, M_SONAME);
 	*nam = (struct sockaddr *)sin6_p;
 }
 
 /*
  * Join/leave the solicited multicast groups for proxy NDP entries.
  */
 /*
  * Join, on ifp, the solicited-node multicast group derived from dst.
  * A failed join is reported through nd6log(); nothing is returned to the
  * caller.
  */
 static void
 in6_join_proxy_ndp_mc(struct ifnet *ifp, const struct in6_addr *dst)
 {
 	/* Receives the membership from in6_joingroup(); not used afterwards. */
 	struct in6_multi *inm;
 	struct in6_addr mltaddr;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int error;

 	if (in6_solicited_node_maddr(&mltaddr, ifp, dst) != 0)
 		return;	/* error logged in in6_solicited_node_maddr. */

 	error = in6_joingroup(ifp, &mltaddr, NULL, &inm, 0);
 	if (error != 0) {
 		nd6log((LOG_WARNING,
 		    "%s: in6_joingroup failed for %s on %s (errno=%d)\n",
 		    __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp),
 		    error));
 	}
 }
 
 /*
  * Leave, on ifp, the solicited-node multicast group derived from dst.
  * A membership that cannot be found is reported through nd6log().
  */
 static void
 in6_leave_proxy_ndp_mc(struct ifnet *ifp, const struct in6_addr *dst)
 {
 	struct epoch_tracker et;
 	struct in6_multi *inm;
 	struct in6_addr mltaddr;
 	char ip6buf[INET6_ADDRSTRLEN];

 	if (in6_solicited_node_maddr(&mltaddr, ifp, dst) != 0)
 		return;	/* error logged in in6_solicited_node_maddr. */

 	/* Only the lookup runs inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	inm = in6m_lookup(ifp, &mltaddr);
 	NET_EPOCH_EXIT(et);
 	if (inm != NULL)
 		in6_leavegroup(inm, NULL);
 	else
 		nd6log((LOG_WARNING, "%s: in6m_lookup failed for %s on %s\n",
 		    __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp)));
 }
 
 /*
  * Match callback for lltable_delete_conditional(): selects entries that
  * have LLE_PUB set.  llt and farg are not used.
  */
 static bool
 in6_lle_match_pub(struct lltable *llt, struct llentry *lle, void *farg)
 {
 	return ((lle->la_flags & LLE_PUB) != 0);
 }
 
 /*
  * Delete every published (LLE_PUB) entry from ifp's IPv6 link-layer
  * table.  Does nothing when the interface has no AF_INET6 afdata or when
  * the table does not have LLT_ADDEDPROXY set.
  */
 void
 in6_purge_proxy_ndp(struct ifnet *ifp)
 {
 	struct lltable *llt;
 	bool need_purge;

 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return;

 	llt = LLTABLE6(ifp);
 	/* The table flags are read under the afdata write lock. */
 	IF_AFDATA_WLOCK(ifp);
 	need_purge = ((llt->llt_flags & LLT_ADDEDPROXY) != 0);
 	IF_AFDATA_WUNLOCK(ifp);

 	/*
 	 * Ever added proxy ndp entries, leave solicited node multicast
 	 * before deleting the llentry.
 	 */
 	if (need_purge)
 		lltable_delete_conditional(llt, in6_lle_match_pub, NULL);
 }
diff --git a/sys/netinet6/in6_fib.c b/sys/netinet6/in6_fib.c
index 8a0760aff02a..34aff5feda8e 100644
--- a/sys/netinet6/in6_fib.c
+++ b/sys/netinet6/in6_fib.c
@@ -1,349 +1,350 @@
 /*-
  * Copyright (c) 2015
  * 	Alexander V. Chernikov <melifaro@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/fib_algo.h>
 #include <net/route/nhop.h>
 #include <net/toeplitz.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 
 #include <net/if_types.h>
 
 #ifdef INET6
 
 CHK_STRUCT_ROUTE_COMPAT(struct route_in6, ro_dst);
 
 #ifdef FIB_ALGO
 VNET_DEFINE(struct fib_dp *, inet6_dp);
 #endif
 
 #ifdef ROUTE_MPATH
 /*
  * Input buffer hashed by fib6_calc_software_hash(): addresses, ports and
  * protocol of a flow.  The spare bytes round the size up to 40 bytes,
  * which the assertion below pins.
  */
 struct _hash_5tuple_ipv6 {
 	struct in6_addr src;
 	struct in6_addr dst;
 	unsigned short src_port;
 	unsigned short dst_port;
 	char proto;
 	char spare[3];
 };
 _Static_assert(sizeof(struct _hash_5tuple_ipv6) == 40,
     "_hash_5tuple_ipv6 size is wrong");
 
 /*
  * Compute a Toeplitz hash over the flow's addresses, ports and protocol,
  * keyed with mpath_entropy_key.  *phashtype is set to
  * M_HASHTYPE_OPAQUE_HASH.  Returns the hash value.
  */
 uint32_t
 fib6_calc_software_hash(const struct in6_addr *src, const struct in6_addr *dst,
     unsigned short src_port, unsigned short dst_port, char proto,
     uint32_t *phashtype)
 {
 	/* Members not named here (the spare bytes) are zero-initialized. */
 	struct _hash_5tuple_ipv6 data = {
 		.src = *src,
 		.dst = *dst,
 		.src_port = src_port,
 		.dst_port = dst_port,
 		.proto = proto,
 	};

 	*phashtype = M_HASHTYPE_OPAQUE_HASH;

 	return (toeplitz_hash(MPATH_ENTROPY_KEY_LEN, mpath_entropy_key,
 	  sizeof(data), (uint8_t *)&data));
 }
 #endif
 
 /*
  * Looks up path in fib @fibnum specified by @dst.
  * Assumes scope is deembedded and provided in @scopeid.
  *
  * Returns path nexthop on success. Nexthop is safe to use
  *  within the current network epoch. If longer lifetime is required,
  *  one needs to pass NHR_REF as a flag. This will return referenced
  *  nexthop.
  */
 #ifdef FIB_ALGO
 /*
  * FIB_ALGO variant: the lookup is delegated to the per-fib datapath
  * function stored in V_inet6_dp[fibnum].  Returns NULL, after bumping the
  * rts_unreach statistic, when no nexthop is found or when its interface
  * fails RT_LINK_IS_UP().
  */
 struct nhop_object *
 fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid, uint32_t flags, uint32_t flowid)
 {
 	struct nhop_object *nh;
 	struct fib_dp *dp = &V_inet6_dp[fibnum];
 	struct flm_lookup_key key = {.addr6 = dst6 };

 	nh = dp->f(dp->arg, key, scopeid);
 	if (nh != NULL) {
 		/* Pick one nexthop for this flow before checking the link. */
 		nh = nhop_select(nh, flowid);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(nh->nh_ifp)) {
 			if (flags & NHR_REF)
 				nhop_ref_object(nh);
 			return (nh);
 		}
 	}
 	RTSTAT_INC(rts_unreach);
 	return (NULL);
 }
 #else
 /*
  * Radix-tree variant (no FIB_ALGO): the lookup is done under the RIB read
  * lock.  Returns NULL when the fib has no AF_INET6 table; it also returns
  * NULL, after bumping the rts_unreach statistic, when no non-root node
  * matches or when the selected nexthop's interface fails RT_LINK_IS_UP().
  */
 struct nhop_object *
 fib6_lookup(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid, uint32_t flags, uint32_t flowid)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct nhop_object *nh;

 	KASSERT((fibnum < rt_numfibs), ("fib6_lookup: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (NULL);

 	/* Lookup key; only the length and address are filled in. */
 	struct sockaddr_in6 sin6 = {
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = *dst6,
 	};

 	/* Assume scopeid is valid and embed it directly */
 	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
 		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);

 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		nh = nhop_select((RNTORT(rn))->rt_nhop, flowid);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(nh->nh_ifp)) {
 			/* The reference is taken while the lock is still held. */
 			if (flags & NHR_REF)
 				nhop_ref_object(nh);
 			RIB_RUNLOCK(rh);
 			return (nh);
 		}
 	}
 	RIB_RUNLOCK(rh);

 	RTSTAT_INC(rts_unreach);
 	return (NULL);
 }
 #endif
 
 /*
  * Reverse-path check of a single nexthop.
  * With a source interface, the nexthop passes only when its nh_aifp is
  * that interface.  Without one, it passes unless NHR_NODEFAULT is
  * requested and the nexthop is flagged NHF_DEFAULT.
  * Returns 1 when the nexthop passes, 0 otherwise.
  */
 inline static int
 check_urpf_nhop(const struct nhop_object *nh, uint32_t flags,
     const struct ifnet *src_if)
 {

 	if (src_if != NULL)
 		return (nh->nh_aifp == src_if);

 	return ((flags & NHR_NODEFAULT) == 0 ||
 	    (nh->nh_flags & NHF_DEFAULT) == 0);
 }
 
 /*
  * Reverse-path check of a lookup result.  With ROUTE_MPATH, a nexthop
  * group passes as soon as one of its members passes check_urpf_nhop();
  * a plain nexthop is checked directly.  Returns 1 on pass, 0 otherwise.
  */
 static int
 check_urpf(struct nhop_object *nh, uint32_t flags,
     const struct ifnet *src_if)
 {
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		const struct weightened_nhop *wn;
 		uint32_t num_nhops;
 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 		for (int i = 0; i < num_nhops; i++) {
 			if (check_urpf_nhop(wn[i].nh, flags, src_if) != 0)
 				return (1);
 		}
 		return (0);
 	} else
 #endif
 		/* Also the body of the "else" above when ROUTE_MPATH is set. */
 		return (check_urpf_nhop(nh, flags, src_if));
 }
 
 #ifndef FIB_ALGO
 /*
  * Radix-tree lookup helper for fib6_check_urpf(): return the raw rt_nhop
  * of the route matching dst6 in fib fibnum, or NULL when the fib has no
  * AF_INET6 table or no non-root node matches.  No nexthop selection and
  * no link-state check are done here.
  * NOTE(review): the pointer is returned after the RIB lock is dropped;
  * presumably the caller relies on the network epoch -- confirm.
  */
 static struct nhop_object *
 lookup_nhop(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct nhop_object *nh;

 	KASSERT((fibnum < rt_numfibs), ("fib6_check_urpf: bad fibnum"));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (NULL);

 	/* Prepare lookup key */
 	struct sockaddr_in6 sin6 = {
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = *dst6,
 	};

 	/* Assume scopeid is valid and embed it directly */
 	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
 		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);

 	nh = NULL;
 	RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0))
 		nh = RNTORT(rn)->rt_nhop;
 	RIB_RUNLOCK(rh);

 	return (nh);
 }
 #endif
 
 /*
  * Performs reverse path forwarding lookup.
  * If @src_if is non-NULL, verifies that at least 1 path goes via
  *   this interface.
  * If @src_if is NULL, verifies that a route exists.
  * If @flags contains NHR_NODEFAULT, do not consider the default route.
  *
  * Returns 1 if a route matching the conditions is found, 0 otherwise.
  */
 int
 fib6_check_urpf(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid, uint32_t flags, const struct ifnet *src_if)
 {
 	struct nhop_object *nh;
 	/* The lookup backend is chosen at build time. */
 #ifdef FIB_ALGO
 	struct fib_dp *dp = &V_inet6_dp[fibnum];
 	struct flm_lookup_key key = {.addr6 = dst6 };

 	nh = dp->f(dp->arg, key, scopeid);
 #else
 	nh = lookup_nhop(fibnum, dst6, scopeid);
 #endif
 	if (nh != NULL)
 		return (check_urpf(nh, flags, src_if));
 	return (0);
 }
 
 /*
  * Function returning prefix match data along with the nexthop data.
  * Intended to be used by the control plane code.
  * Supported flags:
  *  NHR_UNLOCKED: do not lock radix during lookup.
  * Returns pointer to rtentry and raw nexthop in @rnd. Both rtentry
  *  and nexthop are safe to use within current epoch.
  * Note: rnd_nhop can actually be the nexthop group.
  *
  * Returns NULL, leaving @rnd untouched, when the fib has no AF_INET6
  * table or when no non-root node matches.
  */
 struct rtentry *
 fib6_lookup_rt(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid, uint32_t flags, struct route_nhop_data *rnd)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rh;
 	struct radix_node *rn;
 	struct rtentry *rt;

 	/* Report the function that actually failed the assertion. */
 	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
 	rh = rt_tables_get_rnh(fibnum, AF_INET6);
 	if (rh == NULL)
 		return (NULL);

 	/* Lookup key; only the length and address are filled in. */
 	struct sockaddr_in6 sin6 = {
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = *dst6,
 	};

 	/* Assume scopeid is valid and embed it directly */
 	if (IN6_IS_SCOPE_LINKLOCAL(dst6))
 		sin6.sin6_addr.s6_addr16[1] = htons(scopeid & 0xffff);

 	rt = NULL;
 	if (!(flags & NHR_UNLOCKED))
 		RIB_RLOCK(rh);
 	rn = rh->rnh_matchaddr((void *)&sin6, &rh->head);
 	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		/* Same node-to-route conversion as the other lookups here. */
 		rt = RNTORT(rn);
 		rnd->rnd_nhop = rt->rt_nhop;
 		rnd->rnd_weight = rt->rt_weight;
 	}
 	if (!(flags & NHR_UNLOCKED))
 		RIB_RUNLOCK(rh);

 	return (rt);
 }
 
 /*
  * Lookup variant that never takes the RIB lock: it always passes
  * NHR_UNLOCKED to fib6_lookup_rt() and always selects with flow id 0.
  * The flags argument is not used.  Returns the nexthop, or NULL when no
  * route matches or the nexthop's interface fails RT_LINK_IS_UP().
  */
 struct nhop_object *
 fib6_lookup_debugnet(uint32_t fibnum, const struct in6_addr *dst6,
     uint32_t scopeid, uint32_t flags)
 {
 	struct rtentry *rt;
 	struct route_nhop_data rnd;

 	rt = fib6_lookup_rt(fibnum, dst6, scopeid, NHR_UNLOCKED, &rnd);
 	if (rt != NULL) {
 		struct nhop_object *nh = nhop_select(rnd.rnd_nhop, 0);
 		/* Ensure route & ifp is UP */
 		if (RT_LINK_IS_UP(nh->nh_ifp))
 			return (nh);
 	}

 	return (NULL);
 }
 
 #endif
diff --git a/sys/netinet6/in6_gif.c b/sys/netinet6/in6_gif.c
index 54cb81c6130f..04f136b6bed5 100644
--- a/sys/netinet6/in6_gif.c
+++ b/sys/netinet6/in6_gif.c
@@ -1,484 +1,485 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_gif.c,v 1.49 2001/05/14 14:02:17 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #ifdef INET
 #include <netinet/ip.h>
 #include <netinet/ip_ecn.h>
 #endif
 #include <netinet/ip_encap.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/ip6_ecn.h>
 #include <netinet6/in6_fib.h>
 
 #include <net/if_gif.h>
 
 /* Initial value of the hop limit written into outer IPv6 headers. */
 #define GIF_HLIM	30
 /* Per-VNET hop limit; read by in6_gif_output() and tunable via sysctl. */
 VNET_DEFINE_STATIC(int, ip6_gif_hlim) = GIF_HLIM;
 #define	V_ip6_gif_hlim			VNET(ip6_gif_hlim)
 
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_GIF_HLIM, gifhlim,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_gif_hlim), 0,
     "Default hop limit for encapsulated packets");
 
 /*
  * We keep interfaces in a hash table using src+dst as key.
  * Interfaces with GIF_IGNORE_SOURCE flag are linked into plain list.
  * A second table, keyed on the source address alone, links tunnels
  * through their srchash entry.  The table pointers start out NULL and
  * are allocated on first configuration in in6_gif_ioctl().
  */
 VNET_DEFINE_STATIC(struct gif_list *, ipv6_hashtbl) = NULL;
 VNET_DEFINE_STATIC(struct gif_list *, ipv6_srchashtbl) = NULL;
 VNET_DEFINE_STATIC(struct gif_list, ipv6_list) = CK_LIST_HEAD_INITIALIZER();
 #define	V_ipv6_hashtbl		VNET(ipv6_hashtbl)
 #define	V_ipv6_srchashtbl	VNET(ipv6_srchashtbl)
 #define	V_ipv6_list		VNET(ipv6_list)
 
 /*
  * Bucket selectors.  Both tables are indexed by a hash value masked with
  * (GIF_HASH_SIZE - 1).  GIF_HASH() keys on the (src, dst) pair,
  * GIF_SRCHASH() on the source address only, and GIF_HASH_SC() applies
  * GIF_HASH() to the addresses stored in a softc's IPv6 header template.
  */
 #define	GIF_HASH(src, dst)	(V_ipv6_hashtbl[\
     in6_gif_hashval((src), (dst)) & (GIF_HASH_SIZE - 1)])
 #define	GIF_SRCHASH(src)	(V_ipv6_srchashtbl[\
     fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GIF_HASH_SIZE - 1)])
 #define	GIF_HASH_SC(sc)		GIF_HASH(&(sc)->gif_ip6hdr->ip6_src,\
     &(sc)->gif_ip6hdr->ip6_dst)
 /*
  * Hash an endpoint pair: run fnv_32_buf() over @src starting from
  * FNV1_32_INIT, then continue the same hash over @dst.
  */
 static uint32_t
 in6_gif_hashval(const struct in6_addr *src, const struct in6_addr *dst)
 {
 	const uint32_t src_hash = fnv_32_buf(src, sizeof(*src), FNV1_32_INIT);
 
 	return (fnv_32_buf(dst, sizeof(*dst), src_hash));
 }
 
 /*
  * Check whether the (src, dst) pair may be assigned to @sc.
  *
  * Returns EEXIST when @sc is an AF_INET6 tunnel already using exactly this
  * pair, EADDRNOTAVAIL when a different softc in the matching hash bucket
  * uses it, and 0 when the pair is free.
  */
 static int
 in6_gif_checkdup(const struct gif_softc *sc, const struct in6_addr *src,
     const struct in6_addr *dst)
 {
 	struct gif_softc *other;
 
 	/* The request repeats what this tunnel is already configured with. */
 	if (sc->gif_family == AF_INET6 &&
 	    IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, src) &&
 	    IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst, dst))
 		return (EEXIST);
 
 	CK_LIST_FOREACH(other, &GIF_HASH(src, dst), chain) {
 		if (other != sc &&
 		    IN6_ARE_ADDR_EQUAL(&other->gif_ip6hdr->ip6_src, src) &&
 		    IN6_ARE_ADDR_EQUAL(&other->gif_ip6hdr->ip6_dst, dst))
 			return (EADDRNOTAVAIL);
 	}
 	return (0);
 }
 
 /*
  * Check that ingress address belongs to local host.
  */
 static void
 in6_gif_set_running(struct gif_softc *sc)
 {
 
 	if (in6_localip(&sc->gif_ip6hdr->ip6_src))
 		GIF2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
 	else
 		GIF2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 }
 
 /*
  * ifaddr_event handler.
  * Re-evaluates IFF_DRV_RUNNING for every tunnel whose ingress address
  * equals the address carried in @sa, so that the flag is cleared when the
  * ingress address disappears (prevents source address spoofing).
  * The @event argument is not examined.
  */
 static void
 in6_gif_srcaddr(void *arg __unused, const struct sockaddr *sa, int event)
 {
 	const struct in6_addr *changed;
 	struct gif_softc *sc;
 
 	/* Check that VNET is ready */
 	if (V_ipv6_hashtbl == NULL)
 		return;
 
 	NET_EPOCH_ASSERT();
 	changed = &((const struct sockaddr_in6 *)sa)->sin6_addr;
 	CK_LIST_FOREACH(sc, &GIF_SRCHASH(changed), srchash) {
 		/* The bucket may hold tunnels with other source addresses. */
 		if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src, changed))
 			in6_gif_set_running(sc);
 	}
 }
 
 /*
  * Link @sc into the lookup structures: the plain list when
  * GIF_IGNORE_SOURCE is set, the (src, dst) hash bucket otherwise, and in
  * both cases the source-address hash bucket.
  */
 static void
 in6_gif_attach(struct gif_softc *sc)
 {
 	struct gif_list *bucket;
 
 	if ((sc->gif_options & GIF_IGNORE_SOURCE) != 0)
 		bucket = &V_ipv6_list;
 	else
 		bucket = &GIF_HASH_SC(sc);
 	CK_LIST_INSERT_HEAD(bucket, sc, chain);
 
 	CK_LIST_INSERT_HEAD(&GIF_SRCHASH(&sc->gif_ip6hdr->ip6_src),
 	    sc, srchash);
 }
 
 /*
  * Apply new tunnel options.  Only a change of GIF_IGNORE_SOURCE has an
  * effect here: the softc is unlinked, the options are stored and the softc
  * is linked again so that it lands on the right list.  When that bit is
  * unchanged nothing is modified.  Always returns 0.
  */
 int
 in6_gif_setopts(struct gif_softc *sc, u_int options)
 {
 
 	/* NOTE: we are protected with gif_ioctl_sx lock */
 	MPASS(sc->gif_family == AF_INET6);
 	MPASS(sc->gif_options != options);
 
 	/* Nothing to relink unless the GIF_IGNORE_SOURCE bit flips. */
 	if (((options ^ sc->gif_options) & GIF_IGNORE_SOURCE) == 0)
 		return (0);
 
 	CK_LIST_REMOVE(sc, srchash);
 	CK_LIST_REMOVE(sc, chain);
 	sc->gif_options = options;
 	in6_gif_attach(sc);
 	return (0);
 }
 
 /*
  * IPv6-specific tunnel ioctl handler.
  *
  * SIOCSIFPHYADDR_IN6 configures the outer source/destination addresses
  * taken from the struct in6_aliasreq in @data.
  * SIOCGIFPSRCADDR_IN6 / SIOCGIFPDSTADDR_IN6 copy the configured outer
  * source or destination address into the struct in6_ifreq in @data.
  *
  * Returns 0 on success.  EINVAL is returned for any other command and for
  * a set request that fails the family/length sanity checks;
  * EADDRNOTAVAIL for unspecified addresses, for a pair already used by
  * another tunnel, and for a get request on a tunnel that is not AF_INET6.
  * Errors from sa6_embedscope(), prison_if() and sa6_recoverscope() are
  * passed through.
  */
 int
 in6_gif_ioctl(struct gif_softc *sc, u_long cmd, caddr_t data)
 {
 	struct in6_ifreq *ifr = (struct in6_ifreq *)data;
 	struct sockaddr_in6 *dst, *src;
 	struct ip6_hdr *ip6;
 	int error;
 
 	/* NOTE: we are protected with gif_ioctl_sx lock */
 	error = EINVAL;
 	switch (cmd) {
 	case SIOCSIFPHYADDR_IN6:
 		src = &((struct in6_aliasreq *)data)->ifra_addr;
 		dst = &((struct in6_aliasreq *)data)->ifra_dstaddr;
 
 		/* sanity checks */
 		if (src->sin6_family != dst->sin6_family ||
 		    src->sin6_family != AF_INET6 ||
 		    src->sin6_len != dst->sin6_len ||
 		    src->sin6_len != sizeof(*src))
 			break;
 		if (IN6_IS_ADDR_UNSPECIFIED(&src->sin6_addr) ||
 		    IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr)) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		/*
 		 * Check validity of the scope zone ID of the
 		 * addresses, and convert it into the kernel
 		 * internal form if necessary.
 		 */
 		if ((error = sa6_embedscope(src, 0)) != 0 ||
 		    (error = sa6_embedscope(dst, 0)) != 0)
 			break;
 
 		/* Hash tables are allocated on the first configuration. */
 		if (V_ipv6_hashtbl == NULL) {
 			V_ipv6_hashtbl = gif_hashinit();
 			V_ipv6_srchashtbl = gif_hashinit();
 		}
 		error = in6_gif_checkdup(sc, &src->sin6_addr,
 		    &dst->sin6_addr);
 		if (error == EADDRNOTAVAIL)
 			break;
 		if (error == EEXIST) {
 			/* Addresses are the same. Just return. */
 			error = 0;
 			break;
 		}
 		/* Build the header template that in6_gif_output() copies. */
 		ip6 = malloc(sizeof(*ip6), M_GIF, M_WAITOK | M_ZERO);
 		ip6->ip6_src = src->sin6_addr;
 		ip6->ip6_dst = dst->sin6_addr;
 		ip6->ip6_vfc = IPV6_VERSION;
 		if (sc->gif_family != 0) {
 			/* Detach existing tunnel first */
 			CK_LIST_REMOVE(sc, srchash);
 			CK_LIST_REMOVE(sc, chain);
 			/*
 			 * Unlink, then GIF_WAIT(), then free: presumably the
 			 * wait lets concurrent readers of the old header
 			 * drain before it is released -- TODO confirm.
 			 */
 			GIF_WAIT();
 			free(sc->gif_hdr, M_GIF);
 			/* XXX: should we notify about link state change? */
 		}
 		sc->gif_family = AF_INET6;
 		sc->gif_ip6hdr = ip6;
 		in6_gif_attach(sc);
 		in6_gif_set_running(sc);
 		break;
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 		if (sc->gif_family != AF_INET6) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		src = (struct sockaddr_in6 *)&ifr->ifr_addr;
 		memset(src, 0, sizeof(*src));
 		src->sin6_family = AF_INET6;
 		src->sin6_len = sizeof(*src);
 		src->sin6_addr = (cmd == SIOCGIFPSRCADDR_IN6) ?
 		    sc->gif_ip6hdr->ip6_src: sc->gif_ip6hdr->ip6_dst;
 		error = prison_if(curthread->td_ucred, (struct sockaddr *)src);
 		if (error == 0)
 			error = sa6_recoverscope(src);
 		/* On any failure hand back a zeroed address, not a partial one. */
 		if (error != 0)
 			memset(src, 0, sizeof(*src));
 		break;
 	}
 	return (error);
 }
 
 /*
  * Encapsulate @m for transmission: prepend an outer IPv6 header copied
  * from the softc's template, fill in ECN, next header (@proto) and hop
  * limit, and pass the packet to ip6_output().
  *
  * Returns ENOBUFS when the header cannot be prepended, otherwise the
  * result of ip6_output().
  */
 int
 in6_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
 {
 	struct gif_softc *sc = ifp->if_softc;
 	struct ip6_hdr *ip6;
 	int len;
 
 	/* prepend new IP header */
 	NET_EPOCH_ASSERT();
 	len = sizeof(struct ip6_hdr);
 #ifndef __NO_STRICT_ALIGNMENT
 	/* Reserve ETHERIP_ALIGN spare bytes so the header can be shifted. */
 	if (proto == IPPROTO_ETHERIP)
 		len += ETHERIP_ALIGN;
 #endif
 	M_PREPEND(m, len, M_NOWAIT);
 	if (m == NULL)
 		return (ENOBUFS);
 #ifndef __NO_STRICT_ALIGNMENT
 	if (proto == IPPROTO_ETHERIP) {
 		/*
 		 * Advance the data pointer by the low two address bits
 		 * (asserted to be 0 or ETHERIP_ALIGN) and give back the
 		 * spare bytes reserved above.
 		 */
 		len = mtod(m, vm_offset_t) & 3;
 		KASSERT(len == 0 || len == ETHERIP_ALIGN,
 		    ("in6_gif_output: unexpected misalignment"));
 		m->m_data += len;
 		m->m_len -= ETHERIP_ALIGN;
 	}
 #endif
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	MPASS(sc->gif_family == AF_INET6);
 	bcopy(sc->gif_ip6hdr, ip6, sizeof(struct ip6_hdr));
 
 	/* ECN is merged into the flow word, shifted left by 20 bits. */
 	ip6->ip6_flow  |= htonl((uint32_t)ecn << 20);
 	ip6->ip6_nxt	= proto;
 	ip6->ip6_hlim	= V_ip6_gif_hlim;
 	/*
 	 * force fragmentation to minimum MTU, to avoid path MTU discovery.
 	 * it is too painful to ask for resend of inner packet, to achieve
 	 * path MTU discovery for encapsulated packets.
 	 */
 	return (ip6_output(m, 0, NULL, IPV6_MINMTU, 0, NULL, NULL));
 }
 
 /*
  * Encap input callback.  @arg is the softc chosen by in6_gif_lookup().
  * When there is a softc and its interface is IFF_UP, the traffic class of
  * the outer header is extracted, @off bytes are trimmed from the front
  * and the packet goes to gif_input().  Otherwise the mbuf is freed and
  * ip6s_nogif is bumped.  Always returns IPPROTO_DONE.
  */
 static int
 in6_gif_input(struct mbuf *m, int off, int proto, void *arg)
 {
 	struct gif_softc *sc = arg;
 	struct ifnet *gifp;
 	struct ip6_hdr *ip6;
 	uint8_t ecn;
 
 	NET_EPOCH_ASSERT();
 	if (sc == NULL)
 		goto drop;
 	gifp = GIF2IFP(sc);
 	if ((gifp->if_flags & IFF_UP) == 0)
 		goto drop;
 
 	/* Read the traffic class before the outer header is stripped. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ecn = IPV6_TRAFFIC_CLASS(ip6);
 	m_adj(m, off);
 	gif_input(m, gifp, proto, ecn);
 	return (IPPROTO_DONE);
 
 drop:
 	m_freem(m);
 	IP6STAT_INC(ip6s_nogif);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Encap lookup callback: find the tunnel that an inbound packet belongs
  * to, based on the outer IPv6 header at the start of @m.
  *
  * Returns 0 when no tunnel matches, when the matching interface is not
  * IFF_UP, or when the outer source fails the fib6_check_urpf() ingress
  * check (skipped if IFF_LINK2 is set on the interface).  Otherwise the
  * softc is stored in *@arg and the return value is ENCAP_DRV_LOOKUP for an
  * exact (src, dst) match, or 128 + 8 for a match on the
  * GIF_IGNORE_SOURCE list.  The @off and @proto arguments are not used.
  */
 static int
 in6_gif_lookup(const struct mbuf *m, int off, int proto, void **arg)
 {
 	const struct ip6_hdr *ip6;
 	struct gif_softc *sc;
 	int ret;
 
 	/* Hash tables not allocated yet: no tunnel has been configured. */
 	if (V_ipv6_hashtbl == NULL)
 		return (0);
 
 	NET_EPOCH_ASSERT();
 	/*
 	 * NOTE: it is safe to iterate without any locking here, because softc
 	 * can be reclaimed only when we are not within net_epoch_preempt
 	 * section, but ip_encap lookup+input are executed in epoch section.
 	 */
 	ip6 = mtod(m, const struct ip6_hdr *);
 	ret = 0;
 	CK_LIST_FOREACH(sc, &GIF_HASH(&ip6->ip6_dst, &ip6->ip6_src), chain) {
 		/*
 		 * This is an inbound packet, its ip6_dst is source address
 		 * in softc.
 		 */
 		if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src,
 		    &ip6->ip6_dst) &&
 		    IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_dst,
 		    &ip6->ip6_src)) {
 			ret = ENCAP_DRV_LOOKUP;
 			goto done;
 		}
 	}
 	/*
 	 * No exact match.
 	 * Check the list of interfaces with GIF_IGNORE_SOURCE flag.
 	 */
 	CK_LIST_FOREACH(sc, &V_ipv6_list, chain) {
 		if (IN6_ARE_ADDR_EQUAL(&sc->gif_ip6hdr->ip6_src,
 		    &ip6->ip6_dst)) {
 			ret = 128 + 8; /* src + proto */
 			goto done;
 		}
 	}
 	return (0);
 done:
 	/* sc is the matching softc here; ret holds the match value. */
 	if ((GIF2IFP(sc)->if_flags & IFF_UP) == 0)
 		return (0);
 	/* ingress filters on outer source */
 	if ((GIF2IFP(sc)->if_flags & IFF_LINK2) == 0) {
 		if (fib6_check_urpf(sc->gif_fibnum, &ip6->ip6_src,
 		    ntohs(in6_getscope(&ip6->ip6_src)), NHR_NONE,
 		    m->m_pkthdr.rcvif) == 0)
 			return (0);
 	}
 	*arg = sc;
 	return (ret);
 }
 
 /* Handle returned by ip6_encap_register_srcaddr() in in6_gif_init(). */
 static const struct srcaddrtab *ipv6_srcaddrtab;
 /*
  * One encap registration per inner protocol: IPv4 (only when INET is
  * configured), IPv6 and EtherIP.  All entries share the same lookup and
  * input callbacks; min_length is the outer IPv6 header plus the inner
  * header(s) named in each entry.  The cookie is stored by in6_gif_init()
  * and passed to ip6_encap_detach() by in6_gif_uninit().
  */
 static struct {
 	const struct encap_config encap;
 	const struct encaptab *cookie;
 } ipv6_encap_cfg[] = {
 #ifdef INET
 	{
 		.encap = {
 			.proto = IPPROTO_IPV4,
 			.min_length = sizeof(struct ip6_hdr) +
 			    sizeof(struct ip),
 			.exact_match = ENCAP_DRV_LOOKUP,
 			.lookup = in6_gif_lookup,
 			.input = in6_gif_input
 		},
 	},
 #endif
 	{
 		.encap = {
 			.proto = IPPROTO_IPV6,
 			.min_length = 2 * sizeof(struct ip6_hdr),
 			.exact_match = ENCAP_DRV_LOOKUP,
 			.lookup = in6_gif_lookup,
 			.input = in6_gif_input
 		},
 	},
 	{
 		.encap = {
 			.proto = IPPROTO_ETHERIP,
 			.min_length = sizeof(struct ip6_hdr) +
 			    sizeof(struct etherip_header) +
 			    sizeof(struct ether_header),
 			.exact_match = ENCAP_DRV_LOOKUP,
 			.lookup = in6_gif_lookup,
 			.input = in6_gif_input
 		},
 	}
 };
 
 /*
  * Register the source-address callback and attach every entry of
  * ipv6_encap_cfg[].  The handles are kept in plain file-scope statics,
  * and only the default VNET performs the registration.
  */
 void
 in6_gif_init(void)
 {
 	int idx;
 
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	ipv6_srcaddrtab = ip6_encap_register_srcaddr(in6_gif_srcaddr,
 	    NULL, M_WAITOK);
 	for (idx = 0; idx < nitems(ipv6_encap_cfg); idx++) {
 		ipv6_encap_cfg[idx].cookie =
 		    ip6_encap_attach(&ipv6_encap_cfg[idx].encap, NULL,
 		    M_WAITOK);
 	}
 }
 
 /*
  * Undo in6_gif_init() (default VNET only) and release this VNET's hash
  * tables if they were ever allocated.
  */
 void
 in6_gif_uninit(void)
 {
 	int idx;
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		for (idx = 0; idx < nitems(ipv6_encap_cfg); idx++)
 			ip6_encap_detach(ipv6_encap_cfg[idx].cookie);
 		ip6_encap_unregister_srcaddr(ipv6_srcaddrtab);
 	}
 
 	/* The tables exist only if a tunnel was configured in this VNET. */
 	if (V_ipv6_hashtbl == NULL)
 		return;
 
 	/*
 	 * The order matters: the main table pointer is cleared first (the
 	 * callbacks test it for NULL), and GIF_WAIT() runs before the
 	 * source-address table is destroyed.
 	 */
 	gif_hashdestroy(V_ipv6_hashtbl);
 	V_ipv6_hashtbl = NULL;
 	GIF_WAIT();
 	gif_hashdestroy(V_ipv6_srchashtbl);
 }
diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c
index 629509f61ac1..b0b2e4e95985 100644
--- a/sys/netinet6/in6_ifattach.c
+++ b/sys/netinet6/in6_ifattach.c
@@ -1,904 +1,905 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_ifattach.c,v 1.118 2001/05/24 07:44:00 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/rmlock.h>
 #include <sys/syslog.h>
 #include <sys/md5.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 /* Per-VNET value, initially 0; it is not updated in this part of the file. */
 VNET_DEFINE(unsigned long, in6_maxmtu) = 0;
 
 /*
  * Per-VNET switch named for automatic link-local configuration.  Defaults
  * to 1 unless the build defines IP6_AUTO_LINKLOCAL to another value.
  */
 #ifdef IP6_AUTO_LINKLOCAL
 VNET_DEFINE(int, ip6_auto_linklocal) = IP6_AUTO_LINKLOCAL;
 #else
 VNET_DEFINE(int, ip6_auto_linklocal) = 1;	/* enabled by default */
 #endif
 
 /* Per-VNET callout; presumably drives temporary-address regeneration. */
 VNET_DEFINE(struct callout, in6_tmpaddrtimer_ch);
 #define	V_in6_tmpaddrtimer_ch		VNET(in6_tmpaddrtimer_ch)
 
 /* Defined elsewhere; only declared here. */
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo			VNET(ripcbinfo)
 
 /* File-local helpers. */
 static int get_rand_ifid(struct ifnet *, struct in6_addr *);
 static int generate_tmp_ifid(u_int8_t *, const u_int8_t *, u_int8_t *);
 static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *);
 static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *);
 static int in6_ifattach_loopback(struct ifnet *);
 static void in6_purgemaddrs(struct ifnet *);
 
 /*
  * Bit masks and predicates on byte 8 of an in6_addr, i.e. the first byte
  * of the 64-bit interface identifier.  EUI64_GBIT is the "g"
  * (group/individual) bit and EUI64_UBIT the "u" (universal/local) bit.
  */
 #define EUI64_GBIT	0x01
 #define EUI64_UBIT	0x02
 /* Convert an EUI-64 into an interface identifier by flipping the u bit. */
 #define EUI64_TO_IFID(in6)	do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (0)
 #define EUI64_GROUP(in6)	((in6)->s6_addr[8] & EUI64_GBIT)
 #define EUI64_INDIVIDUAL(in6)	(!EUI64_GROUP(in6))
 #define EUI64_LOCAL(in6)	((in6)->s6_addr[8] & EUI64_UBIT)
 #define EUI64_UNIVERSAL(in6)	(!EUI64_LOCAL(in6))
 
 /*
  * Same tests for a value already in interface-identifier form: because
  * EUI64_TO_IFID() flipped the u bit, the sense is inverted.
  */
 #define IFID_LOCAL(in6)		(!EUI64_LOCAL(in6))
 #define IFID_UNIVERSAL(in6)	(!EUI64_UNIVERSAL(in6))
 
 /*
  * Last-resort interface identifier for machines without any
  * IEEE802/EUI64 address source.
  * The identifier should be (1) random enough and (2) stable across
  * reboots, so it is derived from MD5(hostname) of the calling thread's
  * prison.
  *
  * in6 - upper 64bits are preserved; the lower 64 bits are overwritten.
  * The ifp argument is not used.  Always returns 0.
  */
 static int
 get_rand_ifid(struct ifnet *ifp, struct in6_addr *in6)
 {
 	MD5_CTX md5;
 	struct prison *pr;
 	u_int8_t hash[16];
 	u_int8_t *ifid;
 	int hostnamelen;
 
 	/* The hostname is read and hashed while holding the prison mutex. */
 	pr = curthread->td_ucred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	hostnamelen = strlen(pr->pr_hostname);
 #if 0
 	/* we need at least several letters as seed for ifid */
 	if (hostnamelen < 3) {
 		mtx_unlock(&pr->pr_mtx);
 		return -1;
 	}
 #endif
 
 	bzero(&md5, sizeof(md5));
 	MD5Init(&md5);
 	MD5Update(&md5, pr->pr_hostname, hostnamelen);
 	mtx_unlock(&pr->pr_mtx);
 	MD5Final(hash, &md5);
 
 	/* The first 8 digest bytes become the identifier (digest is 16). */
 	ifid = &in6->s6_addr[8];
 	bcopy(hash, ifid, 8);
 
 	/* Treat it as an EUI64 that is "individual" (g) and "local" (u)... */
 	ifid[0] &= ~EUI64_GBIT;
 	ifid[0] |= EUI64_UBIT;
 
 	/* ...then convert that EUI64 into an IPv6 interface identifier. */
 	EUI64_TO_IFID(in6);
 
 	return 0;
 }
 
 /*
  * Generate a randomized interface identifier following the procedure
  * cited below from RFC 3041 3.2.1.
  *
  * seed0 - 8-byte history value (in/out).  All-zero means "no history",
  *         in which case a random seed is used instead.  On return it
  *         holds the rightmost 64 bits of the new digest for the next
  *         iteration.
  * seed1 - 8 bytes mixed into the hash (the right-most 64 bits of the
  *         given address).
  * ret   - receives the 8-byte identifier: the leftmost 64 bits of the
  *         digest with EUI64_UBIT cleared in the first byte, never all-zero.
  *
  * Always returns 0.
  */
 static int
 generate_tmp_ifid(u_int8_t *seed0, const u_int8_t *seed1, u_int8_t *ret)
 {
 	MD5_CTX ctxt;
 	u_int8_t seed[16], digest[16], nullbuf[8];
 	u_int32_t val32;
 
 	/* If there's no history, start with a random seed. */
 	bzero(nullbuf, sizeof(nullbuf));
 	if (bcmp(nullbuf, seed0, sizeof(nullbuf)) == 0) {
 		int i;
 
 		for (i = 0; i < 2; i++) {
 			val32 = arc4random();
 			bcopy(&val32, seed + sizeof(val32) * i, sizeof(val32));
 		}
 	} else
 		bcopy(seed0, seed, 8);
 
 	/* copy the right-most 64-bits of the given address */
 	/* XXX assumption on the size of IFID */
 	bcopy(seed1, &seed[8], 8);
 
 	if (0) {		/* for debugging purposes only */
 		int i;
 
 		printf("generate_tmp_ifid: new randomized ID from: ");
 		for (i = 0; i < 16; i++)
 			printf("%02x", seed[i]);
 		printf(" ");
 	}
 
 	/* generate 16 bytes of pseudo-random value. */
 	bzero(&ctxt, sizeof(ctxt));
 	MD5Init(&ctxt);
 	MD5Update(&ctxt, seed, sizeof(seed));
 	MD5Final(digest, &ctxt);
 
 	/*
 	 * RFC 3041 3.2.1. (3)
 	 * Take the left-most 64-bits of the MD5 digest and set bit 6 (the
 	 * left-most bit is numbered 0) to zero.
 	 */
 	bcopy(digest, ret, 8);
 	ret[0] &= ~EUI64_UBIT;
 
 	/*
 	 * XXX: we'd like to ensure that the generated value is not zero
 	 * for simplicity.  If the calculated digest happens to be zero,
 	 * use a random non-zero value as the last resort.
 	 */
 	if (bcmp(nullbuf, ret, sizeof(nullbuf)) == 0) {
 		nd6log((LOG_INFO,
 		    "generate_tmp_ifid: computed MD5 value is zero.\n"));
 
 		val32 = arc4random();
 		val32 = 1 + (val32 % (0xffffffff - 1));
 		/*
 		 * Actually store the fallback; previously it was computed
 		 * and then dropped, leaving the identifier all-zero.  It
 		 * goes into the last four bytes so that ret[0], and with
 		 * it the cleared u bit, stays untouched.
 		 */
 		bcopy(&val32, &ret[4], sizeof(val32));
 	}
 
 	/*
 	 * RFC 3041 3.2.1. (4)
 	 * Take the rightmost 64-bits of the MD5 digest and save them in
 	 * stable storage as the history value to be used in the next
 	 * iteration of the algorithm.
 	 */
 	bcopy(&digest[8], seed0, 8);
 
 	if (0) {		/* for debugging purposes only */
 		int i;
 
 		printf("to: ");
 		for (i = 0; i < 16; i++)
 			printf("%02x", digest[i]);
 		printf("\n");
 	}
 
 	return 0;
 }
 
 /*
  * Get interface identifier for the specified interface.
  * XXX assumes single sockaddr_dl (AF_LINK address) per an interface
  *
  * The first AF_LINK address with a non-zero length is used.  Depending
  * on the interface type it is expanded to, or copied as, an EUI64, which
  * is then converted to an interface identifier.
  *
  * in6 - upper 64bits are preserved
  *
  * Returns 0 on success and -1 when no usable identifier can be derived.
  * Note that some of the -1 paths (the group-bit and all-zero sanity
  * checks) are reached after the lower 64 bits of in6 have already been
  * overwritten.
  */
 int
 in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6)
 {
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	u_int8_t *addr;
 	size_t addrlen;
 	static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
 	static u_int8_t allone[8] =
 		{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
 
 	NET_EPOCH_ASSERT();
 
 	/* Find the first link-level address that actually carries bytes. */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_LINK)
 			continue;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		/* ifa_addr was already dereferenced above, so this is defensive. */
 		if (sdl == NULL)
 			continue;
 		if (sdl->sdl_alen == 0)
 			continue;
 
 		goto found;
 	}
 
 	return -1;
 
 found:
 	addr = LLADDR(sdl);
 	addrlen = sdl->sdl_alen;
 
 	/* get EUI64 */
 	switch (ifp->if_type) {
 	case IFT_BRIDGE:
 	case IFT_ETHER:
 	case IFT_L2VLAN:
 	case IFT_ATM:
 	case IFT_IEEE1394:
 		/* IEEE802/EUI64 cases - what others? */
 		/* IEEE1394 uses 16byte length address starting with EUI64 */
 		if (addrlen > 8)
 			addrlen = 8;
 
 		/* look at IEEE802/EUI64 only */
 		if (addrlen != 8 && addrlen != 6)
 			return -1;
 
 		/*
 		 * check for invalid MAC address - on bsdi, we see it a lot
 		 * since wildboar configures all-zero MAC on pccard before
 		 * card insertion.
 		 */
 		if (bcmp(addr, allzero, addrlen) == 0)
 			return -1;
 		if (bcmp(addr, allone, addrlen) == 0)
 			return -1;
 
 		/* make EUI64 address */
 		if (addrlen == 8)
 			bcopy(addr, &in6->s6_addr[8], 8);
 		else if (addrlen == 6) {
 			/* 48-bit address: insert ff:fe between the two halves. */
 			in6->s6_addr[8] = addr[0];
 			in6->s6_addr[9] = addr[1];
 			in6->s6_addr[10] = addr[2];
 			in6->s6_addr[11] = 0xff;
 			in6->s6_addr[12] = 0xfe;
 			in6->s6_addr[13] = addr[3];
 			in6->s6_addr[14] = addr[4];
 			in6->s6_addr[15] = addr[5];
 		}
 		break;
 
 	case IFT_GIF:
 	case IFT_STF:
 		/*
 		 * RFC2893 says: "SHOULD use IPv4 address as ifid source".
 		 * however, IPv4 address is not very suitable as unique
 		 * identifier source (can be renumbered).
 		 * we don't do this.
 		 */
 		return -1;
 
 	case IFT_INFINIBAND:
 		/* Only 20-byte addresses are accepted; bytes 12..19 are used. */
 		if (addrlen != 20)
 			return -1;
 		bcopy(addr + 12, &in6->s6_addr[8], 8);
 		break;
 
 	default:
 		return -1;
 	}
 
 	/* sanity check: g bit must not indicate "group" */
 	if (EUI64_GROUP(in6))
 		return -1;
 
 	/* convert EUI64 into IPv6 interface identifier */
 	EUI64_TO_IFID(in6);
 
 	/*
 	 * sanity check: ifid must not be all zero, avoid conflict with
 	 * subnet router anycast
 	 */
 	if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 &&
 	    bcmp(&in6->s6_addr[9], allzero, 7) == 0)
 		return -1;
 
 	return 0;
 }
 
 /*
  * Get interface identifier for the specified interface.  If it is not
  * available on ifp0, borrow interface identifier from other information
  * sources.
  *
  * The sources are tried in this order:
  *  1. the hardware address of ifp0 itself;
  *  2. the hardware address of altifp, when one is given;
  *  3. any other interface on the V_ifnet list whose identifier is
  *     IFID_UNIVERSAL;
  *  4. the hostname-derived identifier from get_rand_ifid().
  *
  * ifp0   - interface that needs the identifier
  * altifp - secondary EUI64 source (may be NULL)
  * in6    - lower 64 bits receive the identifier
  *
  * Returns 0 on success and -1 when every source failed.
  */
 static int
 get_ifid(struct ifnet *ifp0, struct ifnet *altifp,
     struct in6_addr *in6)
 {
 	struct ifnet *ifp;
 
 	NET_EPOCH_ASSERT();
 
 	/* first, try to get it from the interface itself */
 	if (in6_get_hw_ifid(ifp0, in6) == 0) {
 		nd6log((LOG_DEBUG, "%s: got interface identifier from itself\n",
 		    if_name(ifp0)));
 		goto success;
 	}
 
 	/* try secondary EUI64 source. this basically is for ATM PVC */
 	if (altifp && in6_get_hw_ifid(altifp, in6) == 0) {
 		nd6log((LOG_DEBUG, "%s: got interface identifier from %s\n",
 		    if_name(ifp0), if_name(altifp)));
 		goto success;
 	}
 
 	/* next, try to get it from some other hardware interface */
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp == ifp0)
 			continue;
 		if (in6_get_hw_ifid(ifp, in6) != 0)
 			continue;
 
 		/*
 		 * to borrow ifid from other interface, ifid needs to be
 		 * globally unique
 		 */
 		if (IFID_UNIVERSAL(in6)) {
 			nd6log((LOG_DEBUG,
 			    "%s: borrow interface identifier from %s\n",
 			    if_name(ifp0), if_name(ifp)));
 			goto success;
 		}
 	}
 
 	/*
 	 * last resort: get from random number source.
 	 * Pass ifp0, the interface being configured: the loop cursor ifp
 	 * is NULL once the list walk above has run to completion.
 	 */
 	if (get_rand_ifid(ifp0, in6) == 0) {
 		nd6log((LOG_DEBUG,
 		    "%s: interface identifier generated by random number\n",
 		    if_name(ifp0)));
 		goto success;
 	}
 
 	printf("%s: failed to get interface identifier\n", if_name(ifp0));
 	return -1;
 
 success:
 	nd6log((LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
 	    if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10],
 	    in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13],
 	    in6->s6_addr[14], in6->s6_addr[15]));
 	return 0;
 }
 
 /*
  * altifp - secondary EUI64 source
  */
 static int
 in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp)
 {
 	struct in6_ifaddr *ia;
 	struct in6_aliasreq ifra;
 	struct nd_prefixctl pr0;
 	struct epoch_tracker et;
 	struct nd_prefix *pr;
 	int error;
 
 	/*
 	 * configure link-local address.
 	 */
 	in6_prepare_ifra(&ifra, NULL, &in6mask64);
 
 	ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000);
 	ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0;
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
 		ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0;
 		ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1);
 	} else {
 		NET_EPOCH_ENTER(et);
 		error = get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr);
 		NET_EPOCH_EXIT(et);
 		if (error != 0) {
 			nd6log((LOG_ERR,
 			    "%s: no ifid available\n", if_name(ifp)));
 			return (-1);
 		}
 	}
 	if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL))
 		return (-1);
 
 	/* link-local addresses should NEVER expire. */
 	ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
 	ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
 
 	/*
 	 * Now call in6_update_ifa() to do a bunch of procedures to configure
 	 * a link-local address. We can set the 3rd argument to NULL, because
 	 * we know there's no other link-local address on the interface
 	 * and therefore we are adding one (instead of updating one).
 	 */
 	if ((error = in6_update_ifa(ifp, &ifra, NULL,
 				    IN6_IFAUPDATE_DADDELAY)) != 0) {
 		/*
 		 * XXX: When the interface does not support IPv6, this call
 		 * would fail in the SIOCSIFADDR ioctl.  I believe the
 		 * notification is rather confusing in this case, so just
 		 * suppress it.  (jinmei@kame.net 20010130)
 		 */
 		if (error != EAFNOSUPPORT)
 			nd6log((LOG_NOTICE, "in6_ifattach_linklocal: failed to "
 			    "configure a link-local address on %s "
 			    "(errno=%d)\n",
 			    if_name(ifp), error));
 		return (-1);
 	}
 
 	NET_EPOCH_ENTER(et);
 	ia = in6ifa_ifpforlinklocal(ifp, 0);
 	NET_EPOCH_EXIT(et);
 	if (ia == NULL) {
 		/*
 		 * Another thread removed the address that we just added.
 		 * This should be rare, but it happens.
 		 */
 		nd6log((LOG_NOTICE, "%s: %s: new link-local address "
 			"disappeared\n", __func__, if_name(ifp)));
 		return (-1);
 	}
 	ifa_free(&ia->ia_ifa);
 
 	/*
 	 * Make the link-local prefix (fe80::%link/64) as on-link.
 	 * Since we'd like to manage prefixes separately from addresses,
 	 * we make an ND6 prefix structure for the link-local prefix,
 	 * and add it to the prefix list as a never-expire prefix.
 	 * XXX: this change might affect some existing code base...
 	 */
 	bzero(&pr0, sizeof(pr0));
 	pr0.ndpr_ifp = ifp;
 	/* this should be 64 at this moment. */
 	pr0.ndpr_plen = in6_mask2len(&ifra.ifra_prefixmask.sin6_addr, NULL);
 	pr0.ndpr_prefix = ifra.ifra_addr;
 	/* apply the mask for safety. (nd6_prelist_add will apply it again) */
 	IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &in6mask64);
 	/*
 	 * Initialize parameters.  The link-local prefix must always be
 	 * on-link, and its lifetimes never expire.
 	 */
 	pr0.ndpr_raf_onlink = 1;
 	pr0.ndpr_raf_auto = 1;	/* probably meaningless */
 	pr0.ndpr_vltime = ND6_INFINITE_LIFETIME;
 	pr0.ndpr_pltime = ND6_INFINITE_LIFETIME;
 	/*
 	 * Since there is no other link-local addresses, nd6_prefix_lookup()
 	 * probably returns NULL.  However, we cannot always expect the result.
 	 * For example, if we first remove the (only) existing link-local
 	 * address, and then reconfigure another one, the prefix is still
 	 * valid with referring to the old link-local address.
 	 */
 	if ((pr = nd6_prefix_lookup(&pr0)) == NULL) {
 		if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0)
 			return (error);
 		/* Reference prefix */
 		ia->ia6_ndpr = pr;
 		pr->ndpr_addrcnt++;
 	} else
 		nd6_prefix_rele(pr);
 
 	return 0;
 }
 
 /*
  * ifp - must be IFT_LOOP
  */
 static int
 in6_ifattach_loopback(struct ifnet *ifp)
 {
 	struct in6_aliasreq ifra;
 	int error;
 
 	in6_prepare_ifra(&ifra, &in6addr_loopback, &in6mask128);
 
 	/*
 	 * Always initialize ia_dstaddr (= broadcast address) to loopback
 	 * address.  Follows IPv4 practice - see in_ifinit().
 	 */
 	ifra.ifra_dstaddr.sin6_len = sizeof(struct sockaddr_in6);
 	ifra.ifra_dstaddr.sin6_family = AF_INET6;
 	ifra.ifra_dstaddr.sin6_addr = in6addr_loopback;
 
 	/* the loopback  address should NEVER expire. */
 	ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
 	ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
 
 	/*
 	 * We are sure that this is a newly assigned address, so we can set
 	 * NULL to the 3rd arg.
 	 */
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, 0)) != 0) {
 		nd6log((LOG_ERR, "in6_ifattach_loopback: failed to configure "
 		    "the loopback address on %s (errno=%d)\n",
 		    if_name(ifp), error));
 		return (-1);
 	}
 
 	return 0;
 }
 
 /*
  * compute NI group address, based on the current hostname setting.
  * see RFC 4620.
  *
  * The first label of the name (up to the first '.', at most 63
  * characters) is lower-cased, prefixed with its one-byte length and
  * hashed with MD5; the leading digest bytes form the low bits of the
  * group address.
  *
  * when ifp == NULL, the caller is responsible for filling scopeid.
  * NOTE(review): in6_setscope() is nevertheless called with ifp
  * unconditionally below -- confirm that it copes with a NULL ifp.
  *
  * If oldmcprefix == 1, FF02:0:0:0:0:2::/96 is used for NI group address
  * while it is FF02:0:0:0:0:2:FF00::/104 in RFC 4620.
  *
  * Returns 0 on success and -1 when no name is available, the first label
  * is empty or longer than 63 characters, or in6_setscope() fails.
  */
 static int
 in6_nigroup0(struct ifnet *ifp, const char *name, int namelen,
     struct in6_addr *in6, int oldmcprefix)
 {
 	struct prison *pr;
 	const char *p;
 	u_char *q;
 	MD5_CTX ctxt;
 	u_int8_t digest[16];
 	char l;
 	char n[64];	/* a single label must not exceed 63 chars */
 
 	/*
 	 * If no name is given and namelen is -1,
 	 * we try to do the hostname lookup ourselves.
 	 */
 	if (!name && namelen == -1) {
 		/*
 		 * pr doubles as the "mutex held" marker: it is non-NULL only
 		 * on this path, and every exit below unlocks when it is set.
 		 */
 		pr = curthread->td_ucred->cr_prison;
 		mtx_lock(&pr->pr_mtx);
 		name = pr->pr_hostname;
 		namelen = strlen(name);
 	} else
 		pr = NULL;
 	if (!name || !namelen) {
 		if (pr != NULL)
 			mtx_unlock(&pr->pr_mtx);
 		return -1;
 	}
 
 	/* Find the end of the first label. */
 	p = name;
 	while (p && *p && *p != '.' && p - name < namelen)
 		p++;
 	if (p == name || p - name > sizeof(n) - 1) {
 		if (pr != NULL)
 			mtx_unlock(&pr->pr_mtx);
 		return -1;	/* label empty or too long */
 	}
 	l = p - name;
 	/* Copy the label out while the hostname is still protected. */
 	strncpy(n, name, l);
 	if (pr != NULL)
 		mtx_unlock(&pr->pr_mtx);
 	n[(int)l] = '\0';
 	/* Lower-case ASCII letters so the hash ignores case. */
 	for (q = n; *q; q++) {
 		if ('A' <= *q && *q <= 'Z')
 			*q = *q - 'A' + 'a';
 	}
 
 	/* generate 16 bytes of pseudo-random value. */
 	bzero(&ctxt, sizeof(ctxt));
 	MD5Init(&ctxt);
 	/* The hash input is the length byte followed by the label. */
 	MD5Update(&ctxt, &l, sizeof(l));
 	MD5Update(&ctxt, n, l);
 	MD5Final(digest, &ctxt);
 
 	bzero(in6, sizeof(*in6));
 	in6->s6_addr16[0] = IPV6_ADDR_INT16_MLL;
 	in6->s6_addr8[11] = 2;
 	if (oldmcprefix == 0) {
 		in6->s6_addr8[12] = 0xff;
 		/* Copy the first 24 bits of 128-bit hash into the address. */
 		bcopy(digest, &in6->s6_addr8[13], 3);
 	} else {
 		/* Copy the first 32 bits of 128-bit hash into the address. */
 		bcopy(digest, &in6->s6_addr32[3], sizeof(in6->s6_addr32[3]));
 	}
 	if (in6_setscope(in6, ifp, NULL))
 		return (-1); /* XXX: should not fail */
 
 	return 0;
 }
 
 /*
  * Compute the NI group address for `name' on `ifp' and store it in `in6'.
  * Thin wrapper around in6_nigroup0() with oldmcprefix == 0; the return
  * value of in6_nigroup0() is passed through unchanged.
  */
 int
 in6_nigroup(struct ifnet *ifp, const char *name, int namelen,
     struct in6_addr *in6)
 {
 
 	return (in6_nigroup0(ifp, name, namelen, in6, 0));
 }
 
 /*
  * Same as in6_nigroup(), but calls in6_nigroup0() with oldmcprefix == 1.
  * The return value of in6_nigroup0() is passed through unchanged.
  */
 int
 in6_nigroup_oldmcprefix(struct ifnet *ifp, const char *name, int namelen,
     struct in6_addr *in6)
 {
 
 	return (in6_nigroup0(ifp, name, namelen, in6, 1));
 }
 
 /*
  * Perform IPv6 attach-time setup on an interface: apply per-type quirks,
  * configure the loopback address on loopback interfaces, assign a
  * link-local address when none exists, and raise V_in6_maxmtu if needed.
  *
  * XXX multiple loopback interface needs more care.  for instance,
  * nodelocal address needs to be configured onto only one of them.
  * XXX multiple link-local address case
  *
  * altifp - secondary EUI64 source
  */
 void
 in6_ifattach(struct ifnet *ifp, struct ifnet *altifp)
 {
 	struct in6_ifaddr *ia;
 
 	/* Nothing to do if the interface has no AF_INET6 per-af data. */
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return;
 
 	/*
 	 * Quirks based on interface type.
 	 *
 	 * 6to4 interface is a very special kind of beast.
 	 * no multicast, no linklocal.  RFC2529 specifies how to make
 	 * linklocals for 6to4 interface, but there's no use and
 	 * it is rather harmful to have one.
 	 */
 	if (ifp->if_type == IFT_STF) {
 		ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL;
 		ND_IFINFO(ifp)->flags |= ND6_IFF_NO_DAD;
 	}
 
 	/* Usually, we require multicast capability on the interface. */
 	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 		nd6log((LOG_INFO, "in6_ifattach: "
 		    "%s is not multicast capable, IPv6 not enabled\n",
 		    if_name(ifp)));
 		return;
 	}
 
 	/*
 	 * On a loopback interface, configure the loopback address unless
 	 * it already exists.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) != 0 &&
 	    in6ifa_ifwithaddr(&in6addr_loopback, 0, false) == NULL)
 		in6_ifattach_loopback(ifp);
 
 	/* Assign a link-local address, if there's none. */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0 &&
 	    (ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) != 0) {
 		struct epoch_tracker et;
 
 		NET_EPOCH_ENTER(et);
 		ia = in6ifa_ifpforlinklocal(ifp, 0);
 		NET_EPOCH_EXIT(et);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		else
 			in6_ifattach_linklocal(ifp, altifp);
 	}
 
 	/* Track the largest MTU seen so far. */
 	if (V_in6_maxmtu < ifp->if_mtu)
 		V_in6_maxmtu = ifp->if_mtu;
 }
 
 /*
  * NOTE: in6_ifdetach() does not support loopback if at this moment.
  *
  * When shutting down a VNET we clean up layers top-down.  In that case
  * upper layer protocols (ulp) are cleaned up already and locks are destroyed
  * and we must not call into these cleanup functions anymore, thus purgeulp
  * is set to 0 in that case by in6_ifdetach_destroy().
  * The normal case of destroying a (cloned) interface still needs to cleanup
  * everything related to the interface and will have purgeulp set to 1.
  */
 static void
 _in6_ifdetach(struct ifnet *ifp, int purgeulp)
 {
 	struct ifaddr *cur, *nxt;
 
 	/* Nothing to tear down without AF_INET6 per-af data. */
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return;
 
 	/* Nuke every IPv6 address configured on the interface. */
 	CK_STAILQ_FOREACH_SAFE(cur, &ifp->if_addrhead, ifa_link, nxt) {
 		if (cur->ifa_addr->sa_family == AF_INET6)
 			in6_purgeaddr(cur);
 	}
 
 	/* Purge the interface from the upper-layer PCB tables on request. */
 	if (purgeulp != 0) {
 		IN6_MULTI_LOCK();
 		in6_pcbpurgeif0(&V_udbinfo, ifp);
 		in6_pcbpurgeif0(&V_ulitecbinfo, ifp);
 		in6_pcbpurgeif0(&V_ripcbinfo, ifp);
 		IN6_MULTI_UNLOCK();
 	}
 
 	/* Leave all multicast groups joined. */
 	in6_purgemaddrs(ifp);
 
 	/*
 	 * Remove neighbor management table.
 	 * Enabling the nd6_purge will panic on vmove for interfaces on VNET
 	 * teardown as the IPv6 layer is cleaned up already and the locks
 	 * are destroyed.
 	 */
 	if (purgeulp != 0)
 		nd6_purge(ifp);
 }
 
 /*
  * Detach IPv6 state from ifp, including the upper-layer protocol and
  * neighbor-table cleanup (calls _in6_ifdetach() with purgeulp == 1).
  */
 void
 in6_ifdetach(struct ifnet *ifp)
 {
 
 	_in6_ifdetach(ifp, 1);
 }
 
 /*
  * Detach IPv6 state from ifp without touching the upper-layer protocols
  * or the neighbor table (calls _in6_ifdetach() with purgeulp == 0).
  */
 void
 in6_ifdetach_destroy(struct ifnet *ifp)
 {
 
 	_in6_ifdetach(ifp, 0);
 }
 
 /*
  * Copy the interface's current 8-byte random interface identifier into
  * retbuf, generating a new one first when `generate' is non-zero or when
  * no identifier has been created yet (randomid is all zeroes).
  * baseid is copied into randomseed1 before a new identifier is generated.
  * Always returns 0.
  */
 int
 in6_get_tmpifid(struct ifnet *ifp, u_int8_t *retbuf,
     const u_int8_t *baseid, int generate)
 {
 	static const u_int8_t zeroid[8];
 	struct nd_ifinfo *ndi;
 	int regen;
 
 	ndi = ND_IFINFO(ifp);
 	regen = generate;
 
 	/* we've never created a random ID.  Create a new one. */
 	if (bcmp(ndi->randomid, zeroid, sizeof(zeroid)) == 0)
 		regen = 1;
 
 	if (regen != 0) {
 		bcopy(baseid, ndi->randomseed1, sizeof(ndi->randomseed1));
 
 		/* generate_tmp_ifid will update seedn and buf */
 		(void)generate_tmp_ifid(ndi->randomseed0, ndi->randomseed1,
 		    ndi->randomid);
 	}
 	bcopy(ndi->randomid, retbuf, 8);
 
 	return (0);
 }
 
 /*
  * Callout handler that regenerates the random interface identifier of
  * every interface that already has one.
  *
  * arg is the vnet the callout belongs to; it is made current for the
  * duration of the call.  The handler re-arms its own callout first.
  */
 void
 in6_tmpaddrtimer(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct epoch_tracker et;
 	struct nd_ifinfo *ndi;
 	u_int8_t nullbuf[8];
 	struct ifnet *ifp;
 
 	/* Re-arm for the next regeneration interval. */
 	callout_reset(&V_in6_tmpaddrtimer_ch,
 	    (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
 	    V_ip6_temp_regen_advance) * hz, in6_tmpaddrtimer, curvnet);
 
 	bzero(nullbuf, sizeof(nullbuf));
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		/* Skip interfaces without AF_INET6 per-af data. */
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 		ndi = ND_IFINFO(ifp);
 		if (bcmp(ndi->randomid, nullbuf, sizeof(nullbuf)) != 0) {
 			/*
 			 * We've been generating a random ID on this interface.
 			 * Create a new one.
 			 */
 			(void)generate_tmp_ifid(ndi->randomseed0,
 			    ndi->randomseed1, ndi->randomid);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Leave all IPv6 multicast groups joined on ifp.
  *
  * mld_ifdetach() collects the affected memberships on a local list under
  * the IN6_MULTI and IN6_MULTI_LIST locks; the list is then handed to the
  * deferred release machinery and the function waits for it to finish.
  */
 static void
 in6_purgemaddrs(struct ifnet *ifp)
 {
 	struct in6_multi_head inmh;
 
 	SLIST_INIT(&inmh);
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	mld_ifdetach(ifp, &inmh);
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	/* Release outside the locks taken above. */
 	in6m_release_list_deferred(&inmh);
 
 	/*
 	 * Make sure all multicast deletions invoking if_ioctl() are
 	 * completed before returning. Else we risk accessing a freed
 	 * ifnet structure pointer.
 	 */
 	in6m_release_wait(NULL);
 }
 
 /*
  * Drain the temporary-address regeneration callout of the current vnet
  * (the counterpart of the callout set up in in6_ifattach_init()).
  */
 void
 in6_ifattach_destroy(void)
 {
 
 	callout_drain(&V_in6_tmpaddrtimer_ch);
 }
 
 /*
  * SYSINIT hook: initialize and arm the callout that periodically runs
  * in6_tmpaddrtimer().  The `dummy' argument is unused.
  */
 static void
 in6_ifattach_init(void *dummy)
 {
 
 	/* Timer for regeneration of the temporary addresses' randomized ID. */
 	callout_init(&V_in6_tmpaddrtimer_ch, 1);
 	callout_reset(&V_in6_tmpaddrtimer_ch,
 	    (V_ip6_temp_preferred_lifetime - V_ip6_desync_factor -
 	    V_ip6_temp_regen_advance) * hz,
 	    in6_tmpaddrtimer, curvnet);
 }
 
 /*
  * Cheat.
  * This must be after route_init(), which is now SI_ORDER_THIRD.
  */
 SYSINIT(in6_ifattach_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
     in6_ifattach_init, NULL);
diff --git a/sys/netinet6/in6_mcast.c b/sys/netinet6/in6_mcast.c
index 04c4f1aa9d93..4a141d984715 100644
--- a/sys/netinet6/in6_mcast.c
+++ b/sys/netinet6/in6_mcast.c
@@ -1,2950 +1,2951 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009 Bruce Simpson.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * IPv6 multicast socket, group, and socket option processing module.
  * Normative references: RFC 2292, RFC 3493, RFC 3542, RFC 3678, RFC 3810.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/udp.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 /*
  * Union overlaying the socket address variants used in this file, so a
  * single object can be viewed as any of them.  The guard macro keeps the
  * union from being declared more than once.
  */
 #ifndef __SOCKUNION_DECLARED
 union sockunion {
 	struct sockaddr_storage	ss;
 	struct sockaddr		sa;
 	struct sockaddr_dl	sdl;
 	struct sockaddr_in6	sin6;
 };
 typedef union sockunion sockunion_t;
 #define __SOCKUNION_DECLARED
 #endif /* __SOCKUNION_DECLARED */
 
 static MALLOC_DEFINE(M_IN6MFILTER, "in6_mfilter",
     "IPv6 multicast PCB-layer source filter");
 MALLOC_DEFINE(M_IP6MADDR, "in6_multi", "IPv6 multicast group");
 static MALLOC_DEFINE(M_IP6MOPTS, "ip6_moptions", "IPv6 multicast options");
 static MALLOC_DEFINE(M_IP6MSOURCE, "ip6_msource",
     "IPv6 multicast MLD-layer source filter");
 
 RB_GENERATE(ip6_msource_tree, ip6_msource, im6s_link, ip6_msource_cmp);
 
 /*
  * Locking:
  * - Lock order is: Giant, IN6_MULTI_LOCK, INP_WLOCK,
  *   IN6_MULTI_LIST_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  * - The IF_ADDR_LOCK is implicitly taken by in6m_lookup() earlier, however
  *   it can be taken by code in net/if.c also.
  * - ip6_moptions and in6_mfilter are covered by the INP_WLOCK.
  *
  * struct in6_multi is covered by IN6_MULTI_LOCK. There isn't strictly
  * any need for in6_multi itself to be virtualized -- it is bound to an ifp
  * anyway no matter what happens.
  */
 struct mtx in6_multi_list_mtx;
 MTX_SYSINIT(in6_multi_mtx, &in6_multi_list_mtx, "in6_multi_list_mtx", MTX_DEF);
 
 struct mtx in6_multi_free_mtx;
 MTX_SYSINIT(in6_multi_free_mtx, &in6_multi_free_mtx, "in6_multi_free_mtx", MTX_DEF);
 
 struct sx in6_multi_sx;
 SX_SYSINIT(in6_multi_sx, &in6_multi_sx, "in6_multi_sx");
 
 static void	im6f_commit(struct in6_mfilter *);
 static int	im6f_get_source(struct in6_mfilter *imf,
 		    const struct sockaddr_in6 *psin,
 		    struct in6_msource **);
 static struct in6_msource *
 		im6f_graft(struct in6_mfilter *, const uint8_t,
 		    const struct sockaddr_in6 *);
 static void	im6f_leave(struct in6_mfilter *);
 static int	im6f_prune(struct in6_mfilter *, const struct sockaddr_in6 *);
 static void	im6f_purge(struct in6_mfilter *);
 static void	im6f_rollback(struct in6_mfilter *);
 static void	im6f_reap(struct in6_mfilter *);
 static struct in6_mfilter *
 		im6o_match_group(const struct ip6_moptions *,
 		    const struct ifnet *, const struct sockaddr *);
 static struct in6_msource *
 		im6o_match_source(struct in6_mfilter *, const struct sockaddr *);
 static void	im6s_merge(struct ip6_msource *ims,
 		    const struct in6_msource *lims, const int rollback);
 static int	in6_getmulti(struct ifnet *, const struct in6_addr *,
 		    struct in6_multi **);
 static int	in6_joingroup_locked(struct ifnet *, const struct in6_addr *,
 		    struct in6_mfilter *, struct in6_multi **, int);
 static int	in6m_get_source(struct in6_multi *inm,
 		    const struct in6_addr *addr, const int noalloc,
 		    struct ip6_msource **pims);
 #ifdef KTR
 static int	in6m_is_ifp_detached(const struct in6_multi *);
 #endif
 static int	in6m_merge(struct in6_multi *, /*const*/ struct in6_mfilter *);
 static void	in6m_purge(struct in6_multi *);
 static void	in6m_reap(struct in6_multi *);
 static struct ip6_moptions *
 		in6p_findmoptions(struct inpcb *);
 static int	in6p_get_source_filters(struct inpcb *, struct sockopt *);
 static int	in6p_join_group(struct inpcb *, struct sockopt *);
 static int	in6p_leave_group(struct inpcb *, struct sockopt *);
 static struct ifnet *
 		in6p_lookup_mcast_ifp(const struct inpcb *,
 		    const struct sockaddr_in6 *);
 static int	in6p_block_unblock_source(struct inpcb *, struct sockopt *);
 static int	in6p_set_multicast_if(struct inpcb *, struct sockopt *);
 static int	in6p_set_source_filters(struct inpcb *, struct sockopt *);
 static int	sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_DECL(_net_inet6_ip6);	/* XXX Not in any common header. */
 
 static SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, mcast,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv6 multicast");
 
 static u_long in6_mcast_maxgrpsrc = IPV6_MAX_GROUP_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxgrpsrc,
     CTLFLAG_RWTUN, &in6_mcast_maxgrpsrc, 0,
     "Max source filters per group");
 
 static u_long in6_mcast_maxsocksrc = IPV6_MAX_SOCK_SRC_FILTER;
 SYSCTL_ULONG(_net_inet6_ip6_mcast, OID_AUTO, maxsocksrc,
     CTLFLAG_RWTUN, &in6_mcast_maxsocksrc, 0,
     "Max source filters per socket");
 
 /* TODO Virtualize this switch. */
 int in6_mcast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
 SYSCTL_INT(_net_inet6_ip6_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
     &in6_mcast_loop, 0, "Loopback multicast datagrams by default");
 
 static SYSCTL_NODE(_net_inet6_ip6_mcast, OID_AUTO, filters,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip6_mcast_filters,
     "Per-interface stack-wide source filters");
 
 #ifdef KTR
 /*
  * Inline function which wraps assertions for a valid ifp.
  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
  * is detached.
  */
 static int __inline
 in6m_is_ifp_detached(const struct in6_multi *inm)
 {
 	struct ifnet *ll_ifp;
 
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 
 	/* A NULL link-layer ifp means the interface has been detached. */
 	ll_ifp = inm->in6m_ifma->ifma_ifp;
 	if (ll_ifp == NULL)
 		return (1);
 
 	/*
 	 * Sanity check that network-layer notion of ifp is the
 	 * same as that of link-layer.
 	 */
 	KASSERT(inm->in6m_ifp == ll_ifp, ("%s: bad ifp", __func__));
 
 	return (0);
 }
 #endif
 
 /*
  * Initialize an in6_mfilter structure to a known state at t0, t1
  * with an empty source filter list.
  */
 static __inline void
 im6f_init(struct in6_mfilter *imf, const int st0, const int st1)
 {
 	/* Start from an all-zero filter with an empty source tree. */
 	memset(imf, 0, sizeof(*imf));
 	RB_INIT(&imf->im6f_sources);
 
 	/* Record the requested filter modes at t0 and t1. */
 	imf->im6f_st[0] = st0;
 	imf->im6f_st[1] = st1;
 }
 
 /*
  * Allocate an in6_mfilter with the given malloc flags and initialize it
  * to filter modes st0/st1 with an empty source list.
  * Returns NULL if the allocation fails.
  */
 struct in6_mfilter *
 ip6_mfilter_alloc(const int mflags, const int st0, const int st1)
 {
 	struct in6_mfilter *filter;
 
 	filter = malloc(sizeof(struct in6_mfilter), M_IN6MFILTER, mflags);
 	if (filter == NULL)
 		return (NULL);
 
 	im6f_init(filter, st0, st1);
 	return (filter);
 }
 
 /*
  * Purge the filter's source entries via im6f_purge() and then free the
  * in6_mfilter itself.
  */
 void
 ip6_mfilter_free(struct in6_mfilter *imf)
 {
 
 	im6f_purge(imf);
 	free(imf, M_IN6MFILTER);
 }
 
 /*
  * Find the multicast filter entry in this ip6_moptions instance whose
  * group matches the specified group address and, when ifp is not NULL,
  * whose interface matches ifp.
  * Return a pointer to the matching in6_mfilter, or the iterator's final
  * value (which callers compare against NULL) if nothing matched.
  */
 static struct in6_mfilter *
 im6o_match_group(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group)
 {
 	const struct sockaddr_in6 *want;
 	struct in6_mfilter *imf;
 	struct in6_multi *inm;
 
 	want = (const struct sockaddr_in6 *)group;
 
 	IP6_MFILTER_FOREACH(imf, &imo->im6o_head) {
 		inm = imf->im6f_in6m;
 		/* Entries without an attached group cannot match. */
 		if (inm == NULL)
 			continue;
 		/* A NULL ifp acts as a wildcard. */
 		if (ifp != NULL && inm->in6m_ifp != ifp)
 			continue;
 		if (IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &want->sin6_addr))
 			return (imf);
 	}
 	return (imf);
 }
 
 /*
  * Find an IPv6 multicast source entry for this imo which matches
  * the given group index for this socket, and source address.
  *
  * XXX TODO: The scope ID, if present in src, is stripped before
  * any comparison. We SHOULD enforce scope/zone checks where the source
  * filter entry has a link scope.
  *
  * NOTE: This does not check if the entry is in-mode, merely if
  * it exists, which may not be the desired behaviour.
  */
 static struct in6_msource *
 im6o_match_source(struct in6_mfilter *imf, const struct sockaddr *src)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	const sockunion_t	*psa;
 
 	KASSERT(src->sa_family == AF_INET6, ("%s: !AF_INET6", __func__));
 
 	psa = (const sockunion_t *)src;
 	find.im6s_addr = psa->sin6.sin6_addr;
 	in6_clearscope(&find.im6s_addr);		/* XXX */
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 
 	return ((struct in6_msource *)ims);
 }
 
 /*
  * Perform filtering for multicast datagrams on a socket by group and source.
  *
  * Returns 0 if a datagram should be allowed through, or various error codes
  * if the socket was not a member of the group, or the source was muted, etc.
  */
 int
 im6o_mc_filter(const struct ip6_moptions *imo, const struct ifnet *ifp,
     const struct sockaddr *group, const struct sockaddr *src)
 {
 	struct in6_mfilter *filter;
 	struct in6_msource *entry;
 	int fmode;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 
 	filter = im6o_match_group(imo, ifp, group);
 	if (filter == NULL)
 		return (MCAST_NOTGMEMBER);
 
 	/*
 	 * Check if the source was included in an (S,G) join.
 	 * Allow reception on exclusive memberships by default,
 	 * reject reception on inclusive memberships by default.
 	 * Exclude source only if an in-mode exclude filter exists.
 	 * Include source only if an in-mode include filter exists.
 	 * NOTE: We are comparing group state here at MLD t1 (now)
 	 * with socket-layer t0 (since last downcall).
 	 */
 	fmode = filter->im6f_st[1];
 	entry = im6o_match_source(filter, src);
 
 	if (entry == NULL) {
 		/* No per-source entry: only INCLUDE mode rejects. */
 		if (fmode == MCAST_INCLUDE)
 			return (MCAST_NOTSMEMBER);
 	} else if (entry->im6sl_st[0] != fmode) {
 		/* Entry exists but is not in the group's current mode. */
 		return (MCAST_NOTSMEMBER);
 	}
 
 	return (MCAST_PASS);
 }
 
 /*
  * Look up an in6_multi record for an IPv6 multicast address
  * on the interface ifp.
  * If no record is found, return NULL.
  *
  * SMPng: The IN6_MULTI_LOCK must be held and the caller must be in the
  * network epoch.
  */
 struct in6_multi *
 in6m_lookup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr)
 {
 	struct ifmultiaddr *ifma;
 	struct in6_multi *found;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		found = in6m_ifmultiaddr_get_inm(ifma);
 		if (found != NULL &&
 		    IN6_ARE_ADDR_EQUAL(&found->in6m_addr, mcaddr))
 			return (found);
 	}
 	return (NULL);
 }
 
 /*
  * Find and return a reference to an in6_multi record for (ifp, group),
  * and bump its reference count.
  * If one does not exist, try to allocate it, and update link-layer multicast
  * filters on ifp to listen for group.
  * Assumes the IN6_MULTI lock is held across the call.
  * Return 0 if successful, otherwise return an appropriate error code.
  */
 static int
 in6_getmulti(struct ifnet *ifp, const struct in6_addr *group,
     struct in6_multi **pinm)
 {
 	struct epoch_tracker	 et;
 	struct sockaddr_in6	 gsin6;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 error;
 
 	error = 0;
 
 	/*
 	 * XXX: Accesses to ifma_protospec must be covered by IF_ADDR_LOCK;
 	 * if_addmulti() takes this mutex itself, so we must drop and
 	 * re-acquire around the call.
 	 */
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 	NET_EPOCH_ENTER(et);
 	/*
 	 * Does ifp support IPv6 multicasts?
 	 */
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		error = ENODEV;
 	else
 		inm = in6m_lookup_locked(ifp, group);
 	NET_EPOCH_EXIT(et);
 
 	/* inm is only assigned (and only read below) when error == 0. */
 	if (error != 0)
 		goto out_locked;
 
 	if (inm != NULL) {
 		/*
 		 * If we already joined this group, just bump the
 		 * refcount and return it.
 		 */
 		KASSERT(inm->in6m_refcount >= 1,
 		    ("%s: bad refcount %d", __func__, inm->in6m_refcount));
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	/* Build the sockaddr form of the group for if_addmulti(). */
 	memset(&gsin6, 0, sizeof(gsin6));
 	gsin6.sin6_family = AF_INET6;
 	gsin6.sin6_len = sizeof(struct sockaddr_in6);
 	gsin6.sin6_addr = *group;
 
 	/*
 	 * Check if a link-layer group is already associated
 	 * with this network-layer group on the given ifnet.
 	 */
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	error = if_addmulti(ifp, (struct sockaddr *)&gsin6, &ifma);
 	/* Both locks are dropped here, so a plain return is correct. */
 	if (error != 0)
 		return (error);
 	IN6_MULTI_LIST_LOCK();
 	IF_ADDR_WLOCK(ifp);
 
 	/*
 	 * If something other than netinet6 is occupying the link-layer
 	 * group, print a meaningful error message and back out of
 	 * the allocation.
 	 * Otherwise, bump the refcount on the existing network-layer
 	 * group association and return it.
 	 */
 	if (ifma->ifma_protospec != NULL) {
 		inm = (struct in6_multi *)ifma->ifma_protospec;
 #ifdef INVARIANTS
 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
 		    __func__));
 		KASSERT(ifma->ifma_addr->sa_family == AF_INET6,
 		    ("%s: ifma not AF_INET6", __func__));
 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
 		if (inm->in6m_ifma != ifma || inm->in6m_ifp != ifp ||
 		    !IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, group))
 			panic("%s: ifma %p is inconsistent with %p (%p)",
 			    __func__, ifma, inm, group);
 #endif
 		in6m_acquire_locked(inm);
 		*pinm = inm;
 		goto out_locked;
 	}
 
 	IF_ADDR_WLOCK_ASSERT(ifp);
 
 	/*
 	 * A new in6_multi record is needed; allocate and initialize it.
 	 * We DO NOT perform an MLD join as the in6_ layer may need to
 	 * push an initial source list down to MLD to support SSM.
 	 *
 	 * The initial source filter state is INCLUDE, {} as per the RFC.
 	 * Pending state-changes per group are subject to a bounds check.
 	 */
 	inm = malloc(sizeof(*inm), M_IP6MADDR, M_NOWAIT | M_ZERO);
 	if (inm == NULL) {
 		/* Undo the if_addmulti() above after dropping the locks. */
 		IN6_MULTI_LIST_UNLOCK();
 		IF_ADDR_WUNLOCK(ifp);
 		if_delmulti_ifma(ifma);
 		return (ENOMEM);
 	}
 	inm->in6m_addr = *group;
 	inm->in6m_ifp = ifp;
 	inm->in6m_mli = MLD_IFINFO(ifp);
 	inm->in6m_ifma = ifma;
 	inm->in6m_refcount = 1;
 	inm->in6m_state = MLD_NOT_MEMBER;
 	mbufq_init(&inm->in6m_scq, MLD_MAX_STATE_CHANGES);
 
 	inm->in6m_st[0].iss_fmode = MCAST_UNDEFINED;
 	inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	RB_INIT(&inm->in6m_srcs);
 
 	/* Link the new record to its link-layer membership. */
 	ifma->ifma_protospec = inm;
 	*pinm = inm;
 
  out_locked:
 	IN6_MULTI_LIST_UNLOCK();
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 /*
  * Drop a reference to an in6_multi record.
  *
  * If the refcount drops to 0, free the in6_multi record and
  * delete the underlying link-layer membership.
  */
 static void
 in6m_release(struct in6_multi *inm)
 {
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 
 	CTR2(KTR_MLD, "%s: refcount is %d", __func__, inm->in6m_refcount);
 
 	/* Only records whose last reference is already gone get here. */
 	MPASS(inm->in6m_refcount == 0);
 	CTR2(KTR_MLD, "%s: freeing inm %p", __func__, inm);
 
 	/* Save what is needed from inm before it is freed below. */
 	ifma = inm->in6m_ifma;
 	ifp = inm->in6m_ifp;
 	MPASS(ifma->ifma_llifma == NULL);
 
 	/* XXX this access is not covered by IF_ADDR_LOCK */
 	CTR2(KTR_MLD, "%s: purging ifma %p", __func__, ifma);
 	KASSERT(ifma->ifma_protospec == NULL,
 	    ("%s: ifma_protospec != NULL", __func__));
 	/* Fall back to the link-layer notion of the interface. */
 	if (ifp == NULL)
 		ifp = ifma->ifma_ifp;
 
 	if (ifp != NULL) {
 		/* Run the teardown in the interface's vnet context. */
 		CURVNET_SET(ifp->if_vnet);
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 		CURVNET_RESTORE();
 		/*
 		 * NOTE(review): presumably balances the if_ref() taken in
 		 * in6m_disconnect_locked() -- confirm.
 		 */
 		if_rele(ifp);
 	} else {
 		in6m_purge(inm);
 		free(inm, M_IP6MADDR);
 		if_delmulti_ifma_flags(ifma, 1);
 	}
 }
 
 /*
  * Interface detach can happen in a taskqueue thread context, so we must use a
  * dedicated thread to avoid deadlocks when draining in6m_release tasks.
  */
 TASKQUEUE_DEFINE_THREAD(in6m_free);
 static struct in6_multi_head in6m_free_list = SLIST_HEAD_INITIALIZER();
 static void in6m_release_task(void *arg __unused, int pending __unused);
 static struct task in6m_free_task = TASK_INITIALIZER(0, in6m_release_task, NULL);
 
 /*
  * Move every entry of inmh onto the global free list and schedule the
  * release task on the in6m_free taskqueue.  An empty list is a no-op.
  */
 void
 in6m_release_list_deferred(struct in6_multi_head *inmh)
 {
 	if (!SLIST_EMPTY(inmh)) {
 		/* The global free list is protected by in6_multi_free_mtx. */
 		mtx_lock(&in6_multi_free_mtx);
 		SLIST_CONCAT(&in6m_free_list, inmh, in6_multi, in6m_nrele);
 		mtx_unlock(&in6_multi_free_mtx);
 
 		taskqueue_enqueue(taskqueue_in6m_free, &in6m_free_task);
 	}
 }
 
 /*
  * Block until every task queued on the in6m_free taskqueue has run.
  * The argument is unused; it exists so the function can also be used as
  * a VNET_SYSUNINIT hook (see below).
  */
 void
 in6m_release_wait(void *arg __unused)
 {
 
 	/*
 	 * Make sure all pending multicast addresses are freed before
 	 * the VNET or network device is destroyed:
 	 */
 	taskqueue_drain_all(taskqueue_in6m_free);
 }
 #ifdef VIMAGE
 /* XXX-BZ FIXME, see D24914. */
 VNET_SYSUNINIT(in6m_release_wait, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in6m_release_wait, NULL);
 #endif
 
 /*
  * Disconnect an in6_multi record from its interface.
  *
  * Unlinks the record's ifmultiaddr (and, if its refcount drops to zero,
  * the associated link-layer ifmultiaddr) from the interface's multicast
  * list, and drops every per-address membership that refers to inm,
  * releasing one reference per membership through in6m_rele_locked().
  *
  * inmh is passed through to in6m_rele_locked().
  * Requires the IN6_MULTI_LIST lock and the interface address write lock.
  */
 void
 in6m_disconnect_locked(struct in6_multi_head *inmh, struct in6_multi *inm)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ifa6;
 	struct in6_multi_mship *imm, *imm_tmp;
 	struct ifmultiaddr *ifma, *ll_ifma;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	ifp = inm->in6m_ifp;
 	if (ifp == NULL)
 		return;		/* already called */
 
 	/* Mark the record as disconnected so a second call is a no-op. */
 	inm->in6m_ifp = NULL;
 	IF_ADDR_WLOCK_ASSERT(ifp);
 	ifma = inm->in6m_ifma;
 	if (ifma == NULL)
 		return;
 
 	/* Hold the interface; in6m_release() calls if_rele() later. */
 	if_ref(ifp);
 	if (ifma->ifma_flags & IFMA_F_ENQUEUED) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	MCDPRINTF("removed ifma: %p from %s\n", ifma, ifp->if_xname);
 	/* Detach the link-layer membership, freeing it on last reference. */
 	if ((ll_ifma = ifma->ifma_llifma) != NULL) {
 		MPASS(ifma != ll_ifma);
 		ifma->ifma_llifma = NULL;
 		MPASS(ll_ifma->ifma_llifma == NULL);
 		MPASS(ll_ifma->ifma_ifp == ifp);
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 				CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr, ifma_link);
 				ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 			}
 			MCDPRINTF("removed ll_ifma: %p from %s\n", ll_ifma, ifp->if_xname);
 			if_freemulti(ll_ifma);
 		}
 	}
 	/* Drop memberships on the interface's IPv6 addresses that use inm. */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ifa6 = (void *)ifa;
 		LIST_FOREACH_SAFE(imm, &ifa6->ia6_memberships,
 		    i6mm_chain, imm_tmp) {
 			if (inm == imm->i6mm_maddr) {
 				LIST_REMOVE(imm, i6mm_chain);
 				free(imm, M_IP6MADDR);
 				in6m_rele_locked(inmh, inm);
 			}
 		}
 	}
 }
 
 /*
  * Taskqueue handler: take over the current contents of the global free
  * list and release every entry under the IN6_MULTI lock.
  */
 static void
 in6m_release_task(void *arg __unused, int pending __unused)
 {
 	struct in6_multi_head batch;
 	struct in6_multi *inm;
 
 	/* Steal the pending entries while holding the free-list mutex. */
 	SLIST_INIT(&batch);
 	mtx_lock(&in6_multi_free_mtx);
 	SLIST_CONCAT(&batch, &in6m_free_list, in6_multi, in6m_nrele);
 	mtx_unlock(&in6_multi_free_mtx);
 
 	IN6_MULTI_LOCK();
 	/* Pop each entry off the private list before releasing it. */
 	while ((inm = SLIST_FIRST(&batch)) != NULL) {
 		SLIST_REMOVE_HEAD(&batch, in6m_nrele);
 		in6m_release(inm);
 	}
 	IN6_MULTI_UNLOCK();
 }
 
 /*
  * Clear recorded source entries for a group.
  * Used by the MLD code. Caller must hold the IN6_MULTI lock.
  * FIXME: Should reap.
  */
 void
 in6m_clear_recorded(struct in6_multi *inm)
 {
 	struct ip6_msource	*src;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/* Un-mark every recorded source and keep iss_rec in step. */
 	RB_FOREACH(src, ip6_msource_tree, &inm->in6m_srcs) {
 		if (src->im6s_stp == 0)
 			continue;
 		src->im6s_stp = 0;
 		--inm->in6m_st[1].iss_rec;
 	}
 
 	KASSERT(inm->in6m_st[1].iss_rec == 0,
 	    ("%s: iss_rec %d not 0", __func__, inm->in6m_st[1].iss_rec));
 }
 
 /*
  * Record a source as pending for a Source-Group MLDv2 query.
  * This lives here as it modifies the shared tree.
  *
  * inm is the group descriptor.
  * addr is the address of the source to record in network-byte order.
  *
  * If the net.inet6.mld.sgalloc sysctl is non-zero, we will
  * lazy-allocate a source node in response to an SG query.
  * Otherwise, no allocation is performed. This saves some memory
  * with the trade-off that the source will not be reported to the
  * router if joined in the window between the query response and
  * the group actually being joined on the local host.
  *
  * VIMAGE: XXX: Currently the mld_sgalloc feature has been removed.
  * This turns off the allocation of a recorded source entry if
  * the group has not been joined.
  *
  * Return 0 if the source didn't exist or was already marked as recorded.
  * Return 1 if the source was marked as recorded by this function.
  * Return <0 if any error occurred (negated errno code).
  */
 int
 in6m_record_source(struct in6_multi *inm, const struct in6_addr *addr)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims && ims->im6s_stp)
 		return (0);
 	if (ims == NULL) {
 		/*
 		 * The limit is a run-time writable sysctl, so the current
 		 * count may already exceed it; compare with >= so that the
 		 * limit is still enforced after it has been lowered.
 		 */
 		if (inm->in6m_nsrc >= in6_mcast_maxgrpsrc)
 			return (-ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (-ENOMEM);
 		nims->im6s_addr = find.im6s_addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 	}
 
 	/*
 	 * Mark the source as recorded and update the recorded
 	 * source count.
 	 */
 	++ims->im6s_stp;
 	++inm->in6m_st[1].iss_rec;
 
 	return (1);
 }
 
 /*
  * Look up the in6_msource owned by an in6_mfilter for a given source
  * address, and hand it back through *plims.
  * Lazy-allocate if needed. If this is a new entry its filter state is
  * undefined at t0.
  *
  * imf is the filter set being modified.
  * psin holds the source address.
  * plims receives the entry; it is written only on success.
  *
  * Return 0 on success, ENOSPC if the per-socket source limit has been
  * reached, or ENOMEM if the allocation failed.
  *
  * SMPng: May be called with locks held; malloc must not block.
  */
 static int
 im6f_get_source(struct in6_mfilter *imf, const struct sockaddr_in6 *psin,
     struct in6_msource **plims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*nims;
 	struct in6_msource	*lims;
 
 	find.im6s_addr = psin->sin6_addr;
 	lims = (struct in6_msource *)RB_FIND(ip6_msource_tree,
 	    &imf->im6f_sources, &find);
 	if (lims == NULL) {
 		/*
 		 * The limit is a run-time writable sysctl, so the current
 		 * count may already exceed it; compare with >= so that the
 		 * limit is still enforced after it has been lowered.
 		 */
 		if (imf->im6f_nsrc >= in6_mcast_maxsocksrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		lims = (struct in6_msource *)nims;
 		lims->im6s_addr = find.im6s_addr;
 		lims->im6sl_st[0] = MCAST_UNDEFINED;
 		RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 		++imf->im6f_nsrc;
 	}
 
 	*plims = lims;
 
 	return (0);
 }
 
 /*
  * Graft a source entry into an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being in the new filter mode at t1.
  *
  * Return the pointer to the new node, otherwise return NULL.
  */
 static struct in6_msource *
 im6f_graft(struct in6_mfilter *imf, const uint8_t st1,
     const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	*nims;
 	struct in6_msource	*lims;
 
 	nims = malloc(sizeof(struct in6_msource), M_IN6MFILTER,
 	    M_NOWAIT | M_ZERO);
 	if (nims == NULL)
 		return (NULL);
 	lims = (struct in6_msource *)nims;
 	lims->im6s_addr = psin->sin6_addr;
 	lims->im6sl_st[0] = MCAST_UNDEFINED;
 	lims->im6sl_st[1] = st1;
 	RB_INSERT(ip6_msource_tree, &imf->im6f_sources, nims);
 	++imf->im6f_nsrc;
 
 	return (lims);
 }
 
 /*
  * Prune a source entry from an existing socket-layer filter set,
  * maintaining any required invariants and checking allocations.
  *
  * The source is marked as being left at t1, it is not freed.
  *
  * Return 0 if no error occurred, otherwise return an errno value.
  */
 static int
 im6f_prune(struct in6_mfilter *imf, const struct sockaddr_in6 *psin)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	find.im6s_addr = psin->sin6_addr;
 	ims = RB_FIND(ip6_msource_tree, &imf->im6f_sources, &find);
 	if (ims == NULL)
 		return (ENOENT);
 	lims = (struct in6_msource *)ims;
 	lims->im6sl_st[1] = MCAST_UNDEFINED;
 	return (0);
 }
 
 /*
  * Undo every pending (t1) change in a socket-layer filter set so that
  * it matches the t0 state again.
  *
  * Sources that only exist at t1 are unlinked and freed; sources whose
  * mode changed at t1 have their t0 mode restored.  The filter mode of
  * imf itself is reverted last.
  */
 static void
 im6f_rollback(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*node, *next;
 	struct in6_msource	*src;
 
 	RB_FOREACH_SAFE(node, ip6_msource_tree, &imf->im6f_sources, next) {
 		src = (struct in6_msource *)node;
 
 		/* Nothing pending for this source. */
 		if (src->im6sl_st[0] == src->im6sl_st[1])
 			continue;
 
 		if (src->im6sl_st[0] == MCAST_UNDEFINED) {
 			/* Source was introduced at t1: drop it entirely. */
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, node);
 			RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, node);
 			free(node, M_IN6MFILTER);
 			imf->im6f_nsrc--;
 		} else {
 			/* Source existed at t0: restore its old mode. */
 			src->im6sl_st[1] = src->im6sl_st[0];
 		}
 	}
 
 	imf->im6f_st[1] = imf->im6f_st[0];
 }
 
 /*
  * Mark socket-layer filter set as INCLUDE {} at t1.
  */
 static void
 im6f_leave(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		lims->im6sl_st[1] = MCAST_UNDEFINED;
 	}
 	imf->im6f_st[1] = MCAST_INCLUDE;
 }
 
 /*
  * Commit the pending state of a socket-layer filter set: the t1 mode
  * of every source, and of the filter itself, is copied over t0.
  */
 static void
 im6f_commit(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*node;
 	struct in6_msource	*src;
 
 	RB_FOREACH(node, ip6_msource_tree, &imf->im6f_sources) {
 		src = (struct in6_msource *)node;
 		src->im6sl_st[0] = src->im6sl_st[1];
 	}
 
 	imf->im6f_st[0] = imf->im6f_st[1];
 }
 
 /*
  * Free the sources of a socket-layer filter set that are no longer
  * referenced, i.e. those that are MCAST_UNDEFINED at both t0 and t1.
  * The filter's source count is decremented for each node released.
  */
 static void
 im6f_reap(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*node, *next;
 	struct in6_msource	*src;
 
 	RB_FOREACH_SAFE(node, ip6_msource_tree, &imf->im6f_sources, next) {
 		src = (struct in6_msource *)node;
 
 		/* Still defined at t0 or t1: keep it. */
 		if (src->im6sl_st[0] != MCAST_UNDEFINED ||
 		    src->im6sl_st[1] != MCAST_UNDEFINED)
 			continue;
 
 		CTR2(KTR_MLD, "%s: free lims %p", __func__, node);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, node);
 		free(node, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 }
 
 /*
  * Purge socket-layer filter set.
  */
 static void
 im6f_purge(struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *tims;
 
 	RB_FOREACH_SAFE(ims, ip6_msource_tree, &imf->im6f_sources, tims) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, ims);
 		RB_REMOVE(ip6_msource_tree, &imf->im6f_sources, ims);
 		free(ims, M_IN6MFILTER);
 		imf->im6f_nsrc--;
 	}
 	imf->im6f_st[0] = imf->im6f_st[1] = MCAST_UNDEFINED;
 	KASSERT(RB_EMPTY(&imf->im6f_sources),
 	    ("%s: im6f_sources not empty", __func__));
 }
 
 /*
  * Look up a source filter entry for a multicast group.
  *
  * inm is the group descriptor to work with.
  * addr is the IPv6 address to look up.
  * noalloc may be non-zero to suppress allocation of sources.
  * *pims will be set to the address of the retrieved or allocated source.
  *
  * SMPng: NOTE: may be called with locks held.
  * Return 0 if successful, otherwise return a non-zero error code.
  */
 static int
 in6m_get_source(struct in6_multi *inm, const struct in6_addr *addr,
     const int noalloc, struct ip6_msource **pims)
 {
 	struct ip6_msource	 find;
 	struct ip6_msource	*ims, *nims;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	find.im6s_addr = *addr;
 	ims = RB_FIND(ip6_msource_tree, &inm->in6m_srcs, &find);
 	if (ims == NULL && !noalloc) {
 		if (inm->in6m_nsrc == in6_mcast_maxgrpsrc)
 			return (ENOSPC);
 		nims = malloc(sizeof(struct ip6_msource), M_IP6MSOURCE,
 		    M_NOWAIT | M_ZERO);
 		if (nims == NULL)
 			return (ENOMEM);
 		nims->im6s_addr = *addr;
 		RB_INSERT(ip6_msource_tree, &inm->in6m_srcs, nims);
 		++inm->in6m_nsrc;
 		ims = nims;
 		CTR3(KTR_MLD, "%s: allocated %s as %p", __func__,
 		    ip6_sprintf(ip6tbuf, addr), ims);
 	}
 
 	*pims = ims;
 	return (0);
 }
 
 /*
  * Fold one socket-layer source (lims) into the t1 counters of the
  * matching MLD-layer source (ims).
  *
  * The contribution of the socket's t0 mode is subtracted and that of
  * its t1 mode is added.  When rollback is non-zero the signs are
  * flipped, which undoes a previous merge of the same lims.
  */
 static void
 im6s_merge(struct ip6_msource *ims, const struct in6_msource *lims,
     const int rollback)
 {
 	int n;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 
 	ip6_sprintf(ip6tbuf, &lims->im6s_addr);
 #endif
 
 	n = 1;
 	if (rollback)
 		n = -1;
 
 	/* Remove what this socket contributed at t0. */
 	switch (lims->im6sl_st[0]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_MLD, "%s: t1 ex -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex -= n;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_MLD, "%s: t1 in -= %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in -= n;
 		break;
 	default:
 		break;
 	}
 
 	/* Add what this socket contributes at t1. */
 	switch (lims->im6sl_st[1]) {
 	case MCAST_EXCLUDE:
 		CTR3(KTR_MLD, "%s: t1 ex += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].ex += n;
 		break;
 	case MCAST_INCLUDE:
 		CTR3(KTR_MLD, "%s: t1 in += %d on %s", __func__, n, ip6tbuf);
 		ims->im6s_st[1].in += n;
 		break;
 	default:
 		break;
 	}
 }
 
 /*
  * Atomically update the global in6_multi state, when a membership's
  * filter list is being updated in any way.
  *
  * imf is the per-inpcb-membership group filter pointer.
  * A fake imf may be passed for in-kernel consumers.
  *
  * XXX This is a candidate for a set-symmetric-difference style loop
  * which would eliminate the repeated lookup from root of ims nodes,
  * as they share the same key space.
  *
  * If any error occurred this function will back out of refcounts
  * and return a non-zero value.
  */
 static int
 in6m_merge(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct ip6_msource	*ims, *nims;
 	struct in6_msource	*lims;
 	int			 schanged, error;
 	int			 nsrc0, nsrc1;
 
 	schanged = 0;
 	error = 0;
 	nsrc1 = nsrc0 = 0;
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Update the source filters first, as this may fail.
 	 * Maintain count of in-mode filters at t0, t1. These are
 	 * used to work out if we transition into ASM mode or not.
 	 * Maintain a count of source filters whose state was
 	 * actually modified by this operation.
 	 */
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == imf->im6f_st[0]) nsrc0++;
 		if (lims->im6sl_st[1] == imf->im6f_st[1]) nsrc1++;
 		if (lims->im6sl_st[0] == lims->im6sl_st[1]) continue;
 		error = in6m_get_source(inm, &lims->im6s_addr, 0, &nims);
 		++schanged;
 		if (error)
 			break;
 		im6s_merge(nims, lims, 0);
 	}
 	if (error) {
 		struct ip6_msource *bims;
 
 		RB_FOREACH_REVERSE_FROM(ims, ip6_msource_tree, nims) {
 			lims = (struct in6_msource *)ims;
 			if (lims->im6sl_st[0] == lims->im6sl_st[1])
 				continue;
 			(void)in6m_get_source(inm, &lims->im6s_addr, 1, &bims);
 			if (bims == NULL)
 				continue;
 			im6s_merge(bims, lims, 1);
 		}
 		goto out_reap;
 	}
 
 	CTR3(KTR_MLD, "%s: imf filters in-mode: %d at t0, %d at t1",
 	    __func__, nsrc0, nsrc1);
 
 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
 	if (imf->im6f_st[0] == imf->im6f_st[1] &&
 	    imf->im6f_st[1] == MCAST_INCLUDE) {
 		if (nsrc1 == 0) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 	}
 
 	/* Handle filter mode transition on socket. */
 	if (imf->im6f_st[0] != imf->im6f_st[1]) {
 		CTR3(KTR_MLD, "%s: imf transition %d to %d",
 		    __func__, imf->im6f_st[0], imf->im6f_st[1]);
 
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: --ex on inm at t1", __func__);
 			--inm->in6m_st[1].iss_ex;
 		} else if (imf->im6f_st[0] == MCAST_INCLUDE) {
 			CTR1(KTR_MLD, "%s: --in on inm at t1", __func__);
 			--inm->in6m_st[1].iss_in;
 		}
 
 		if (imf->im6f_st[1] == MCAST_EXCLUDE) {
 			CTR1(KTR_MLD, "%s: ex++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_ex++;
 		} else if (imf->im6f_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
 			CTR1(KTR_MLD, "%s: in++ on inm at t1", __func__);
 			inm->in6m_st[1].iss_in++;
 		}
 	}
 
 	/*
 	 * Track inm filter state in terms of listener counts.
 	 * If there are any exclusive listeners, stack-wide
 	 * membership is exclusive.
 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
 	 * If no listeners remain, state is undefined at t1,
 	 * and the MLD lifecycle for this group should finish.
 	 */
 	if (inm->in6m_st[1].iss_ex > 0) {
 		CTR1(KTR_MLD, "%s: transition to EX", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_EXCLUDE;
 	} else if (inm->in6m_st[1].iss_in > 0) {
 		CTR1(KTR_MLD, "%s: transition to IN", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_INCLUDE;
 	} else {
 		CTR1(KTR_MLD, "%s: transition to UNDEF", __func__);
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 	}
 
 	/* Decrement ASM listener count on transition out of ASM mode. */
 	if (imf->im6f_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
 		if ((imf->im6f_st[1] != MCAST_EXCLUDE) ||
 		    (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 > 0)) {
 			CTR1(KTR_MLD, "%s: --asm on inm at t1", __func__);
 			--inm->in6m_st[1].iss_asm;
 		}
 	}
 
 	/* Increment ASM listener count on transition to ASM mode. */
 	if (imf->im6f_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
 		CTR1(KTR_MLD, "%s: asm++ on inm at t1", __func__);
 		inm->in6m_st[1].iss_asm++;
 	}
 
 	CTR3(KTR_MLD, "%s: merged imf %p to inm %p", __func__, imf, inm);
 	in6m_print(inm);
 
 out_reap:
 	if (schanged > 0) {
 		CTR1(KTR_MLD, "%s: sources changed; reaping", __func__);
 		in6m_reap(inm);
 	}
 	return (error);
 }
 
 /*
  * Mark an in6_multi's filter set deltas as committed.
  * Called by MLD after a state change has been enqueued.
  */
 void
 in6m_commit(struct in6_multi *inm)
 {
 	struct ip6_msource	*ims;
 
 	CTR2(KTR_MLD, "%s: commit inm %p", __func__, inm);
 	CTR1(KTR_MLD, "%s: pre commit:", __func__);
 	in6m_print(inm);
 
 	RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 		ims->im6s_st[0] = ims->im6s_st[1];
 	}
 	inm->in6m_st[0] = inm->in6m_st[1];
 }
 
 /*
  * Release the source nodes of an in6_multi that nothing refers to
  * any more.  A node is kept while any of its include/exclude counters
  * at t0 or t1 is positive, or while im6s_stp is non-zero.
  */
 static void
 in6m_reap(struct in6_multi *inm)
 {
 	struct ip6_msource	*src, *next;
 	int			 inuse;
 
 	RB_FOREACH_SAFE(src, ip6_msource_tree, &inm->in6m_srcs, next) {
 		inuse = src->im6s_st[0].ex > 0 || src->im6s_st[0].in > 0 ||
 		    src->im6s_st[1].ex > 0 || src->im6s_st[1].in > 0 ||
 		    src->im6s_stp != 0;
 		if (!inuse) {
 			CTR2(KTR_MLD, "%s: free ims %p", __func__, src);
 			RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, src);
 			free(src, M_IP6MSOURCE);
 			inm->in6m_nsrc--;
 		}
 	}
 }
 
 /*
  * Unconditionally release every source node of an in6_multi and
  * drain its queue of pending state-change requests.
  */
 static void
 in6m_purge(struct in6_multi *inm)
 {
 	struct ip6_msource	*src, *next;
 
 	RB_FOREACH_SAFE(src, ip6_msource_tree, &inm->in6m_srcs, next) {
 		CTR2(KTR_MLD, "%s: free ims %p", __func__, src);
 		RB_REMOVE(ip6_msource_tree, &inm->in6m_srcs, src);
 		free(src, M_IP6MSOURCE);
 		inm->in6m_nsrc--;
 	}
 
 	/* Discard any state-change messages still waiting on the queue. */
 	mbufq_drain(&inm->in6m_scq);
 }
 
 /*
  * Join a multicast address without any source filters.
  * KAME compatibility entry point.
  *
  * This wrapper takes the IN6_MULTI lock around in6_joingroup_locked().
  * Note that the imf argument is not forwarded: NULL is always passed
  * down, so the locked routine builds its own temporary filter.
  *
  * SMPng: Assume no mc locks held by caller.
  *
  * Returns whatever in6_joingroup_locked() returns.
  */
 int
 in6_joingroup(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	int rc;
 
 	IN6_MULTI_LOCK();
 	rc = in6_joingroup_locked(ifp, mcaddr, NULL, pinm, delay);
 	IN6_MULTI_UNLOCK();
 
 	return (rc);
 }
 
 /*
  * Join a multicast group; real entry point.
  *
  * ifp is the interface to join on and mcaddr the group address.
  * imf is the socket-layer filter to merge, or NULL for in-kernel
  * consumers (a temporary filter is then initialized on the stack).
  * On success *pinm is set to the group descriptor; on failure *pinm
  * is not written.
  * delay is passed through unchanged to mld_change_state().
  *
  * The caller must hold the IN6_MULTI lock and must not hold the
  * IN6_MULTI list lock (both are asserted); the list lock is taken and
  * released internally.
  *
  * Only preserves atomicity at inm level.
  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
  *
  * If the MLD downcall fails, the group is not joined, and an error
  * code is returned.
  */
 static int
 in6_joingroup_locked(struct ifnet *ifp, const struct in6_addr *mcaddr,
     /*const*/ struct in6_mfilter *imf, struct in6_multi **pinm,
     const int delay)
 {
 	struct in6_multi_head    inmh;
 	struct in6_mfilter	 timf;
 	struct in6_multi	*inm;
 	struct ifmultiaddr *ifma;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	/*
 	 * Sanity: Check scope zone ID was set for ifp, if and
 	 * only if group is scoped to an interface.
 	 */
 	KASSERT(IN6_IS_ADDR_MULTICAST(mcaddr),
 	    ("%s: not a multicast address", __func__));
 	if (IN6_IS_ADDR_MC_LINKLOCAL(mcaddr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(mcaddr)) {
 		KASSERT(mcaddr->s6_addr16[1] != 0,
 		    ("%s: scope zone ID not set", __func__));
 	}
 
 	IN6_MULTI_LOCK_ASSERT();
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: join %s on %p(%s))", __func__,
 	    ip6_sprintf(ip6tbuf, mcaddr), ifp, if_name(ifp));
 
 	error = 0;
 	inm = NULL;
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM join.
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
 		imf = &timf;
 	}
 	/* Obtain the group descriptor; nothing to undo if this fails. */
 	error = in6_getmulti(ifp, mcaddr, &inm);
 	if (error) {
 		CTR1(KTR_MLD, "%s: in6_getmulti() failure", __func__);
 		return (error);
 	}
 
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 		goto out_in6m_release;
 	}
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	error = mld_change_state(inm, delay);
 	if (error) {
 		CTR1(KTR_MLD, "%s: failed to update source", __func__);
 		goto out_in6m_release;
 	}
 
 	/*
 	 * Common exit, reached on success as well as on failure.  On
 	 * failure the ifma back-pointer to inm is cleared and the
 	 * reference on inm is dropped; on success inm is handed back.
 	 */
 out_in6m_release:
 	SLIST_INIT(&inmh);
 	if (error) {
 		struct epoch_tracker et;
 
 		CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 		IF_ADDR_WLOCK(ifp);
 		NET_EPOCH_ENTER(et);
 		/* Detach inm from the interface's multicast address entry. */
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (ifma->ifma_protospec == inm) {
 				ifma->ifma_protospec = NULL;
 				break;
 			}
 		}
 		in6m_disconnect_locked(&inmh, inm);
 		in6m_rele_locked(&inmh, inm);
 		NET_EPOCH_EXIT(et);
 		IF_ADDR_WUNLOCK(ifp);
 	} else {
 		*pinm = inm;
 	}
 	IN6_MULTI_LIST_UNLOCK();
 	/* Entries collected on inmh are released after the list lock is dropped. */
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Leave a multicast group; unlocked entry point.
  *
  * Takes the IN6_MULTI lock around in6_leavegroup_locked() and returns
  * that function's result.
  */
 int
 in6_leavegroup(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	int rc;
 
 	IN6_MULTI_LOCK();
 	rc = in6_leavegroup_locked(inm, imf);
 	IN6_MULTI_UNLOCK();
 
 	return (rc);
 }
 
 /*
  * Leave a multicast group; real entry point.
  * All source filters will be expunged.
  *
  * inm is the group descriptor to leave.
  * imf is the socket-layer filter to merge, or NULL for in-kernel
  * consumers (a temporary filter is then initialized on the stack).
  *
  * The caller must hold the IN6_MULTI lock (asserted); the IN6_MULTI
  * list lock is taken and released internally.
  *
  * Only preserves atomicity at inm level.
  *
  * Holding the write lock for the INP which contains imf
  * is highly advisable. We can't assert for it as imf does not
  * contain a back-pointer to the owning inp.
  *
  * Note: This is not the same as in6m_release(*) as this function also
  * makes a state change downcall into MLD.
  *
  * Returns the result of mld_change_state(), or 0 if the group has no
  * interface attached and the downcall is skipped.
  */
 int
 in6_leavegroup_locked(struct in6_multi *inm, /*const*/ struct in6_mfilter *imf)
 {
 	struct in6_multi_head	 inmh;
 	struct in6_mfilter	 timf;
 	struct ifnet *ifp;
 	int			 error;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	error = 0;
 
 	IN6_MULTI_LOCK_ASSERT();
 
 	CTR5(KTR_MLD, "%s: leave inm %p, %s/%s, imf %p", __func__,
 	    inm, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    (in6m_is_ifp_detached(inm) ? "null" : if_name(inm->in6m_ifp)),
 	    imf);
 
 	/*
 	 * If no imf was specified (i.e. kernel consumer),
 	 * fake one up and assume it is an ASM membership being left
 	 * (initialized with MCAST_EXCLUDE and MCAST_UNDEFINED).
 	 */
 	if (imf == NULL) {
 		im6f_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
 		imf = &timf;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 *
 	 * As this particular invocation should not cause any memory
 	 * to be allocated, and there is no opportunity to roll back
 	 * the transaction, it MUST NOT fail.
 	 */
 
 	ifp = inm->in6m_ifp;
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
 
 	CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 	/* The merge result is discarded; only the downcall result is returned. */
 	error = 0;
 	if (ifp)
 		error = mld_change_state(inm, 0);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 
 	CTR2(KTR_MLD, "%s: dropping ref on %p", __func__, inm);
 	/* The interface address lock is only taken when an ifp is attached. */
 	if (ifp)
 		IF_ADDR_WLOCK(ifp);
 
 	SLIST_INIT(&inmh);
 	/* Disconnect inm first when the reference being dropped is the only one. */
 	if (inm->in6m_refcount == 1)
 		in6m_disconnect_locked(&inmh, inm);
 	in6m_rele_locked(&inmh, inm);
 	if (ifp)
 		IF_ADDR_WUNLOCK(ifp);
 	IN6_MULTI_LIST_UNLOCK();
 	/* Entries collected on inmh are released after the list lock is dropped. */
 	in6m_release_list_deferred(&inmh);
 	return (error);
 }
 
 /*
  * Block or unblock an ASM multicast source on an inpcb.
  * This implements the delta-based API described in RFC 3678.
  *
  * Handles the MCAST_BLOCK_SOURCE and MCAST_UNBLOCK_SOURCE socket
  * options; the request is copied in as a struct group_source_req.
  *
  * The delta-based API applies only to exclusive-mode memberships.
  * An MLD downcall will be performed.
  *
  * Locking: inp is passed in unlocked.  in6p_findmoptions() returns
  * with the INP write lock held, and every path after that call leaves
  * through out_in6p_locked, which releases it.  The early returns made
  * during argument validation happen before the lock is taken.
  *
  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
  *
  * Return 0 if successful, otherwise return an appropriate error code:
  * EINVAL for malformed addresses or a non-exclusive membership,
  * EADDRNOTAVAIL for an unknown interface, a group the socket has not
  * joined, or a source that is already in the requested state,
  * EOPNOTSUPP for an unexpected option name, ENOMEM if a new source
  * entry cannot be allocated, or the error from the merge/downcall.
  */
 static int
 in6p_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct group_source_req		 gsr;
 	struct epoch_tracker		 et;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi			*inm;
 	uint16_t			 fmode;
 	int				 error, doblock;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	error = 0;
 	doblock = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	/* gsa and ssa alias the group and source addresses inside gsr. */
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 
 	switch (sopt->sopt_name) {
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		error = sooptcopyin(sopt, &gsr,
 		    sizeof(struct group_source_req),
 		    sizeof(struct group_source_req));
 		if (error)
 			return (error);
 
 		/* Both group and source must be well-formed AF_INET6 sockaddrs. */
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (ssa->sin6.sin6_family != AF_INET6 ||
 		    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		/*
 		 * XXXGL: this function should use ifnet_byindex_ref, or
 		 * expand the epoch section all the way to where we put
 		 * the reference.
 		 */
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 
 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
 			doblock = 1;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Check if we are actually a member of this group.
 	 * in6p_findmoptions() returns with the INP write lock held.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imf->im6f_in6m;
 
 	/*
 	 * Attempting to use the delta-based API on an
 	 * non exclusive-mode membership is an error.
 	 */
 	fmode = imf->im6f_st[0];
 	if (fmode != MCAST_EXCLUDE) {
 		error = EINVAL;
 		goto out_in6p_locked;
 	}
 
 	/*
 	 * Deal with error cases up-front:
 	 *  Asked to block, but already blocked; or
 	 *  Asked to unblock, but nothing to unblock.
 	 * If adding a new block entry, allocate it.
 	 */
 	ims = im6o_match_source(imf, &ssa->sa);
 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
 		CTR3(KTR_MLD, "%s: source %s %spresent", __func__,
 		    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 		    doblock ? "" : "not ");
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 * Blocking grafts a new source in the membership's mode at t1;
 	 * unblocking prunes the existing source at t1.
 	 */
 	if (doblock) {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		ims = im6f_graft(imf, fmode, &ssa->sin6);
 		if (ims == NULL)
 			error = ENOMEM;
 	} else {
 		CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		error = im6f_prune(imf, &ssa->sin6);
 	}
 
 	if (error) {
 		CTR1(KTR_MLD, "%s: merge imf state failed", __func__);
 		goto out_im6f_rollback;
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	IN6_MULTI_LIST_LOCK();
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 	/*
 	 * Finish the socket-layer transaction: revert the t1 changes on
 	 * error, commit them otherwise, then free unreferenced sources.
 	 */
 out_im6f_rollback:
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Return the multicast options structure of an inpcb, creating it on
  * first use.  The inpcb is passed in unlocked and is returned with its
  * write lock held.  May sleep, since the allocation uses M_WAITOK.
  *
  * The lock is dropped around the allocation, so the pointer is checked
  * again after re-locking; if another thread installed options in the
  * meantime, the fresh allocation is discarded in favour of theirs.
  *
  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
  * SMPng: NOTE: Returns with the INP write lock held.
  */
 static struct ip6_moptions *
 in6p_findmoptions(struct inpcb *inp)
 {
 	struct ip6_moptions	 *fresh;
 
 	INP_WLOCK(inp);
 	if (inp->in6p_moptions == NULL) {
 		/* Cannot sleep in malloc() while holding the INP lock. */
 		INP_WUNLOCK(inp);
 
 		fresh = malloc(sizeof(*fresh), M_IP6MOPTS, M_WAITOK);
 		fresh->im6o_multicast_ifp = NULL;
 		fresh->im6o_multicast_hlim = V_ip6_defmcasthlim;
 		fresh->im6o_multicast_loop = in6_mcast_loop;
 		STAILQ_INIT(&fresh->im6o_head);
 
 		INP_WLOCK(inp);
 		if (inp->in6p_moptions == NULL)
 			inp->in6p_moptions = fresh;
 		else
 			free(fresh, M_IP6MOPTS);
 	}
 
 	return (inp->in6p_moptions);
 }
 
 /*
  * Tear down an IPv6 multicast options structure together with all of
  * its memberships and source filters, then free the structure itself.
  *
  * Each filter is unlinked from the options, put into the "leave"
  * state, and, if it still refers to a group, the group is left via
  * in6_leavegroup().  When the group has an interface attached, the
  * leave runs with that interface's vnet set as the current vnet.
  *
  * SMPng: NOTE: the original note says the INP write lock is assumed
  * to be held; nothing in this function asserts it.
  *
  * XXX can all be safely deferred to epoch_call
  */
 
 static void
 inp_gcmoptions(struct ip6_moptions *imo)
 {
 	struct in6_mfilter *filt;
 	struct in6_multi *grp;
 	struct ifnet *ifp;
 
 	while ((filt = ip6_mfilter_first(&imo->im6o_head)) != NULL) {
 		ip6_mfilter_remove(&imo->im6o_head, filt);
 		im6f_leave(filt);
 
 		grp = filt->im6f_in6m;
 		if (grp != NULL) {
 			ifp = grp->in6m_ifp;
 			if (ifp == NULL) {
 				(void)in6_leavegroup(grp, filt);
 			} else {
 				CURVNET_SET(ifp->if_vnet);
 				(void)in6_leavegroup(grp, filt);
 				CURVNET_RESTORE();
 			}
 		}
 
 		ip6_mfilter_free(filt);
 	}
 
 	free(imo, M_IP6MOPTS);
 }
 
 /*
  * Release an IPv6 multicast options structure and everything hanging
  * off it.  A NULL pointer is accepted and ignored.
  */
 void
 ip6_freemoptions(struct ip6_moptions *imo)
 {
 	if (imo != NULL)
 		inp_gcmoptions(imo);
 }
 
 /*
  * Atomically get source filters on a socket for an IPv6 multicast group.
  * Called with INP lock held; returns with lock released.
  *
  * The request is copied in from sopt as a struct __msfilterreq.  On
  * success the filter mode and the number of in-mode sources are
  * written back through sopt, and up to msfr_nsrcs source addresses are
  * copied to the user buffer msfr_srcs if one was supplied.
  *
  * The INP lock is dropped around the copyin and the interface lookup,
  * re-taken to walk the filter set, and dropped again before copyout.
  *
  * Returns 0 on success; EINVAL for a malformed or non-multicast group;
  * EADDRNOTAVAIL for an unknown interface or a group the socket has not
  * joined; EAGAIN if the membership is undefined at t1; ENOBUFS if the
  * temporary buffer cannot be allocated; or a copyin/copyout error.
  */
 static int
 in6p_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct __msfilterreq	 msfr;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct ip6_moptions	*imo;
 	struct in6_mfilter	*imf;
 	struct ip6_msource	*ims;
 	struct in6_msource	*lims;
 	struct sockaddr_in6	*psin;
 	struct sockaddr_storage	*ptss;
 	struct sockaddr_storage	*tss;
 	int			 error;
 	size_t			 nsrcs, ncsrcs;
 
 	INP_WLOCK_ASSERT(inp);
 
 	imo = inp->in6p_moptions;
 	KASSERT(imo != NULL, ("%s: null ip6_moptions", __func__));
 
 	/* Drop the lock while copying the request in from the caller. */
 	INP_WUNLOCK(inp);
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * XXXGL: this function should use ifnet_byindex_ref, or expand the
 	 * epoch section all the way to where the interface is referenced.
 	 */
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	INP_WLOCK(inp);
 
 	/*
 	 * Lookup group on the socket.
 	 */
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		INP_WUNLOCK(inp);
 		return (EADDRNOTAVAIL);
 	}
 
 	/*
 	 * Ignore memberships which are in limbo.
 	 */
 	if (imf->im6f_st[1] == MCAST_UNDEFINED) {
 		INP_WUNLOCK(inp);
 		return (EAGAIN);
 	}
 	msfr.msfr_fmode = imf->im6f_st[1];
 
 	/*
 	 * If the user specified a buffer, copy out the source filter
 	 * entries to userland gracefully.
 	 * We only copy out the number of entries which userland
 	 * has asked for, but we always tell userland how big the
 	 * buffer really needs to be.
 	 */
 	/* Clamp the requested count to the per-socket source limit. */
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		msfr.msfr_nsrcs = in6_mcast_maxsocksrc;
 	tss = NULL;
 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
 		/* Zeroed staging buffer; M_NOWAIT because the INP lock is held. */
 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (tss == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Count number of sources in-mode at t0.
 	 * If buffer space exists and remains, copy out source entries.
 	 */
 	nsrcs = msfr.msfr_nsrcs;
 	ncsrcs = 0;
 	ptss = tss;
 	RB_FOREACH(ims, ip6_msource_tree, &imf->im6f_sources) {
 		lims = (struct in6_msource *)ims;
 		if (lims->im6sl_st[0] == MCAST_UNDEFINED ||
 		    lims->im6sl_st[0] != imf->im6f_st[0])
 			continue;
 		++ncsrcs;
 		/* nsrcs counts the staging slots still free. */
 		if (tss != NULL && nsrcs > 0) {
 			psin = (struct sockaddr_in6 *)ptss;
 			psin->sin6_family = AF_INET6;
 			psin->sin6_len = sizeof(struct sockaddr_in6);
 			psin->sin6_addr = lims->im6s_addr;
 			psin->sin6_port = 0;
 			--nsrcs;
 			++ptss;
 		}
 	}
 
 	INP_WUNLOCK(inp);
 
 	if (tss != NULL) {
 		/*
 		 * The whole staging buffer is copied out; slots that were
 		 * not filled remain zeroed from the allocation.
 		 */
 		error = copyout(tss, msfr.msfr_srcs,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		free(tss, M_TEMP);
 		if (error)
 			return (error);
 	}
 
 	/* Report the total number of in-mode sources, not just those copied. */
 	msfr.msfr_nsrcs = ncsrcs;
 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
 
 	return (error);
 }
 
 /*
  * Return the IP multicast options in response to user getsockopt().
  *
  * Handles IPV6_MULTICAST_IF, IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP
  * and IPV6_MSFILTER.  Only sockets of type SOCK_RAW or SOCK_DGRAM are
  * accepted.
  *
  * The INP write lock is taken on entry and released on every path
  * before returning (asserted at the end); for IPV6_MSFILTER the
  * release is done by in6p_get_source_filters().
  *
  * Returns 0 on success, EOPNOTSUPP for an unsupported socket type,
  * ENOPROTOOPT for an unknown option, EADDRNOTAVAIL for IPV6_MSFILTER
  * on a socket without multicast options, or the error from the
  * copyout / source-filter routine.
  */
 int
 ip6_getmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*im6o;
 	int			 error;
 	u_int			 optval;
 
 	INP_WLOCK(inp);
 	im6o = inp->in6p_moptions;
 	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
 	if (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = 0;
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		/* Report index 0 when no outgoing interface has been set. */
 		if (im6o == NULL || im6o->im6o_multicast_ifp == NULL) {
 			optval = 0;
 		} else {
 			optval = im6o->im6o_multicast_ifp->if_index;
 		}
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_HOPS:
 		/* Fall back to the default hop limit without options. */
 		if (im6o == NULL)
 			optval = V_ip6_defmcasthlim;
 		else
 			optval = im6o->im6o_multicast_hlim;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MULTICAST_LOOP:
 		if (im6o == NULL)
 			optval = in6_mcast_loop; /* XXX VIMAGE */
 		else
 			optval = im6o->im6o_multicast_loop;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof(u_int));
 		break;
 
 	case IPV6_MSFILTER:
 		if (im6o == NULL) {
 			error = EADDRNOTAVAIL;
 			INP_WUNLOCK(inp);
 		} else {
 			/* Callee is entered locked and returns unlocked. */
 			error = in6p_get_source_filters(inp, sopt);
 		}
 		break;
 
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (error);
 }
 
 /*
  * Look up the ifnet to use for a multicast group membership,
  * given the address of an IPv6 group.
  *
  * This routine exists to support legacy IPv6 multicast applications.
  *
  * Use the socket's current FIB number for any required FIB lookup. Look up the
  * group address in the unicast FIB, and use its ifp; usually, this points to
  * the default next-hop.  If the FIB lookup fails, return NULL.
  *
  * FUTURE: Support multiple forwarding tables for IPv6.
  *
  * Returns NULL if no ifp could be found.
  */
 static struct ifnet *
 in6p_lookup_mcast_ifp(const struct inpcb *inp, const struct sockaddr_in6 *gsin6)
 {
 	struct nhop_object	*nh;
 	struct in6_addr		dst;
 	uint32_t		scopeid;
 	uint32_t		fibnum;
 
 	KASSERT(gsin6->sin6_family == AF_INET6,
 	    ("%s: not AF_INET6 group", __func__));
 
 	in6_splitscope(&gsin6->sin6_addr, &dst, &scopeid);
 	fibnum = inp->inp_inc.inc_fibnum;
 	nh = fib6_lookup(fibnum, &dst, scopeid, 0, 0);
 
 	return (nh ? nh->nh_ifp : NULL);
 }
 
 /*
  * Join an IPv6 multicast group, possibly with a source.
  *
  * FIXME: The KAME use of the unspecified address (::)
  * to join *all* multicast groups is currently unsupported.
  *
  * XXXGL: this function multiple times uses ifnet_byindex() without
  * proper protection - staying in epoch, or putting reference on ifnet.
  */
 static int
 in6p_join_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct in6_multi_head		 inmh;
 	struct group_source_req		 gsr;
 	struct epoch_tracker		 et;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_multi		*inm;
 	struct in6_msource		*lims;
 	int				 error, is_new;
 
 	SLIST_INIT(&inmh);
 	ifp = NULL;
 	lims = NULL;
 	error = 0;
 
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything into struct group_source_req.
 	 * Overwrite the port field if present, as the sockaddr
 	 * being copied in may be matched with a binary comparison.
 	 * Ignore passed-in scope ID.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_JOIN_GROUP: {
 		struct ipv6_mreq mreq;
 
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 
 		if (mreq.ipv6mr_interface == 0) {
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			NET_EPOCH_ENTER(et);
 			ifp = ifnet_byindex(mreq.ipv6mr_interface);
 			NET_EPOCH_EXIT(et);
 			if (ifp == NULL)
 				return (EADDRNOTAVAIL);
 		}
 		CTR3(KTR_MLD, "%s: ipv6mr_interface = %d, ifp = %p",
 		    __func__, mreq.ipv6mr_interface, ifp);
 	} break;
 
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 
 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 			ssa->sin6.sin6_port = 0;
 			ssa->sin6.sin6_scope_id = 0;
 		}
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(gsr.gsr_interface);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
 		return (EADDRNOTAVAIL);
 
 	gsa->sin6.sin6_port = 0;
 	gsa->sin6.sin6_scope_id = 0;
 
 	/*
 	 * Always set the scope zone ID on memberships created from userland.
 	 * Use the passed-in ifp to do this.
 	 * XXX The in6_setscope() return value is meaningless.
 	 * XXX SCOPE6_LOCK() is taken by in6_setscope().
 	 */
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	IN6_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		is_new = 1;
 		inm = NULL;
 
 		if (ip6_mfilter_count(&imo->im6o_head) >= IPV6_MAX_MEMBERSHIPS) {
 			error = ENOMEM;
 			goto out_in6p_locked;
 		}
 	} else {
 		is_new = 0;
 		inm = imf->im6f_in6m;
 
 		if (ssa->ss.ss_family != AF_UNSPEC) {
 			/*
 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
 			 * is an error. On an existing inclusive membership,
 			 * it just adds the source to the filter list.
 			 */
 			if (imf->im6f_st[1] != MCAST_INCLUDE) {
 				error = EINVAL;
 				goto out_in6p_locked;
 			}
 			/*
 			 * Throw out duplicates.
 			 *
 			 * XXX FIXME: This makes a naive assumption that
 			 * even if entries exist for *ssa in this imf,
 			 * they will be rejected as dupes, even if they
 			 * are not valid in the current mode (in-mode).
 			 *
 			 * in6_msource is transactioned just as for anything
 			 * else in SSM -- but note naive use of in6m_graft()
 			 * below for allocating new filter entries.
 			 *
 			 * This is only an issue if someone mixes the
 			 * full-state SSM API with the delta-based API,
 			 * which is discouraged in the relevant RFCs.
 			 */
 			lims = im6o_match_source(imf, &ssa->sa);
 			if (lims != NULL /*&&
 			    lims->im6sl_st[1] == MCAST_INCLUDE*/) {
 				error = EADDRNOTAVAIL;
 				goto out_in6p_locked;
 			}
 		} else {
 			/*
 			 * MCAST_JOIN_GROUP alone, on any existing membership,
 			 * is rejected, to stop the same inpcb tying up
 			 * multiple refs to the in_multi.
 			 * On an existing inclusive membership, this is also
 			 * an error; if you want to change filter mode,
 			 * you must use the userland API setsourcefilter().
 			 * XXX We don't reject this for imf in UNDEFINED
 			 * state at t1, because allocation of a filter
 			 * is atomic with allocation of a membership.
 			 */
 			error = EADDRINUSE;
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Graft new source into filter list for this inpcb's
 	 * membership of the group. The in6_multi may not have
 	 * been allocated yet if this is a new membership, however,
 	 * the in_mfilter slot will be allocated and must be initialized.
 	 *
 	 * Note: Grafting of exclusive mode filters doesn't happen
 	 * in this path.
 	 * XXX: Should check for non-NULL lims (node exists but may
 	 * not be in-mode) for interop with full-state API.
 	 */
 	if (ssa->ss.ss_family != AF_UNSPEC) {
 		/* Membership starts in IN mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/source", __func__);
 			imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_INCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_in6p_locked;
 			}
 		} else {
 			CTR2(KTR_MLD, "%s: %s source", __func__, "allow");
 		}
 		lims = im6f_graft(imf, MCAST_INCLUDE, &ssa->sin6);
 		if (lims == NULL) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			error = ENOMEM;
 			goto out_in6p_locked;
 		}
 	} else {
 		/* No address specified; Membership starts in EX mode */
 		if (is_new) {
 			CTR1(KTR_MLD, "%s: new join w/o source", __func__);
 			imf = ip6_mfilter_alloc(M_NOWAIT, MCAST_UNDEFINED, MCAST_EXCLUDE);
 			if (imf == NULL) {
 				error = ENOMEM;
 				goto out_in6p_locked;
 			}
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	if (is_new) {
 		in_pcbref(inp);
 		INP_WUNLOCK(inp);
 
 		error = in6_joingroup_locked(ifp, &gsa->sin6.sin6_addr, imf,
 		    &imf->im6f_in6m, 0);
 
 		INP_WLOCK(inp);
 		if (in_pcbrele_wlocked(inp)) {
 			error = ENXIO;
 			goto out_in6p_unlocked;
 		}
 		if (error) {
 			goto out_in6p_locked;
 		}
 		/*
 		 * NOTE: Refcount from in6_joingroup_locked()
 		 * is protecting membership.
 		 */
 		ip6_mfilter_insert(&imo->im6o_head, imf);
 	} else {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 			IN6_MULTI_LIST_UNLOCK();
 			im6f_rollback(imf);
 			im6f_reap(imf);
 			goto out_in6p_locked;
 		}
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		IN6_MULTI_LIST_UNLOCK();
 
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed mld downcall",
 			     __func__);
 			im6f_rollback(imf);
 			im6f_reap(imf);
 			goto out_in6p_locked;
 		}
 	}
 
 	im6f_commit(imf);
 	imf = NULL;
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 out_in6p_unlocked:
 	IN6_MULTI_UNLOCK();
 
 	if (is_new && imf) {
 		if (imf->im6f_in6m != NULL) {
 			struct in6_multi_head inmh;
 
 			SLIST_INIT(&inmh);
 			SLIST_INSERT_HEAD(&inmh, imf->im6f_in6m, in6m_defer);
 			in6m_release_list_deferred(&inmh);
 		}
 		ip6_mfilter_free(imf);
 	}
 	return (error);
 }
 
 /*
  * Leave an IPv6 multicast group on an inpcb, possibly with a source.
  *
  * Handles IPV6_LEAVE_GROUP, MCAST_LEAVE_GROUP and MCAST_LEAVE_SOURCE_GROUP;
  * any other sopt_name yields EOPNOTSUPP. The request is first normalized
  * into a struct group_source_req, then the interface is resolved, and
  * finally either the whole membership is given up (no source supplied) or
  * a single source is pruned and the change is merged into MLD state.
  *
  * Returns 0 on success or an errno value. Every return path reached after
  * in6p_findmoptions() releases the inpcb with INP_WUNLOCK().
  */
 static int
 in6p_leave_group(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ipv6_mreq		 mreq;
 	struct group_source_req		 gsr;
 	struct epoch_tracker		 et;
 	sockunion_t			*gsa, *ssa;
 	struct ifnet			*ifp;
 	struct in6_mfilter		*imf;
 	struct ip6_moptions		*imo;
 	struct in6_msource		*ims;
 	struct in6_multi		*inm;
 	uint32_t			 ifindex;
 	int				 error;
 	bool				 is_final;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	ifp = NULL;
 	ifindex = 0;
 	error = 0;
 	is_final = true;
 
 	/*
 	 * Start from a zeroed request with both addresses marked
 	 * unspecified; ssa stays AF_UNSPEC unless a source is copied in.
 	 */
 	memset(&gsr, 0, sizeof(struct group_source_req));
 	gsa = (sockunion_t *)&gsr.gsr_group;
 	gsa->ss.ss_family = AF_UNSPEC;
 	ssa = (sockunion_t *)&gsr.gsr_source;
 	ssa->ss.ss_family = AF_UNSPEC;
 
 	/*
 	 * Chew everything passed in up into a struct group_source_req
 	 * as that is easier to process.
 	 * Note: Any embedded scope ID in the multicast group passed
 	 * in by userland is ignored, the interface index is the recommended
 	 * mechanism to specify an interface; see below.
 	 */
 	switch (sopt->sopt_name) {
 	case IPV6_LEAVE_GROUP:
 		error = sooptcopyin(sopt, &mreq, sizeof(struct ipv6_mreq),
 		    sizeof(struct ipv6_mreq));
 		if (error)
 			return (error);
 		gsa->sin6.sin6_family = AF_INET6;
 		gsa->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		gsa->sin6.sin6_addr = mreq.ipv6mr_multiaddr;
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = mreq.ipv6mr_interface;
 		break;
 
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		/* Only the source variant carries a gsr_source to copy in. */
 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_req),
 			    sizeof(struct group_req));
 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			error = sooptcopyin(sopt, &gsr,
 			    sizeof(struct group_source_req),
 			    sizeof(struct group_source_req));
 		}
 		if (error)
 			return (error);
 
 		if (gsa->sin6.sin6_family != AF_INET6 ||
 		    gsa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
 			if (ssa->sin6.sin6_family != AF_INET6 ||
 			    ssa->sin6.sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 			/* A source must be unicast. */
 			if (IN6_IS_ADDR_MULTICAST(&ssa->sin6.sin6_addr))
 				return (EINVAL);
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&ssa->sin6.sin6_addr);
 		}
 		gsa->sin6.sin6_port = 0;
 		gsa->sin6.sin6_scope_id = 0;
 		ifindex = gsr.gsr_interface;
 		break;
 
 	default:
 		CTR2(KTR_MLD, "%s: unknown sopt_name %d",
 		    __func__, sopt->sopt_name);
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	/*
 	 * Validate interface index if provided. If no interface index
 	 * was provided separately, attempt to look the membership up
 	 * from the default scope as a last resort to disambiguate
 	 * the membership we are being asked to leave.
 	 * XXX SCOPE6 lock potentially taken here.
 	 */
 	if (ifindex != 0) {
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(ifindex);
 		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 		(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 	} else {
 		error = sa6_embedscope(&gsa->sin6, V_ip6_use_defzone);
 		if (error)
 			return (EADDRNOTAVAIL);
 		/*
 		 * Some badly behaved applications don't pass an ifindex
 		 * or a scope ID, which is an API violation. In this case,
 		 * perform a lookup as per a v6 join.
 		 *
 		 * XXX For now, stomp on zone ID for the corner case.
 		 * This is not the 'KAME way', but we need to see the ifp
 		 * directly until such time as this implementation is
 		 * refactored, assuming the scope IDs are the way to go.
 		 */
 		/* Read back the zone embedded in the second 16-bit word. */
 		ifindex = ntohs(gsa->sin6.sin6_addr.s6_addr16[1]);
 		if (ifindex == 0) {
 			CTR2(KTR_MLD, "%s: warning: no ifindex, looking up "
 			    "ifp for group %s.", __func__,
 			    ip6_sprintf(ip6tbuf, &gsa->sin6.sin6_addr));
 			ifp = in6p_lookup_mcast_ifp(inp, &gsa->sin6);
 		} else {
 			NET_EPOCH_ENTER(et);
 			ifp = ifnet_byindex(ifindex);
 			NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		}
 		if (ifp == NULL)
 			return (EADDRNOTAVAIL);
 	}
 
 	CTR2(KTR_MLD, "%s: ifp = %p", __func__, ifp);
 	KASSERT(ifp != NULL, ("%s: ifp did not resolve", __func__));
 
 	IN6_MULTI_LOCK();
 
 	/*
 	 * Find the membership in the membership list.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imf->im6f_in6m;
 
 	/* A source was supplied, so only that source is being left. */
 	if (ssa->ss.ss_family != AF_UNSPEC)
 		is_final = false;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * If we were instructed only to leave a given source, do so.
 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
 	 */
 	if (is_final) {
 		ip6_mfilter_remove(&imo->im6o_head, imf);
 		im6f_leave(imf);
 
 		/*
 		 * Give up the multicast address record to which
 		 * the membership points.
 		 */
 		(void)in6_leavegroup_locked(inm, imf);
 	} else {
 		if (imf->im6f_st[0] == MCAST_EXCLUDE) {
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		ims = im6o_match_source(imf, &ssa->sa);
 		if (ims == NULL) {
 			CTR3(KTR_MLD, "%s: source %p %spresent", __func__,
 			    ip6_sprintf(ip6tbuf, &ssa->sin6.sin6_addr),
 			    "not ");
 			error = EADDRNOTAVAIL;
 			goto out_in6p_locked;
 		}
 		CTR2(KTR_MLD, "%s: %s source", __func__, "block");
 		error = im6f_prune(imf, &ssa->sin6);
 		if (error) {
 			CTR1(KTR_MLD, "%s: merge imf state failed",
 			    __func__);
 			goto out_in6p_locked;
 		}
 	}
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	if (!is_final) {
 		CTR1(KTR_MLD, "%s: merge inm state", __func__);
 		IN6_MULTI_LIST_LOCK();
 		error = in6m_merge(inm, imf);
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed to merge inm state",
 			    __func__);
 			IN6_MULTI_LIST_UNLOCK();
 			/* Undo the socket-layer change made above. */
 			im6f_rollback(imf);
 			im6f_reap(imf);
                         goto out_in6p_locked;
 		}
 
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		IN6_MULTI_LIST_UNLOCK();
 		if (error) {
 			CTR1(KTR_MLD, "%s: failed mld downcall",
 			     __func__);
 			im6f_rollback(imf);
 			im6f_reap(imf);
                         goto out_in6p_locked;
 		}
 	}
 
 	im6f_commit(imf);
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 
 	/*
 	 * Only the final-leave path unlinked imf from the membership list
 	 * (ip6_mfilter_remove() above), so only then is it freed here.
 	 */
 	if (is_final && imf)
 		ip6_mfilter_free(imf);
 
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Select the interface for transmitting IPv6 multicast datagrams.
  *
  * The option value must be exactly one u_int holding an interface index;
  * any other size is rejected with EINVAL. An index of 0 removes a previous
  * selection. When no interface is selected, one is chosen for every send.
  * A non-zero index must resolve to an interface with IFF_MULTICAST set,
  * otherwise EADDRNOTAVAIL is returned.
  */
 static int
 in6p_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct epoch_tracker	 et;
 	struct ip6_moptions	*mopts;
 	struct ifnet		*oifp;
 	u_int			 idx;
 	int			 rc;
 
 	if (sopt->sopt_valsize != sizeof(u_int))
 		return (EINVAL);
 
 	rc = sooptcopyin(sopt, &idx, sizeof(u_int), sizeof(u_int));
 	if (rc != 0)
 		return (rc);
 
 	/* Index 0 means "no interface"; anything else must be looked up. */
 	oifp = NULL;
 	NET_EPOCH_ENTER(et);
 	if (idx != 0) {
 		oifp = ifnet_byindex(idx);
 		if (oifp == NULL || !(oifp->if_flags & IFF_MULTICAST)) {
 			NET_EPOCH_EXIT(et);
 			return (EADDRNOTAVAIL);
 		}
 	}
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 
 	/* Record the selection, then release the inpcb. */
 	mopts = in6p_findmoptions(inp);
 	mopts->im6o_multicast_ifp = oifp;
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Atomically set source filters on a socket for an IPv6 multicast group.
  *
  * The option value is a struct __msfilterreq naming the interface index,
  * group, filter mode (MCAST_INCLUDE or MCAST_EXCLUDE) and an optional
  * user-space vector of msfr_nsrcs source addresses. The socket must
  * already be a member of the group on that interface, otherwise
  * EADDRNOTAVAIL is returned. The new filter is built at the socket layer,
  * merged into MLD state, and committed or rolled back as a whole.
  *
  * Returns 0 on success or an errno value.
  *
  * XXXGL: unsafely exits epoch with ifnet pointer
  */
 static int
 in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct __msfilterreq	 msfr;
 	struct epoch_tracker	 et;
 	sockunion_t		*gsa;
 	struct ifnet		*ifp;
 	struct in6_mfilter	*imf;
 	struct ip6_moptions	*imo;
 	struct in6_multi		*inm;
 	int			 error;
 
 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
 	    sizeof(struct __msfilterreq));
 	if (error)
 		return (error);
 
 	/* Bound the source count before it is used to size an allocation. */
 	if (msfr.msfr_nsrcs > in6_mcast_maxsocksrc)
 		return (ENOBUFS);
 
 	if (msfr.msfr_fmode != MCAST_EXCLUDE &&
 	    msfr.msfr_fmode != MCAST_INCLUDE)
 		return (EINVAL);
 
 	if (msfr.msfr_group.ss_family != AF_INET6 ||
 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	gsa = (sockunion_t *)&msfr.msfr_group;
 	if (!IN6_IS_ADDR_MULTICAST(&gsa->sin6.sin6_addr))
 		return (EINVAL);
 
 	gsa->sin6.sin6_port = 0;	/* ignore port */
 
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(msfr.msfr_ifindex);
 	NET_EPOCH_EXIT(et);
 	if (ifp == NULL)
 		return (EADDRNOTAVAIL);
 	(void)in6_setscope(&gsa->sin6.sin6_addr, ifp, NULL);
 
 	/*
 	 * Take the INP write lock.
 	 * Check if this socket is a member of this group.
 	 */
 	imo = in6p_findmoptions(inp);
 	imf = im6o_match_group(imo, ifp, &gsa->sa);
 	if (imf == NULL) {
 		error = EADDRNOTAVAIL;
 		goto out_in6p_locked;
 	}
 	inm = imf->im6f_in6m;
 
 	/*
 	 * Begin state merge transaction at socket layer.
 	 */
 	INP_WLOCK_ASSERT(inp);
 
 	imf->im6f_st[1] = msfr.msfr_fmode;
 
 	/*
 	 * Apply any new source filters, if present.
 	 * Make a copy of the user-space source vector so
 	 * that we may copy them with a single copyin. This
 	 * allows us to deal with page faults up-front.
 	 */
 	if (msfr.msfr_nsrcs > 0) {
 		struct in6_msource	*lims;
 		struct sockaddr_in6	*psin;
 		struct sockaddr_storage	*kss, *pkss;
 		int			 i;
 
 		/* The inpcb lock is dropped around the M_WAITOK malloc and copyin. */
 		INP_WUNLOCK(inp);
 
 		CTR2(KTR_MLD, "%s: loading %lu source list entries",
 		    __func__, (unsigned long)msfr.msfr_nsrcs);
 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
 		    M_TEMP, M_WAITOK);
 		error = copyin(msfr.msfr_srcs, kss,
 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
 		if (error) {
 			/*
 			 * NOTE(review): this return happens with the inpcb
 			 * unlocked and after im6f_st[1] was already changed
 			 * above, with no im6f_rollback() -- confirm that is
 			 * intended.
 			 */
 			free(kss, M_TEMP);
 			return (error);
 		}
 
 		/*
 		 * NOTE(review): imf and inm were looked up before the lock
 		 * was dropped and are reused as-is after it is retaken --
 		 * confirm they cannot go away in between.
 		 */
 		INP_WLOCK(inp);
 
 		/*
 		 * Mark all source filters as UNDEFINED at t1.
 		 * Restore new group filter mode, as im6f_leave()
 		 * will set it to INCLUDE.
 		 */
 		im6f_leave(imf);
 		imf->im6f_st[1] = msfr.msfr_fmode;
 
 		/*
 		 * Update socket layer filters at t1, lazy-allocating
 		 * new entries. This saves a bunch of memory at the
 		 * cost of one RB_FIND() per source entry; duplicate
 		 * entries in the msfr_nsrcs vector are ignored.
 		 * If we encounter an error, rollback transaction.
 		 *
 		 * XXX This too could be replaced with a set-symmetric
 		 * difference like loop to avoid walking from root
 		 * every time, as the key space is common.
 		 */
 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
 			psin = (struct sockaddr_in6 *)pkss;
 			if (psin->sin6_family != AF_INET6) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 			if (psin->sin6_len != sizeof(struct sockaddr_in6)) {
 				error = EINVAL;
 				break;
 			}
 			/* Sources must be unicast addresses. */
 			if (IN6_IS_ADDR_MULTICAST(&psin->sin6_addr)) {
 				error = EINVAL;
 				break;
 			}
 			/*
 			 * TODO: Validate embedded scope ID in source
 			 * list entry against passed-in ifp, if and only
 			 * if source list filter entry is iface or node local.
 			 */
 			in6_clearscope(&psin->sin6_addr);
 			error = im6f_get_source(imf, psin, &lims);
 			if (error)
 				break;
 			/* Each listed source takes the filter's new t1 mode. */
 			lims->im6sl_st[1] = imf->im6f_st[1];
 		}
 		free(kss, M_TEMP);
 	}
 
 	/* Any per-source validation failure above aborts the transaction. */
 	if (error)
 		goto out_im6f_rollback;
 
 	INP_WLOCK_ASSERT(inp);
 	IN6_MULTI_LIST_LOCK();
 
 	/*
 	 * Begin state merge transaction at MLD layer.
 	 */
 	CTR1(KTR_MLD, "%s: merge inm state", __func__);
 	error = in6m_merge(inm, imf);
 	if (error)
 		CTR1(KTR_MLD, "%s: failed to merge inm state", __func__);
 	else {
 		CTR1(KTR_MLD, "%s: doing mld downcall", __func__);
 		error = mld_change_state(inm, 0);
 		if (error)
 			CTR1(KTR_MLD, "%s: failed mld downcall", __func__);
 	}
 
 	IN6_MULTI_LIST_UNLOCK();
 
 out_im6f_rollback:
 	/* Finish the socket-layer transaction according to the outcome. */
 	if (error)
 		im6f_rollback(imf);
 	else
 		im6f_commit(imf);
 
 	im6f_reap(imf);
 
 out_in6p_locked:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Set the IP multicast options in response to user setsockopt().
  *
  * Dispatches on sopt->sopt_name: the interface, hop-limit and loopback
  * options are handled inline, while join/leave, block/unblock and source
  * filter requests are passed to their dedicated handlers. Sockets that are
  * neither SOCK_RAW nor SOCK_DGRAM, and unknown option names, get
  * EOPNOTSUPP.
  *
  * Many of the socket options handled in this function duplicate the
  * functionality of socket options in the regular unicast API. However,
  * it is not possible to merge the duplicate code, because the idempotence
  * of the IPv6 multicast part of the BSD Sockets API must be preserved;
  * the effects of these options must be treated as separate and distinct.
  *
  * The inpcb is asserted to be unlocked on return.
  *
  * SMPng: XXX: Unlocked read of inp_socket believed OK.
  */
 int
 ip6_setmoptions(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct ip6_moptions	*mopts;
 	int			 pr_type;
 	int			 rc;
 
 	/* If socket is neither of type SOCK_RAW or SOCK_DGRAM, reject it. */
 	pr_type = inp->inp_socket->so_proto->pr_type;
 	if (pr_type != SOCK_RAW && pr_type != SOCK_DGRAM)
 		return (EOPNOTSUPP);
 
 	rc = 0;
 	switch (sopt->sopt_name) {
 	case IPV6_MULTICAST_IF:
 		rc = in6p_set_multicast_if(inp, sopt);
 		break;
 
 	case IPV6_MULTICAST_HOPS: {
 		int hops;
 
 		if (sopt->sopt_valsize != sizeof(int)) {
 			rc = EINVAL;
 			break;
 		}
 		rc = sooptcopyin(sopt, &hops, sizeof(hops), sizeof(int));
 		if (rc != 0)
 			break;
 		if (hops < -1 || hops > 255) {
 			rc = EINVAL;
 			break;
 		}
 		/* -1 selects the current default multicast hop limit. */
 		if (hops == -1)
 			hops = V_ip6_defmcasthlim;
 		mopts = in6p_findmoptions(inp);
 		mopts->im6o_multicast_hlim = hops;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_MULTICAST_LOOP: {
 		u_int loopback;
 
 		/*
 		 * Set the loopback flag for outgoing multicast packets.
 		 * Must be zero or one.
 		 */
 		if (sopt->sopt_valsize != sizeof(u_int)) {
 			rc = EINVAL;
 			break;
 		}
 		rc = sooptcopyin(sopt, &loopback, sizeof(u_int),
 		    sizeof(u_int));
 		if (rc != 0)
 			break;
 		if (loopback > 1) {
 			rc = EINVAL;
 			break;
 		}
 		mopts = in6p_findmoptions(inp);
 		mopts->im6o_multicast_loop = loopback;
 		INP_WUNLOCK(inp);
 		break;
 	}
 
 	case IPV6_JOIN_GROUP:
 	case MCAST_JOIN_GROUP:
 	case MCAST_JOIN_SOURCE_GROUP:
 		rc = in6p_join_group(inp, sopt);
 		break;
 
 	case IPV6_LEAVE_GROUP:
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 		rc = in6p_leave_group(inp, sopt);
 		break;
 
 	case MCAST_BLOCK_SOURCE:
 	case MCAST_UNBLOCK_SOURCE:
 		rc = in6p_block_unblock_source(inp, sopt);
 		break;
 
 	case IPV6_MSFILTER:
 		rc = in6p_set_source_filters(inp, sopt);
 		break;
 
 	default:
 		rc = EOPNOTSUPP;
 		break;
 	}
 
 	INP_UNLOCK_ASSERT(inp);
 
 	return (rc);
 }
 
 /*
  * Expose MLD's multicast filter mode and source list(s) to userland,
  * keyed by (ifindex, group).
  * The filter mode is written out as a uint32_t, followed by
  * 0..n of struct in6_addr.
  * For use by ifmcstat(8).
  *
  * The sysctl name must consist of exactly five ints: the interface index
  * followed by the four 32-bit words of the IPv6 group address. The node
  * is read-only (EPERM on write), the group must be multicast (EINVAL),
  * and the interface must exist (ENOENT). The interface lookup and the
  * walk over its multicast addresses both run inside a net epoch section.
  *
  * Returns 0 on success, or the first error reported while validating the
  * request or copying data out.
  */
 static int
 sysctl_ip6_mcast_filters(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_addr			 mcaddr;
 	struct in6_addr			 src;
 	struct epoch_tracker		 et;
 	struct ifnet			*ifp;
 	struct ifmultiaddr		*ifma;
 	struct in6_multi		*inm;
 	struct ip6_msource		*ims;
 	int				*name;
 	int				 retval;
 	u_int				 namelen;
 	uint32_t			 fmode, ifindex;
 #ifdef KTR
 	char				 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/* int: ifindex + 4 * 32 bits of IPv6 address */
 	if (namelen != 5)
 		return (EINVAL);
 
 	memcpy(&mcaddr, &name[1], sizeof(struct in6_addr));
 	if (!IN6_IS_ADDR_MULTICAST(&mcaddr)) {
 		CTR2(KTR_MLD, "%s: group %s is not multicast",
 		    __func__, ip6_sprintf(ip6tbuf, &mcaddr));
 		return (EINVAL);
 	}
 
 	ifindex = name[0];
 	NET_EPOCH_ENTER(et);
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		NET_EPOCH_EXIT(et);
 		CTR2(KTR_MLD, "%s: no ifp for ifindex %u",
 		    __func__, ifindex);
 		return (ENOENT);
 	}
 	/*
 	 * Internal MLD lookups require that scope/zone ID is set.
 	 */
 	(void)in6_setscope(&mcaddr, ifp, NULL);
 
 	/* Wire room for the mode word plus the maximum number of sources. */
 	retval = sysctl_wire_old_buffer(req,
 	    sizeof(uint32_t) + (in6_mcast_maxgrpsrc * sizeof(struct in6_addr)));
 	if (retval) {
 		NET_EPOCH_EXIT(et);
 		return (retval);
 	}
 
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		if (!IN6_ARE_ADDR_EQUAL(&inm->in6m_addr, &mcaddr))
 			continue;
 		fmode = inm->in6m_st[1].iss_fmode;
 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
 		if (retval != 0)
 			break;
 		RB_FOREACH(ims, ip6_msource_tree, &inm->in6m_srcs) {
 			CTR2(KTR_MLD, "%s: visit node %p", __func__, ims);
 			/*
 			 * Only copy-out sources which are in-mode.
 			 */
 			if (fmode != im6s_get_mode(inm, ims, 1)) {
 				CTR1(KTR_MLD, "%s: skip non-in-mode",
 				    __func__);
 				continue;
 			}
 			src = ims->im6s_addr;
 			retval = SYSCTL_OUT(req, &src,
 			    sizeof(struct in6_addr));
 			if (retval != 0)
 				break;
 		}
 		/*
 		 * The break above only leaves the source-tree walk. Stop
 		 * the address-list walk too, so a copy-out failure is not
 		 * followed by further output or overwritten by a later
 		 * matching entry.
 		 */
 		if (retval != 0)
 			break;
 	}
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	NET_EPOCH_EXIT(et);
 
 	return (retval);
 }
 
 #ifdef KTR
 
 /* Short printable names for the filter modes, indexed by mode value. */
 static const char *in6m_modestrs[] = { "un", "in", "ex" };
 
 /*
  * Return the short name for a multicast filter mode, or "??" when the
  * value lies outside MCAST_UNDEFINED..MCAST_EXCLUDE.
  */
 static const char *
 in6m_mode_str(const int mode)
 {
 
 	if (mode < MCAST_UNDEFINED || mode > MCAST_EXCLUDE)
 		return ("??");
 	return (in6m_modestrs[mode]);
 }
 
 /* Printable names for the MLD group states, indexed by state value. */
 static const char *in6m_statestrs[] = {
 	"not-member",
 	"silent",
 	"reporting",
 	"idle",
 	"lazy",
 	"sleeping",
 	"awakening",
 	"query-pending",
 	"sg-query-pending",
 	"leaving"
 };
 /* Fail the build if a state is added without a matching name. */
 _Static_assert(nitems(in6m_statestrs) ==
     MLD_LEAVING_MEMBER - MLD_NOT_MEMBER + 1, "Missing MLD group state");
 
 /*
  * Return the printable name for an MLD group state, or "??" when the
  * value lies outside MLD_NOT_MEMBER..MLD_LEAVING_MEMBER.
  */
 static const char *
 in6m_state_str(const int state)
 {
 
 	if (state < MLD_NOT_MEMBER || state > MLD_LEAVING_MEMBER)
 		return ("??");
 	return (in6m_statestrs[state]);
 }
 
 /*
  * Dump an in6_multi structure to the console.
  *
  * Nothing is printed unless KTR_MLD is set in ktr_mask. The output covers
  * the group address and interface, timer and state, source bookkeeping,
  * and the filter-mode counters for both in6m_st[] slots.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 	char addrstr[INET6_ADDRSTRLEN];
 	int slot;
 
 	if (!(ktr_mask & KTR_MLD))
 		return;
 
 	printf("%s: --- begin in6m %p ---\n", __func__, inm);
 	printf("addr %s ifp %p(%s) ifma %p\n",
 	    ip6_sprintf(addrstr, &inm->in6m_addr), inm->in6m_ifp,
 	    if_name(inm->in6m_ifp), inm->in6m_ifma);
 	printf("timer %u state %s refcount %u scq.len %u\n",
 	    inm->in6m_timer, in6m_state_str(inm->in6m_state),
 	    inm->in6m_refcount, mbufq_len(&inm->in6m_scq));
 	printf("mli %p nsrc %lu sctimer %u scrv %u\n",
 	    inm->in6m_mli, inm->in6m_nsrc,
 	    inm->in6m_sctimer, inm->in6m_scrv);
 
 	/* One line per state slot, t0 then t1. */
 	slot = 0;
 	while (slot < 2) {
 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", slot,
 		    in6m_mode_str(inm->in6m_st[slot].iss_fmode),
 		    inm->in6m_st[slot].iss_asm,
 		    inm->in6m_st[slot].iss_ex,
 		    inm->in6m_st[slot].iss_in,
 		    inm->in6m_st[slot].iss_rec);
 		slot++;
 	}
 	printf("%s: --- end in6m %p ---\n", __func__, inm);
 }
 
 #else /* !KTR */
 
 /*
  * Stub for builds without KTR: the dump routine is compiled out and this
  * version intentionally does nothing.
  */
 void
 in6m_print(const struct in6_multi *inm)
 {
 
 }
 
 #endif /* KTR */
diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c
index 5cd401611db9..7a1ca781d712 100644
--- a/sys/netinet6/in6_rmx.c
+++ b/sys/netinet6/in6_rmx.c
@@ -1,171 +1,172 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_rmx.c,v 1.11 2001/07/26 06:53:16 jinmei Exp $
  */
 
 /*-
  * Copyright 1994, 1995 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/mbuf.h>
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h>
 #include <netinet/in_var.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 
 /*
  * Set the prefix-type flag of a nexthop from the route's netmask:
  * NHF_HOST when there is no mask, NHF_DEFAULT when the mask address is
  * unspecified, and 0 otherwise. fibnum and addr are not used.
  * Always returns 0.
  */
 static int
 rib6_set_nh_pfxflags(u_int fibnum, const struct sockaddr *addr, const struct sockaddr *mask,
     struct nhop_object *nh)
 {
 	const struct sockaddr_in6 *m6;
 	int pxtype;
 
 	m6 = (const struct sockaddr_in6 *)mask;
 	pxtype = 0;
 	if (m6 == NULL)
 		pxtype = NHF_HOST;
 	else if (IN6_IS_ADDR_UNSPECIFIED(&m6->sin6_addr))
 		pxtype = NHF_DEFAULT;
 
 	nhop_set_pxtype_flag(nh, pxtype);
 
 	return (0);
 }
 
 /*
  * Fill in nexthop fields left unset by the caller: limit the MTU to the
  * interface link MTU and pick a nexthop type when none is set.
  * fibnum is not used. Always returns 0.
  */
 static int
 rib6_augment_nh(u_int fibnum, struct nhop_object *nh)
 {
 	/*
 	 * Check route MTU:
 	 * inherit interface MTU if not set or
 	 * clamp it if it is too large.
 	 */
 	if (nh->nh_mtu == 0 || nh->nh_mtu > IN6_LINKMTU(nh->nh_ifp))
 		nh->nh_mtu = IN6_LINKMTU(nh->nh_ifp);
 
 	/* Set nexthop type: gateway routes differ from on-link ones. */
 	if (nhop_get_type(nh) == 0) {
 		if ((nh->nh_flags & NHF_GATEWAY) != 0)
 			nhop_set_type(nh, NH_TYPE_IPV6_ETHER_NHOP);
 		else
 			nhop_set_type(nh, NH_TYPE_IPV6_ETHER_RSLV);
 	}
 
 	return (0);
 }
 
 /*
  * Initialize our routing tree.
  *
  * Creates the AF_INET6 table for the given fib, installs the IPv6
  * nexthop callbacks and subscribes nd6_subscription_cb to the table's
  * change notifications. Returns the new table head, or NULL when
  * rt_table_init() fails.
  */
 
 struct rib_head *
 in6_inithead(uint32_t fibnum)
 {
 	struct rib_subscription *rs __diagused;
 	struct rib_head *rh;
 
 	/* The key offset is passed in bits, hence the shift by 3. */
 	rh = rt_table_init(offsetof(struct sockaddr_in6, sin6_addr) << 3,
 	    AF_INET6, fibnum);
 	if (rh != NULL) {
 		rh->rnh_set_nh_pfxflags = rib6_set_nh_pfxflags;
 		rh->rnh_augment_nh = rib6_augment_nh;
 
 		rs = rib_subscribe_internal(rh, nd6_subscription_cb, NULL,
 		    RIB_NOTIFY_IMMEDIATE, true);
 		KASSERT(rs != NULL,
 		    ("Unable to subscribe to fib %u\n", fibnum));
 	}
 
 	return (rh);
 }
 
 #ifdef VIMAGE
 /*
  * Tear down an IPv6 routing table head by handing it to
  * rt_table_destroy(). Only built when VIMAGE is defined.
  */
 void
 in6_detachhead(struct rib_head *rh)
 {
 
 	rt_table_destroy(rh);
 }
 #endif
diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c
index 7188282013cc..373f80b7ac19 100644
--- a/sys/netinet6/in6_src.c
+++ b/sys/netinet6/in6_src.c
@@ -1,1118 +1,1119 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_src.c,v 1.132 2003/08/26 04:42:27 keiichi Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_pcb.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/rmlock.h>
 #include <sys/sx.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/if_llatbl.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 
 /*
  * Mutex held around policy-table lookups (lookup_addrsel_policy()) and,
  * nested inside the sx lock below, around table insertions and removals.
  */
 static struct mtx addrsel_lock;
 #define	ADDRSEL_LOCK_INIT()	mtx_init(&addrsel_lock, "addrsel_lock", NULL, MTX_DEF)
 #define	ADDRSEL_LOCK()		mtx_lock(&addrsel_lock)
 #define	ADDRSEL_UNLOCK()	mtx_unlock(&addrsel_lock)
 #define	ADDRSEL_LOCK_ASSERT()	mtx_assert(&addrsel_lock, MA_OWNED)
 
 /*
  * Sx lock for the policy table: taken shared by walk_addrsel_policy() and
  * exclusively (before the mutex) by the add/delete routines.
  */
 static struct sx addrsel_sxlock;
 #define	ADDRSEL_SXLOCK_INIT()	sx_init(&addrsel_sxlock, "addrsel_sxlock")
 #define	ADDRSEL_SLOCK()		sx_slock(&addrsel_sxlock)
 #define	ADDRSEL_SUNLOCK()	sx_sunlock(&addrsel_sxlock)
 #define	ADDRSEL_XLOCK()		sx_xlock(&addrsel_sxlock)
 #define	ADDRSEL_XUNLOCK()	sx_xunlock(&addrsel_sxlock)
 
 /*
  * Label carried by the built-in fallback policy; in6_src_ioctl() refuses
  * user-supplied entries that use it.
  */
 #define ADDR_LABEL_NOTAPP (-1)
 /* Per-vnet fallback policy returned when no table entry matches. */
 VNET_DEFINE_STATIC(struct in6_addrpolicy, defaultaddrpolicy);
 #define	V_defaultaddrpolicy		VNET(defaultaddrpolicy)
 
 /*
  * Per-vnet default for Rule 7 of in6_selectsrc(): used when the packet
  * options are absent or set to IP6PO_TEMPADDR_SYSTEM.
  */
 VNET_DEFINE(int, ip6_prefer_tempaddr) = 0;
 
 static int selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct ip6_moptions *, struct route_in6 *, struct ifnet **,
 	struct nhop_object **, int, u_int, uint32_t);
 static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct ip6_moptions *, struct ifnet **,
 	struct ifnet *, u_int);
 static int in6_selectsrc(uint32_t, struct sockaddr_in6 *,
 	struct ip6_pktopts *, struct inpcb *, struct ucred *,
 	struct ifnet **, struct in6_addr *);
 
 static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
 
 static void init_policy_queue(void);
 static int add_addrsel_policyent(struct in6_addrpolicy *);
 static int delete_addrsel_policyent(struct in6_addrpolicy *);
 static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
 	void *);
 static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
 static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
 
 /*
  * Return an IPv6 address, which is the most appropriate for a given
  * destination and user specified options.
  * If necessary, this function lookups the routing table and returns
  * an entry to the caller for later use.
  */
 /*
  * Control-flow helpers for the candidate loop in in6_selectsrc().  Each one
  * jumps to a label inside that loop ('replace', 'next' or 'out'), so they
  * are only usable there.  REPLACE and BREAK also bump the per-rule counter
  * ip6s_sources_rule[r]; NEXT does not count anything.  The blocks inside
  * comment markers are disabled debugging printfs.
  */
 #define REPLACE(r) do {\
 	IP6STAT_INC(ip6s_sources_rule[(r)]); \
 	/* { \
 	char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
 	printf("in6_selectsrc: replace %s with %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
 	} */ \
 	goto replace; \
 } while(0)
 #define NEXT(r) do {\
 	/* { \
 	char ip6buf[INET6_ADDRSTRLEN], ip6b[INET6_ADDRSTRLEN]; \
 	printf("in6_selectsrc: keep %s against %s by %d\n", ia_best ? ip6_sprintf(ip6buf, &ia_best->ia_addr.sin6_addr) : "none", ip6_sprintf(ip6b, &ia->ia_addr.sin6_addr), (r)); \
 	} */ \
 	goto next;		/* XXX: we can't use 'continue' here */ \
 } while(0)
 #define BREAK(r) do { \
 	IP6STAT_INC(ip6s_sources_rule[(r)]); \
 	goto out;		/* XXX: we can't use 'break' here */ \
 } while(0)
 
 /*
  * Pick the IPv6 source address to use towards @dstsock.
  *
  * @fibnum   FIB used for the outgoing-interface lookup.
  * @dstsock  destination; its address is copied, never modified here.
  * @opts     optional packet options; an IPV6_PKTINFO source address, if
  *           present, takes precedence over everything else.
  * @inp      optional PCB (may be NULL for non-socket callers); must be
  *           locked when supplied.
  * @cred     optional credentials used for the jail checks.
  * @ifpp     optional in/out: on entry may carry an interface hint that is
  *           forwarded to in6_selectif(); it is reset to NULL and, on the
  *           PKTINFO and rule-based paths, set to the chosen interface.
  * @srcp     receives the selected address (must not be NULL).
  *
  * Returns 0 on success, EADDRNOTAVAIL when no usable address exists, or the
  * error reported by in6_selectif(), in6_setscope() or prison_local_ip6().
  */
 static int
 in6_selectsrc(uint32_t fibnum, struct sockaddr_in6 *dstsock,
     struct ip6_pktopts *opts, struct inpcb *inp, struct ucred *cred,
     struct ifnet **ifpp, struct in6_addr *srcp)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_addr dst, tmp;
 	struct ifnet *ifp = NULL, *oifp = NULL;
 	struct in6_ifaddr *ia = NULL, *ia_best = NULL;
 	struct in6_pktinfo *pi = NULL;
 	int dst_scope = -1, best_scope = -1, best_matchlen = -1;
 	struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
 	u_int32_t odstzone;
 	int prefer_tempaddr;
 	int error;
 	struct ip6_moptions *mopts;
 
 	NET_EPOCH_ASSERT();
 	KASSERT(srcp != NULL, ("%s: srcp is NULL", __func__));
 
 	dst = dstsock->sin6_addr; /* make a copy for local operation */
 	if (ifpp) {
 		/*
 		 * Save a possibly passed in ifp for in6_selectsrc. Only
 		 * neighbor discovery code should use this feature, where
 		 * we may know the interface but not the FIB number holding
 		 * the connected subnet in case someone deleted it from the
 		 * default FIB and we need to check the interface.
 		 */
 		if (*ifpp != NULL)
 			oifp = *ifpp;
 		*ifpp = NULL;
 	}
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		mopts = inp->in6p_moptions;
 	} else {
 		mopts = NULL;
 	}
 
 	/*
 	 * If the source address is explicitly specified by the caller,
 	 * check if the requested source address is indeed a unicast address
 	 * assigned to the node, and can be used as the packet's source
 	 * address.  If everything is okay, use the address as source.
 	 */
 	if (opts && (pi = opts->ip6po_pktinfo) &&
 	    !IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
 		/* get the outgoing interface */
 		if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
 		    fibnum))
 		    != 0)
 			return (error);
 
 		/*
 		 * determine the appropriate zone id of the source based on
 		 * the zone of the destination and the outgoing interface.
 		 * If the specified address is ambiguous wrt the scope zone,
 		 * the interface must be specified; otherwise, ifa_ifwithaddr()
 		 * will fail matching the address.
 		 */
 		tmp = pi->ipi6_addr;
 		if (ifp) {
 			error = in6_setscope(&tmp, ifp, &odstzone);
 			if (error)
 				return (error);
 		}
 		/* inp is optional; treat a missing PCB as "not v6-only". */
 		if (cred != NULL && (error = prison_local_ip6(cred,
 		    &tmp, (inp != NULL &&
 		    (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 
 		/*
 		 * If IPV6_BINDANY socket option is set, we allow to specify
 		 * non local addresses as source address in IPV6_PKTINFO
 		 * ancillary data.  Without a PCB there is no such option,
 		 * so the address must be a local one.
 		 */
 		if (inp == NULL || (inp->inp_flags & INP_BINDANY) == 0) {
 			ia = in6ifa_ifwithaddr(&tmp, 0 /* XXX */, false);
 			if (ia == NULL || (ia->ia6_flags & (IN6_IFF_ANYCAST |
 			    IN6_IFF_NOTREADY)))
 				return (EADDRNOTAVAIL);
 			bcopy(&ia->ia_addr.sin6_addr, srcp, sizeof(*srcp));
 		} else
 			bcopy(&tmp, srcp, sizeof(*srcp));
 		pi->ipi6_addr = tmp; /* XXX: this overrides pi */
 		if (ifpp)
 			*ifpp = ifp;
 		return (0);
 	}
 
 	/*
 	 * Otherwise, if the socket has already bound the source, just use it.
 	 */
 	if (inp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 		if (cred != NULL &&
 		    (error = prison_local_ip6(cred, &inp->in6p_laddr,
 		    ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0))) != 0)
 			return (error);
 		bcopy(&inp->in6p_laddr, srcp, sizeof(*srcp));
 		return (0);
 	}
 
 	/*
 	 * Bypass source address selection and use the primary jail IP
 	 * if requested.
 	 */
 	if (cred != NULL && !prison_saddrsel_ip6(cred, srcp))
 		return (0);
 
 	/*
 	 * If the address is not specified, choose the best one based on
 	 * the outgoing interface and the destination address.
 	 */
 	/* get the outgoing interface */
 	if ((error = in6_selectif(dstsock, opts, mopts, &ifp, oifp,
 	    (inp != NULL) ? inp->inp_inc.inc_fibnum : fibnum)) != 0)
 		return (error);
 
 #ifdef DIAGNOSTIC
 	if (ifp == NULL)	/* this should not happen */
 		panic("in6_selectsrc: NULL ifp");
 #endif
 	error = in6_setscope(&dst, ifp, &odstzone);
 	if (error)
 		return (error);
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		int new_scope = -1, new_matchlen = -1;
 		struct in6_addrpolicy *new_policy = NULL;
 		u_int32_t srczone, osrczone, dstzone;
 		struct in6_addr src;
 		struct ifnet *ifp1 = ia->ia_ifp;
 
 		/*
 		 * We'll never take an address that breaks the scope zone
 		 * of the destination.  We also skip an address if its zone
 		 * does not contain the outgoing interface.
 		 * XXX: we should probably use sin6_scope_id here.
 		 */
 		if (in6_setscope(&dst, ifp1, &dstzone) ||
 		    odstzone != dstzone) {
 			continue;
 		}
 		src = ia->ia_addr.sin6_addr;
 		if (in6_setscope(&src, ifp, &osrczone) ||
 		    in6_setscope(&src, ifp1, &srczone) ||
 		    osrczone != srczone) {
 			continue;
 		}
 
 		/* avoid unusable addresses */
 		if ((ia->ia6_flags &
 		    (IN6_IFF_NOTREADY | IN6_IFF_ANYCAST | IN6_IFF_DETACHED))) {
 			continue;
 		}
 		if (!V_ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
 			continue;
 
 		/* If jailed only take addresses of the jail into account. */
 		if (cred != NULL &&
 		    prison_check_ip6(cred, &ia->ia_addr.sin6_addr) != 0)
 			continue;
 
 		/* Rule 1: Prefer same address */
 		if (IN6_ARE_ADDR_EQUAL(&dst, &ia->ia_addr.sin6_addr)) {
 			ia_best = ia;
 			BREAK(1); /* there should be no better candidate */
 		}
 
 		if (ia_best == NULL)
 			REPLACE(0);
 
 		/* Rule 2: Prefer appropriate scope */
 		if (dst_scope < 0)
 			dst_scope = in6_addrscope(&dst);
 		new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
 		if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
 			if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
 				REPLACE(2);
 			NEXT(2);
 		} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
 			if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
 				NEXT(2);
 			REPLACE(2);
 		}
 
 		/*
 		 * Rule 3: Avoid deprecated addresses.  Note that the case of
 		 * !ip6_use_deprecated is already rejected above.
 		 */
 		if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
 			NEXT(3);
 		if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
 			REPLACE(3);
 
 		/* Rule 4: Prefer home addresses */
 		/*
 		 * XXX: This is a TODO.  We should probably merge the MIP6
 		 * case above.
 		 */
 
 		/* Rule 5: Prefer outgoing interface */
 		if (!(ND_IFINFO(ifp)->flags & ND6_IFF_NO_PREFER_IFACE)) {
 			if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
 				NEXT(5);
 			if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
 				REPLACE(5);
 		}
 
 		/*
 		 * Rule 6: Prefer matching label
 		 * Note that best_policy should be non-NULL here.
 		 */
 		if (dst_policy == NULL)
 			dst_policy = lookup_addrsel_policy(dstsock);
 		if (dst_policy->label != ADDR_LABEL_NOTAPP) {
 			new_policy = lookup_addrsel_policy(&ia->ia_addr);
 			if (dst_policy->label == best_policy->label &&
 			    dst_policy->label != new_policy->label)
 				NEXT(6);
 			if (dst_policy->label != best_policy->label &&
 			    dst_policy->label == new_policy->label)
 				REPLACE(6);
 		}
 
 		/*
 		 * Rule 7: Prefer public addresses.
 		 * We allow users to reverse the logic by configuring
 		 * a sysctl variable, so that privacy conscious users can
 		 * always prefer temporary addresses.
 		 */
 		if (opts == NULL ||
 		    opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
 			prefer_tempaddr = V_ip6_prefer_tempaddr;
 		} else if (opts->ip6po_prefer_tempaddr ==
 		    IP6PO_TEMPADDR_NOTPREFER) {
 			prefer_tempaddr = 0;
 		} else
 			prefer_tempaddr = 1;
 		if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    (ia->ia6_flags & IN6_IFF_TEMPORARY)) {
 			if (prefer_tempaddr)
 				REPLACE(7);
 			else
 				NEXT(7);
 		}
 		if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
 		    !(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
 			if (prefer_tempaddr)
 				NEXT(7);
 			else
 				REPLACE(7);
 		}
 
 		/*
 		 * Rule 8: prefer addresses on alive interfaces.
 		 * This is a KAME specific rule.
 		 */
 		if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
 		    !(ia->ia_ifp->if_flags & IFF_UP))
 			NEXT(8);
 		if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
 		    (ia->ia_ifp->if_flags & IFF_UP))
 			REPLACE(8);
 
 		/*
 		 * Rule 9: prefer address with better virtual status.
 		 */
 		if (ifa_preferred(&ia_best->ia_ifa, &ia->ia_ifa))
 			REPLACE(9);
 		if (ifa_preferred(&ia->ia_ifa, &ia_best->ia_ifa))
 			NEXT(9);
 
 		/*
 		 * Rule 10: prefer address with `prefer_source' flag.
 		 */
 		if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0 &&
 		    (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0)
 			REPLACE(10);
 		if ((ia_best->ia6_flags & IN6_IFF_PREFER_SOURCE) != 0 &&
 		    (ia->ia6_flags & IN6_IFF_PREFER_SOURCE) == 0)
 			NEXT(10);
 
 		/*
 		 * Rule 14: Use longest matching prefix.
 		 * Note: in the address selection draft, this rule is
 		 * documented as "Rule 8".  However, since it is also
 		 * documented that this rule can be overridden, we assign
 		 * a large number so that it is easy to assign smaller numbers
 		 * to more preferred rules.
 		 */
 		new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, &dst);
 		if (best_matchlen < new_matchlen)
 			REPLACE(14);
 		if (new_matchlen < best_matchlen)
 			NEXT(14);
 
 		/* Rule 15 is reserved. */
 
 		/*
 		 * Last resort: just keep the current candidate.
 		 * Or, do we need more rules?
 		 */
 		continue;
 
 	  replace:
 		ia_best = ia;
 		best_scope = (new_scope >= 0 ? new_scope :
 			      in6_addrscope(&ia_best->ia_addr.sin6_addr));
 		best_policy = (new_policy ? new_policy :
 			       lookup_addrsel_policy(&ia_best->ia_addr));
 		best_matchlen = (new_matchlen >= 0 ? new_matchlen :
 				 in6_matchlen(&ia_best->ia_addr.sin6_addr,
 					      &dst));
 
 	  next:
 		continue;
 
 	  out:
 		break;
 	}
 
 	if ((ia = ia_best) == NULL) {
 		IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 		IP6STAT_INC(ip6s_sources_none);
 		return (EADDRNOTAVAIL);
 	}
 
 	/*
 	 * Rule 1 picks its candidate without passing through 'replace', and
 	 * dst_scope is only computed lazily by Rule 2.  Refresh both here so
 	 * the per-scope counters below are never indexed with -1 or with the
 	 * scope of a previously held candidate.
 	 */
 	best_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
 	if (dst_scope < 0)
 		dst_scope = in6_addrscope(&dst);
 
 	/*
 	 * At this point at least one of the addresses belonged to the jail
 	 * but it could still be, that we want to further restrict it, e.g.
 	 * theoretically IN6_IS_ADDR_LOOPBACK.
 	 * It must not be IN6_IS_ADDR_UNSPECIFIED anymore.
 	 * prison_local_ip6() will fix an IN6_IS_ADDR_LOOPBACK but should
 	 * let all others previously selected pass.
 	 * Use tmp to not change ::1 on lo0 to the primary jail address.
 	 */
 	tmp = ia->ia_addr.sin6_addr;
 	if (cred != NULL && prison_local_ip6(cred, &tmp, (inp != NULL &&
 	    (inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)) != 0) {
 		IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 		IP6STAT_INC(ip6s_sources_none);
 		return (EADDRNOTAVAIL);
 	}
 
 	if (ifpp)
 		*ifpp = ifp;
 
 	bcopy(&tmp, srcp, sizeof(*srcp));
 	if (ia->ia_ifp == ifp)
 		IP6STAT_INC(ip6s_sources_sameif[best_scope]);
 	else
 		IP6STAT_INC(ip6s_sources_otherif[best_scope]);
 	if (dst_scope == best_scope)
 		IP6STAT_INC(ip6s_sources_samescope[best_scope]);
 	else
 		IP6STAT_INC(ip6s_sources_otherscope[best_scope]);
 	if (IFA6_IS_DEPRECATED(ia))
 		IP6STAT_INC(ip6s_sources_deprecated[best_scope]);
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (0);
 }
 
 /*
  * Select source address based on @inp, @dstsock and @opts.
  * Stores selected address to @srcp. If @scope_ambiguous is set,
  * embed scope from selected outgoing interface. If @hlim pointer
  * is provided, stores calculated hop limit there.
  * Returns 0 on success.
  */
 int
 in6_selectsrc_socket(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct inpcb *inp, struct ucred *cred, int scope_ambiguous,
     struct in6_addr *srcp, int *hlim)
 {
 	struct ifnet *retifp;
 	uint32_t fibnum;
 	int error;
 
 	fibnum = inp->inp_inc.inc_fibnum;
 	retifp = NULL;
 
 	error = in6_selectsrc(fibnum, dstsock, opts, inp, cred, &retifp, srcp);
 	if (error != 0)
 		return (error);
 
 	if (hlim != NULL)
 		*hlim = in6_selecthlim(inp, retifp);
 
 	if (retifp == NULL || scope_ambiguous == 0)
 		return (0);
 
 	/*
 	 * Application should provide a proper zone ID or the use of
 	 * default zone IDs should be enabled.  Unfortunately, some
 	 * applications do not behave as it should, so we need a
 	 * workaround.  Even if an appropriate ID is not determined
 	 * (when it's required), if we can determine the outgoing
 	 * interface. determine the zone ID based on the interface.
 	 */
 	error = in6_setscope(&dstsock->sin6_addr, retifp, NULL);
 
 	return (error);
 }
 
 /*
  * Source address selection for callers without a socket (mostly ND code).
  *
  * Builds a sockaddr_in6 from @dst and @scopeid, embeds the scope, and runs
  * in6_selectsrc() on @fibnum with @ifp as the interface hint.  The chosen
  * address is stored in @srcp.  When @hlim is non-NULL it is filled from
  * in6_selecthlim() regardless of whether selection succeeded.
  * Returns 0 on success, otherwise the error from in6_selectsrc().
  */
 int
 in6_selectsrc_addr(uint32_t fibnum, const struct in6_addr *dst,
     uint32_t scopeid, struct ifnet *ifp, struct in6_addr *srcp,
     int *hlim)
 {
 	struct sockaddr_in6 dst_sa = {
 		.sin6_len = sizeof(dst_sa),
 		.sin6_family = AF_INET6,
 		.sin6_addr = *dst,
 		.sin6_scope_id = scopeid,
 	};
 	struct ifnet *retifp = ifp;
 	int error;
 
 	sa6_embedscope(&dst_sa, 0);
 
 	error = in6_selectsrc(fibnum, &dst_sa, NULL, NULL, NULL, &retifp, srcp);
 	if (hlim != NULL)
 		*hlim = in6_selecthlim(NULL, retifp);
 
 	return (error);
 }
 
 /*
  * Return the nexthop for @dst, reusing the one cached in @ro when it is
  * still usable and looking up (and caching) a fresh one otherwise.
  * Returns NULL when the lookup finds nothing.
  * Assumes that 'struct route_in6' is exclusively locked.
  */
 static struct nhop_object *
 cache_route(uint32_t fibnum, const struct sockaddr_in6 *dst, struct route_in6 *ro,
     uint32_t flowid)
 {
 	struct in6_addr unscoped;
 	const struct in6_addr *lookup_addr;
 	uint32_t zone;
 
 	/*
 	 * Drop a cached nexthop that is no longer valid or that belongs to a
 	 * different destination.  The address family is checked as well, in
 	 * case the cache is shared with IPv4.
 	 */
 	if (ro->ro_nh != NULL) {
 		if (!NH_IS_VALID(ro->ro_nh) ||
 		    ro->ro_dst.sin6_family != AF_INET6 ||
 		    !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr,
 		    &dst->sin6_addr)) {
 			RO_NHFREE(ro);
 		}
 	}
 
 	if (ro->ro_nh != NULL)
 		return (ro->ro_nh);
 
 	ro->ro_dst = *dst;
 
 	/* Link-local keys are looked up as plain address plus zone id. */
 	lookup_addr = &dst->sin6_addr;
 	zone = 0;
 	if (IN6_IS_SCOPE_LINKLOCAL(&dst->sin6_addr)) {
 		in6_splitscope(&dst->sin6_addr, &unscoped, &zone);
 		lookup_addr = &unscoped;
 	}
 	ro->ro_nh = fib6_lookup(fibnum, lookup_addr, zone, NHR_REF, flowid);
 
 	return (ro->ro_nh);
 }
 
 /*
  * Resolve the nexthop to use for @dst.
  *
  * When @opts names an explicit next hop, that address is resolved through
  * the option's own route cache and @dst/@ro are ignored.  Otherwise @dst is
  * resolved through @ro, if one was supplied.  Returns NULL when nothing
  * usable is found.
  */
 static struct nhop_object *
 lookup_route(uint32_t fibnum, struct sockaddr_in6 *dst, struct route_in6 *ro,
     struct ip6_pktopts *opts, uint32_t flowid)
 {
 	struct in6_pktinfo *pi;
 	struct nhop_object *nh;
 
 	if (opts != NULL && opts->ip6po_nexthop != NULL) {
 		nh = cache_route(fibnum, satosin6(opts->ip6po_nexthop),
 		    &opts->ip6po_nextroute, flowid);
 
 		/*
 		 * The caller-specified next hop must be a neighbor of the
 		 * sending host, so a route through a gateway is rejected.
 		 */
 		if (nh != NULL && (nh->nh_flags & NHF_GATEWAY) != 0)
 			return (NULL);
 		return (nh);
 	}
 
 	if (ro == NULL)
 		return (NULL);
 
 	nh = cache_route(fibnum, dst, ro, flowid);
 	if (nh == NULL)
 		return (NULL);
 
 	/*
 	 * Reject the route if its interface conflicts with the one the
 	 * caller requested through ipi6_ifindex (when specified).
 	 */
 	if (opts != NULL && (pi = opts->ip6po_pktinfo) != NULL &&
 	    pi->ipi6_ifindex != 0 &&
 	    nh->nh_aifp->if_index != pi->ipi6_ifindex)
 		return (NULL);
 
 	return (nh);
 }
 
 /*
  * Finds the outgoing nexthop or the outgoing interface for @dstsock.
  *
  * Two exits exist.  The 'done' exit is taken when the interface is known
  * without consulting the routing table; it stores the interface in @retifp
  * and NULL in @retnh.  The 'getroute' exit performs a route lookup via
  * lookup_route() and stores the nexthop in @retnh and its nh_aifp in
  * @retifp.
  *
  * @norouteok  non-zero lets an explicitly requested interface
  *             (ipi6_ifindex) be used even for non-multicast destinations.
  *
  * Returns 0 on success, or EHOSTUNREACH (also counted in ip6s_noroute) when
  * neither an interface nor a route could be determined.
  */
 static int
 selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct route_in6 *ro,
     struct ifnet **retifp, struct nhop_object **retnh, int norouteok,
     u_int fibnum, uint32_t flowid)
 {
 	int error = 0;
 	struct ifnet *ifp = NULL;
 	struct in6_pktinfo *pi = NULL;
 	struct in6_addr *dst = &dstsock->sin6_addr;
 
 	/* If the caller specifies the outgoing interface explicitly, use it. */
 	if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
 		/* XXX boundary check is assumed to be already done. */
 		ifp = ifnet_byindex(pi->ipi6_ifindex);
 		if (ifp != NULL && (norouteok || IN6_IS_ADDR_MULTICAST(dst))) {
 			/*
 			 * we do not have to check or get the route for
 			 * multicast.
 			 */
 			goto done;
 		} else
 			goto getroute;
 	}
 	/*
 	 * If the destination address is a multicast address and the outgoing
 	 * interface for the address is specified by the caller, use it.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(dst) &&
 	    mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
 		goto done; /* we do not need a route for multicast. */
 	}
 	/*
 	 * If destination address is LLA or link- or node-local multicast,
 	 * use its embedded scope zone id to determine outgoing interface.
 	 * NOTE(review): only the two multicast cases are tested below; a
 	 * unicast link-local destination falls through to the route lookup.
 	 */
 	if (IN6_IS_ADDR_MC_LINKLOCAL(dst) ||
 	    IN6_IS_ADDR_MC_NODELOCAL(dst)) {
 		uint32_t zoneid = ntohs(in6_getscope(dst));
 		if (zoneid > 0) {
 			/* ifp may be NULL here; 'done' turns that into an error. */
 			ifp = in6_getlinkifnet(zoneid);
 			goto done;
 		}
 	}
 
   getroute:;
 	/* The empty statement after the label allows the declaration below. */
 	struct nhop_object *nh = lookup_route(fibnum, dstsock, ro, opts, flowid);
 	if (nh != NULL) {
 		*retifp = nh->nh_aifp;
 		error = 0;
 	} else {
 		*retifp = NULL;
 		IP6STAT_INC(ip6s_noroute);
 		error = EHOSTUNREACH;
 	}
 	*retnh = nh;
 	return (error);
 
   done:
 	if (ifp == NULL) {
 		/*
 		 * This can happen if the caller did not pass a cached route
 		 * nor any other hints.  We treat this case an error.
 		 */
 		error = EHOSTUNREACH;
 	}
 	if (error == EHOSTUNREACH)
 		IP6STAT_INC(ip6s_noroute);
 
 	*retifp = ifp;
 	*retnh = NULL;
 
 	return (error);
 }
 
 /*
  * Determine the outgoing interface for @dstsock and store it in @retifp.
  *
  * Runs selectroute() against a throw-away route cache.  If that fails and
  * the caller supplied @oifp while @fibnum is the default FIB, @oifp is used
  * instead.  Returns 0 on success, the selectroute() error otherwise, or
  * EHOSTUNREACH/ENETUNREACH when the route found is a reject or blackhole
  * route.
  */
 static int
 in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct ifnet **retifp,
     struct ifnet *oifp, u_int fibnum)
 {
 	struct route_in6 sro;
 	struct nhop_object *nh = NULL;
 	uint16_t nh_flags = 0;
 	int error;
 
 	KASSERT(retifp != NULL, ("%s: retifp is NULL", __func__));
 
 	bzero(&sro, sizeof(sro));
 	error = selectroute(dstsock, opts, mopts, &sro, retifp, &nh, 1, fibnum, 0);
 
 	/* Remember the flags, then release the nexthop held by the local cache. */
 	if (nh != NULL) {
 		nh_flags = nh->nh_flags;
 		if (nh == sro.ro_nh)
 			NH_FREE(nh);
 	}
 
 	if (error != 0) {
 		/* Help ND. See oifp comment in in6_selectsrc(). */
 		if (oifp == NULL || fibnum != RT_DEFAULT_FIB)
 			return (error);
 		*retifp = oifp;
 		return (0);
 	}
 
 	/*
 	 * Do not use a rejected or black hole route.
 	 * XXX: this check should be done in the L2 output routine.
 	 * However, if we skipped this check here, we'd see the following
 	 * scenario:
 	 * - install a rejected route for a scoped address prefix
 	 *   (like fe80::/10)
 	 * - send a packet to a destination that matches the scoped prefix,
 	 *   with ambiguity about the scope zone.
 	 * - pick the outgoing interface from the route, and disambiguate the
 	 *   scope zone with the interface.
 	 * - ip6_output() would try to get another route with the "new"
 	 *   destination, which may be valid.
 	 * - we'd see no error on output.
 	 * Although this may not be very harmful, it would still be confusing.
 	 * We thus reject the case here.
 	 */
 	if ((nh_flags & (NHF_REJECT | NHF_BLACKHOLE)) == 0)
 		return (0);
 
 	return ((nh_flags & NHF_HOST) != 0 ? EHOSTUNREACH : ENETUNREACH);
 }
 
 /*
  * Public wrapper function to selectroute().
  * Requires non-NULL @retifp and @retnh and always passes norouteok = 0;
  * all other arguments and the return value are those of selectroute().
  */
 int
 in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
     struct ip6_moptions *mopts, struct route_in6 *ro,
     struct ifnet **retifp, struct nhop_object **retnh, u_int fibnum, uint32_t flowid)
 {
 	MPASS(retifp != NULL);
 	MPASS(retnh != NULL);
 
 	return (selectroute(dstsock, opts, mopts, ro, retifp,
 	    retnh, 0, fibnum, flowid));
 }
 
 /*
  * Default hop limit selection.  Either argument may be NULL.
  * The first rule that applies wins:
  * 1. The hop limit stored in the PCB (in6p_hops), when non-negative.
  * 2. The current hop limit (chlim) of @ifp, when an interface is given.
  * 3. The chlim of the interface found by a route lookup towards the PCB's
  *    foreign address, when that address is set and a route exists.
  * 4. The system default hop limit.
  */
 int
 in6_selecthlim(struct inpcb *inp, struct ifnet *ifp)
 {
 	struct nhop_object *nh;
 	struct in6_addr dst;
 	uint32_t scopeid;
 
 	if (inp != NULL && inp->in6p_hops >= 0)
 		return (inp->in6p_hops);
 
 	if (ifp != NULL)
 		return (ND_IFINFO(ifp)->chlim);
 
 	if (inp == NULL || IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
 		return (V_ip6_defhlim);
 
 	in6_splitscope(&inp->in6p_faddr, &dst, &scopeid);
 	nh = fib6_lookup(inp->inp_inc.inc_fibnum, &dst, scopeid, 0, 0);
 	if (nh == NULL)
 		return (V_ip6_defhlim);
 
 	return (ND_IFINFO(nh->nh_ifp)->chlim);
 }
 
 /*
  * Set up address-selection policy state for the current vnet: an empty
  * policy table and the fallback policy labelled ADDR_LABEL_NOTAPP.
  */
 void
 addrsel_policy_init(void)
 {
 
 	init_policy_queue();
 
 	/* initialize the "last resort" policy */
 	bzero(&V_defaultaddrpolicy, sizeof(V_defaultaddrpolicy));
 	V_defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
 
 	/* The locks are not per-vnet, so only the default vnet creates them. */
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ADDRSEL_LOCK_INIT();
 		ADDRSEL_SXLOCK_INIT();
 	}
 }
 
 /*
  * Find the policy that applies to @key.  A matching table entry has its
  * use counter incremented; when nothing matches the per-vnet fallback
  * policy is returned instead.  Never returns NULL.
  */
 static struct in6_addrpolicy *
 lookup_addrsel_policy(struct sockaddr_in6 *key)
 {
 	struct in6_addrpolicy *pol;
 
 	ADDRSEL_LOCK();
 	pol = match_addrsel_policy(key);
 	if (pol != NULL)
 		pol->use++;
 	else
 		pol = &V_defaultaddrpolicy;
 	ADDRSEL_UNLOCK();
 
 	return (pol);
 }
 
 /*
  * Subroutines to manage the address selection policy table via sysctl.
  */
 /* Context handed to dump_addrsel_policyent() while walking the table. */
 struct walkarg {
 	struct sysctl_req *w_req;	/* sysctl request receiving the dump */
 };
 
 static int in6_src_sysctl(SYSCTL_HANDLER_ARGS);
 SYSCTL_DECL(_net_inet6_ip6);
 static SYSCTL_NODE(_net_inet6_ip6, IPV6CTL_ADDRCTLPOLICY, addrctlpolicy,
     CTLFLAG_RD | CTLFLAG_MPSAFE, in6_src_sysctl,
     "");
 
 static int
 in6_src_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct walkarg w;
 
 	if (req->newptr)
 		return EPERM;
 
 	bzero(&w, sizeof(w));
 	w.w_req = req;
 
 	return (walk_addrsel_policy(dump_addrsel_policyent, &w));
 }
 
 /*
  * Ioctl entry point for adding (SIOCAADDRCTL_POLICY) or deleting
  * (SIOCDADDRCTL_POLICY) an address selection policy entry.
  * @data points to a struct in6_addrpolicy, which is copied before use.
  * Returns EOPNOTSUPP for any other command, EINVAL for a reserved label or
  * a non-contiguous mask, otherwise the result of the add/delete routine.
  */
 int
 in6_src_ioctl(u_long cmd, caddr_t data)
 {
 	struct in6_addrpolicy ent0;
 
 	switch (cmd) {
 	case SIOCAADDRCTL_POLICY:
 	case SIOCDADDRCTL_POLICY:
 		break;
 	default:
 		return (EOPNOTSUPP); /* check for safety */
 	}
 
 	ent0 = *(struct in6_addrpolicy *)data;
 
 	if (ent0.label == ADDR_LABEL_NOTAPP)
 		return (EINVAL);
 	/* The prefix mask must be a contiguous run of leading one bits. */
 	if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
 		return (EINVAL);
 	/* Clear any address bits that lie outside the mask. */
 	IN6_MASK_ADDR(&ent0.addr.sin6_addr, &ent0.addrmask.sin6_addr);
 	ent0.use = 0;
 
 	if (cmd == SIOCAADDRCTL_POLICY)
 		return (add_addrsel_policyent(&ent0));
 
 	return (delete_addrsel_policyent(&ent0));
 }
 
 /*
  * What follows is an implementation of the policy table using a
  * simple tail queue.
  * XXX such details should be hidden.
  * XXX an implementation using a binary tree should be more efficient.
  */
 /* One entry of the address selection policy table. */
 struct addrsel_policyent {
 	TAILQ_ENTRY(addrsel_policyent) ape_entry;	/* table linkage */
 	struct in6_addrpolicy ape_policy;		/* the policy itself */
 };
 
 TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
 
 /* Per-vnet policy table; entries are appended by add_addrsel_policyent(). */
 VNET_DEFINE_STATIC(struct addrsel_policyhead, addrsel_policytab);
 #define	V_addrsel_policytab		VNET(addrsel_policytab)
 
 /* Initialize the current vnet's policy table to an empty tail queue. */
 static void
 init_policy_queue(void)
 {
 
 	TAILQ_INIT(&V_addrsel_policytab);
 }
 
 static int
 add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
 {
 	struct addrsel_policyent *new, *pol;
 
 	new = malloc(sizeof(*new), M_IFADDR,
 	       M_WAITOK);
 	ADDRSEL_XLOCK();
 	ADDRSEL_LOCK();
 
 	/* duplication check */
 	TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
 		if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
 				       &pol->ape_policy.addr.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
 				       &pol->ape_policy.addrmask.sin6_addr)) {
 			ADDRSEL_UNLOCK();
 			ADDRSEL_XUNLOCK();
 			free(new, M_IFADDR);
 			return (EEXIST);	/* or override it? */
 		}
 	}
 
 	bzero(new, sizeof(*new));
 
 	/* XXX: should validate entry */
 	new->ape_policy = *newpolicy;
 
 	TAILQ_INSERT_TAIL(&V_addrsel_policytab, new, ape_entry);
 	ADDRSEL_UNLOCK();
 	ADDRSEL_XUNLOCK();
 
 	return (0);
 }
 
 static int
 delete_addrsel_policyent(struct in6_addrpolicy *key)
 {
 	struct addrsel_policyent *pol;
 
 	ADDRSEL_XLOCK();
 	ADDRSEL_LOCK();
 
 	/* search for the entry in the table */
 	TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
 		if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
 		    &pol->ape_policy.addr.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
 		    &pol->ape_policy.addrmask.sin6_addr)) {
 			break;
 		}
 	}
 	if (pol == NULL) {
 		ADDRSEL_UNLOCK();
 		ADDRSEL_XUNLOCK();
 		return (ESRCH);
 	}
 
 	TAILQ_REMOVE(&V_addrsel_policytab, pol, ape_entry);
 	ADDRSEL_UNLOCK();
 	ADDRSEL_XUNLOCK();
 	free(pol, M_IFADDR);
 
 	return (0);
 }
 
 /*
  * Call @callback with each policy in the table and the opaque argument @w,
  * holding the shared sx lock for the duration of the walk.  The walk stops
  * at the first non-zero return, which is passed back to the caller;
  * returns 0 when every callback succeeded or the table is empty.
  */
 static int
 walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
 {
 	struct addrsel_policyent *pol;
 	int error = 0;
 
 	ADDRSEL_SLOCK();
 	TAILQ_FOREACH(pol, &V_addrsel_policytab, ape_entry) {
 		error = (*callback)(&pol->ape_policy, w);
 		if (error != 0)
 			break;
 	}
 	ADDRSEL_SUNLOCK();
 
 	return (error);
 }
 
 /*
  * walk_addrsel_policy() callback: copy one policy out through the sysctl
  * request carried in @arg (a struct walkarg).  Returns the SYSCTL_OUT()
  * result.
  */
 static int
 dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
 {
 	struct walkarg *w = arg;
 
 	return (SYSCTL_OUT(w->w_req, pol, sizeof(*pol)));
 }
 
 /*
  * Return the table entry whose prefix matches @key with the longest mask,
  * or NULL when no entry matches.  Among entries of equal length the one
  * found first wins.
  */
 static struct in6_addrpolicy *
 match_addrsel_policy(struct sockaddr_in6 *key)
 {
 	struct addrsel_policyent *pent;
 	struct in6_addrpolicy *best = NULL;
 	int bestlen = -1;
 
 	TAILQ_FOREACH(pent, &V_addrsel_policytab, ape_entry) {
 		struct in6_addrpolicy *cand = &pent->ape_policy;
 		u_char *maskb = (u_char *)&cand->addrmask.sin6_addr;
 		u_char *prefb = (u_char *)&cand->addr.sin6_addr;
 		u_char *keyb = (u_char *)&key->sin6_addr;
 		int len = 0, mismatch = 0;
 		int i;
 
 		/* Compare byte by byte until the mask runs out. */
 		for (i = 0; i < 16 && maskb[i] != 0; i++) {	/* XXX: scope field? */
 			u_char m = maskb[i];
 
 			if ((keyb[i] & m) != prefb[i]) {
 				mismatch = 1;
 				break;
 			}
 			if (m == 0xff) {
 				/* short cut for a typical case */
 				len += 8;
 			} else {
 				/* Count the leading one bits of a partial byte. */
 				for (; m >= 0x80; m <<= 1)
 					len++;
 			}
 		}
 		if (mismatch)
 			continue;
 
 		/* matched.  check if this is better than the current best. */
 		if (best == NULL || len > bestlen) {
 			best = cand;
 			bestlen = len;
 		}
 	}
 
 	return (best);
 }
diff --git a/sys/netinet6/ip6_fastfwd.c b/sys/netinet6/ip6_fastfwd.c
index e1c30629643e..7ecd08a4e450 100644
--- a/sys/netinet6/ip6_fastfwd.c
+++ b/sys/netinet6/ip6_fastfwd.c
@@ -1,304 +1,305 @@
 /*-
  * Copyright (c) 2014-2016 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 #include "opt_ipstealth.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 
 /*
  * Look up the next hop for dst in the FIB selected by the mbuf.
  *
  * On success stores the next hop in *pnh and returns 0.  On failure
  * returns EHOSTUNREACH, leaves *pnh untouched, and disposes of m: it is
  * handed to icmp6_error() when there is no route or the route is a
  * reject route, and freed when the route is a blackhole.
  */
 static int
 ip6_findroute(struct nhop_object **pnh, const struct sockaddr_in6 *dst,
     struct mbuf *m)
 {
 	struct nhop_object *nhop;
 
 	nhop = fib6_lookup(M_GETFIB(m), &dst->sin6_addr,
 	    dst->sin6_scope_id, NHR_NONE, m->m_pkthdr.flowid);
 	if (nhop == NULL) {
 		IP6STAT_INC(ip6s_noroute);
 		IP6STAT_INC(ip6s_cantforward);
 		icmp6_error(m, ICMP6_DST_UNREACH,
 		    ICMP6_DST_UNREACH_NOROUTE, 0);
 		return (EHOSTUNREACH);
 	}
 
 	if ((nhop->nh_flags & (NHF_BLACKHOLE | NHF_REJECT)) != 0) {
 		IP6STAT_INC(ip6s_cantforward);
 		/* Blackhole takes precedence over reject: drop silently. */
 		if ((nhop->nh_flags & NHF_BLACKHOLE) != 0)
 			m_freem(m);
 		else
 			icmp6_error(m, ICMP6_DST_UNREACH,
 			    ICMP6_DST_UNREACH_REJECT, 0);
 		return (EHOSTUNREACH);
 	}
 
 	*pnh = nhop;
 	return (0);
 }
 
 /*
  * Fast-path forwarding of an IPv6 packet.
  *
  * Returns the mbuf when the packet has to be processed by the caller
  * instead (one of the fallback conditions below holds, or a packet filter
  * set M_FASTFWD_OURS).  Returns NULL when the packet was consumed here:
  * passed to the outgoing interface's if_output, handed to icmp6_error(),
  * or dropped.
  */
 struct mbuf*
 ip6_tryforward(struct mbuf *m)
 {
 	struct sockaddr_in6 dst;
 	struct nhop_object *nh;
 	struct m_tag *fwd_tag;
 	struct ip6_hdr *ip6;
 	struct ifnet *rcvif;
 	uint32_t plen;
 	int error;
 
 	/*
 	 * Fallback conditions to ip6_input for slow path processing.
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if ((m->m_flags & (M_BCAST | M_MCAST)) != 0 ||
 	    ip6->ip6_nxt == IPPROTO_HOPOPTS ||
 	    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 	    IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_dst) ||
 	    IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src) ||
 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) ||
 	    in6_localip(&ip6->ip6_dst))
 		return (m);
 	/*
 	 * Check that the amount of data in the buffers
 	 * is as at least much as the IPv6 header would have us expect.
 	 * Trim mbufs if longer than we expect.
 	 * Drop packet if shorter than we expect.
 	 */
 	rcvif = m->m_pkthdr.rcvif;
 	plen = ntohs(ip6->ip6_plen);
 	if (plen == 0) {
 		/*
 		 * Jumbograms must have hop-by-hop header and go via
 		 * slow path.
 		 */
 		IP6STAT_INC(ip6s_badoptions);
 		goto dropin;
 	}
 	if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
 		IP6STAT_INC(ip6s_tooshort);
 		in6_ifstat_inc(rcvif, ifs6_in_truncated);
 		goto dropin;
 	}
 	if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
 		/* Single-mbuf packet: just shorten the lengths in place. */
 		if (m->m_len == m->m_pkthdr.len) {
 			m->m_len = sizeof(struct ip6_hdr) + plen;
 			m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
 		} else
 			m_adj(m, sizeof(struct ip6_hdr) + plen -
 			    m->m_pkthdr.len);
 	}
 
 	/*
 	 * Hop limit.
 	 */
 #ifdef IPSTEALTH
 	if (!V_ip6stealth)
 #endif
 	if (ip6->ip6_hlim <= IPV6_HLIMDEC) {
 		icmp6_error(m, ICMP6_TIME_EXCEEDED,
 		    ICMP6_TIME_EXCEED_TRANSIT, 0);
 		/* The mbuf went to icmp6_error(); keep drop: from freeing it. */
 		m = NULL;
 		goto dropin;
 	}
 
 	bzero(&dst, sizeof(dst));
 	dst.sin6_family = AF_INET6;
 	dst.sin6_len = sizeof(dst);
 	dst.sin6_addr = ip6->ip6_dst;
 
 	/*
 	 * Incoming packet firewall processing.
 	 */
 	if (!PFIL_HOOKED_IN(V_inet6_pfil_head))
 		goto passin;
 	if (pfil_mbuf_in(V_inet6_pfil_head, &m, rcvif, NULL) !=
 	    PFIL_PASS)
 		goto dropin;
 	/*
 	 * If packet filter sets the M_FASTFWD_OURS flag, this means
 	 * that new destination or next hop is our local address.
 	 * So, we can just go back to ip6_input.
 	 * XXX: should we decrement ip6_hlim in such case?
 	 *
 	 * Also it can forward packet to another destination, e.g.
 	 * M_IP6_NEXTHOP flag is set and fwd_tag is attached to mbuf.
 	 */
 	if (m->m_flags & M_FASTFWD_OURS)
 		return (m);
 
 	/* Reload the header pointer: the filter was given &m. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		/*
 		 * Now we will find route to forwarded by pfil destination.
 		 */
 		bcopy((fwd_tag + 1), &dst, sizeof(dst));
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 	} else {
 		/* Update dst since pfil could change it */
 		dst.sin6_addr = ip6->ip6_dst;
 	}
 passin:
 	/*
 	 * Find route to destination.
 	 */
 	if (ip6_findroute(&nh, &dst, m) != 0) {
 		/* ip6_findroute() disposed of the mbuf on failure. */
 		m = NULL;
 		in6_ifstat_inc(rcvif, ifs6_in_noroute);
 		goto dropin;
 	}
 	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head)) {
 		if (m->m_pkthdr.len > nh->nh_mtu) {
 			in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig);
 			icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu);
 			m = NULL;
 			goto dropout;
 		}
 		goto passout;
 	}
 
 	/*
 	 * Outgoing packet firewall processing.
 	 */
 	if (pfil_mbuf_out(V_inet6_pfil_head, &m, nh->nh_ifp,
 	    NULL) != PFIL_PASS)
 		goto dropout;
 
 	/*
 	 * We used slow path processing for packets with scoped addresses.
 	 * So, scope checks aren't needed here.
 	 */
 	if (m->m_pkthdr.len > nh->nh_mtu) {
 		in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig);
 		icmp6_error(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu);
 		m = NULL;
 		goto dropout;
 	}
 
 	/*
 	 * If packet filter sets the M_FASTFWD_OURS flag, this means
 	 * that new destination or next hop is our local address.
 	 * So, we can just go back to ip6_input.
 	 *
 	 * Also it can forward packet to another destination, e.g.
 	 * M_IP6_NEXTHOP flag is set and fwd_tag is attached to mbuf.
 	 */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		/*
 		 * XXX: we did one hop and should decrement hop limit. But
 		 * now we are the destination and just don't pay attention.
 		 */
 		return (m);
 	}
 	/*
 	 * Again. A packet filter could change the destination address.
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (m->m_flags & M_IP6_NEXTHOP)
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 	else
 		fwd_tag = NULL;
 
 	if (fwd_tag != NULL ||
 	    !IN6_ARE_ADDR_EQUAL(&dst.sin6_addr, &ip6->ip6_dst)) {
 		if (fwd_tag != NULL) {
 			bcopy((fwd_tag + 1), &dst, sizeof(dst));
 			m->m_flags &= ~M_IP6_NEXTHOP;
 			m_tag_delete(m, fwd_tag);
 		} else
 			dst.sin6_addr = ip6->ip6_dst;
 		/*
 		 * Redo route lookup with new destination address
 		 */
 		if (ip6_findroute(&nh, &dst, m) != 0) {
 			/*
 			 * nh still holds the earlier lookup result here
 			 * (ip6_findroute() stores only on success), so
 			 * dropout: can use it.
 			 */
 			m = NULL;
 			goto dropout;
 		}
 	}
 passout:
 #ifdef IPSTEALTH
 	if (!V_ip6stealth)
 #endif
 	{
 		ip6->ip6_hlim -= IPV6_HLIMDEC;
 	}
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	IP_PROBE(send, NULL, NULL, ip6, nh->nh_ifp, NULL, ip6);
 
 	/* For gateway routes, if_output is given the gateway address. */
 	if (nh->nh_flags & NHF_GATEWAY)
 		dst.sin6_addr = nh->gw6_sa.sin6_addr;
 	error = (*nh->nh_ifp->if_output)(nh->nh_ifp, m,
 	    (struct sockaddr *)&dst, NULL);
 	if (error != 0) {
 		in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard);
 		IP6STAT_INC(ip6s_cantforward);
 	} else {
 		in6_ifstat_inc(nh->nh_ifp, ifs6_out_forward);
 		IP6STAT_INC(ip6s_forward);
 	}
 	return (NULL);
 	/* Drop accounted against the receiving interface. */
 dropin:
 	in6_ifstat_inc(rcvif, ifs6_in_discard);
 	goto drop;
 	/* Drop accounted against the outgoing interface. */
 dropout:
 	in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard);
 drop:
 	if (m != NULL)
 		m_freem(m);
 	return (NULL);
 }
diff --git a/sys/netinet6/ip6_forward.c b/sys/netinet6/ip6_forward.c
index 39c93ac35427..a95e58ba09a1 100644
--- a/sys/netinet6/ip6_forward.c
+++ b/sys/netinet6/ip6_forward.c
@@ -1,449 +1,450 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_forward.c,v 1.69 2001/05/17 03:48:30 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_ipstealth.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/errno.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/pfil.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 
 #include <netinet/in_pcb.h>
 
 #include <netipsec/ipsec_support.h>
 
 /*
  * Forward a packet.  If some error occurs return the sender
  * an icmp packet.  Note we can't always generate a meaningful
  * icmp message because icmp doesn't have a large enough repertoire
  * of codes and types.
  *
  * If not forwarding, just drop the packet.  This could be confusing
  * if ipforwarding was zero but some routing protocol was advertising
  * us as a gateway to somewhere.  However, we must let the routing
  * protocol deal with that.
  *
  * m is the packet to forward; it is always consumed.  srcrt is non-zero
  * when the packet was source routed, which suppresses redirects.
  *
  * A truncated copy of the packet (mcopy) is saved for building ICMPv6
  * errors and redirects.  Every exit path taken after the copy is made
  * must either hand mcopy to icmp6_error()/icmp6_redirect_output() or
  * free it.
  */
 void
 ip6_forward(struct mbuf *m, int srcrt)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct sockaddr_in6 dst;
 	struct nhop_object *nh = NULL;
 	int error, type = 0, code = 0;
 	struct mbuf *mcopy = NULL;
 	struct ifnet *origifp;	/* maybe unnecessary */
 	u_int32_t inzone, outzone;
 	struct in6_addr odst;
 	struct m_tag *fwd_tag;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
 	/*
 	 * Do not forward packets to multicast destination (should be handled
 	 * by ip6_mforward()).
 	 * Do not forward packets with unspecified source.  It was discussed
 	 * in July 2000, on the ipngwg mailing list.
 	 */
 	if ((m->m_flags & (M_BCAST|M_MCAST)) != 0 ||
 	    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 		IP6STAT_INC(ip6s_cantforward);
 		/* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */
 		if (V_ip6_log_time + V_ip6_log_interval < time_uptime) {
 			V_ip6_log_time = time_uptime;
 			log(LOG_DEBUG,
 			    "cannot forward "
 			    "from %s to %s nxt %d received on %s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ip6->ip6_nxt,
 			    if_name(m->m_pkthdr.rcvif));
 		}
 		m_freem(m);
 		return;
 	}
 
 	if (
 #ifdef IPSTEALTH
 	    V_ip6stealth == 0 &&
 #endif
 	    ip6->ip6_hlim <= IPV6_HLIMDEC) {
 		/* XXX in6_ifstat_inc(rt->rt_ifp, ifs6_in_discard) */
 		icmp6_error(m, ICMP6_TIME_EXCEEDED,
 		    ICMP6_TIME_EXCEED_TRANSIT, 0);
 		return;
 	}
 
 	/*
 	 * Save at most ICMPV6_PLD_MAXLEN (= the min IPv6 MTU -
 	 * size of IPv6 + ICMPv6 headers) bytes of the packet in case
 	 * we need to generate an ICMP6 message to the src.
 	 * Thanks to M_EXT, in most cases copy will not occur.
 	 *
 	 * It is important to save it before IPsec processing as IPsec
 	 * processing may modify the mbuf.
 	 */
 	mcopy = m_copym(m, 0, imin(m->m_pkthdr.len, ICMPV6_PLD_MAXLEN),
 	    M_NOWAIT);
 #ifdef IPSTEALTH
 	if (V_ip6stealth == 0)
 #endif
 		ip6->ip6_hlim -= IPV6_HLIMDEC;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv6)) {
 		if ((error = IPSEC_FORWARD(ipv6, m)) != 0) {
 			/* mbuf consumed by IPsec */
 			m_freem(mcopy);
 			if (error != EINPROGRESS)
 				IP6STAT_INC(ip6s_cantforward);
 			return;
 		}
 		/* No IPsec processing required */
 	}
 #endif
 	/*
 	 * ip6_forward() operates with IPv6 addresses with deembedded scope.
 	 *
 	 * There are 3 sources of IPv6 destination address:
 	 *
 	 * 1) ip6_input(), where ip6_dst contains deembedded address.
 	 *   In order to deal with forwarding of link-local packets,
 	 *   calculate the scope based on input interface (RFC 4007, clause 9).
 	 * 2) packet filters changing ip6_dst directly. It would embed scope
 	 *   for LL addresses, so in6_localip() performs properly.
 	 * 3) packet filters attaching PACKET_TAG_IPFORWARD would embed
 	 *   scope for the nexthop.
 	 */
 	bzero(&dst, sizeof(struct sockaddr_in6));
 	dst.sin6_family = AF_INET6;
 	dst.sin6_addr = ip6->ip6_dst;
 	dst.sin6_scope_id = in6_get_unicast_scopeid(&ip6->ip6_dst, m->m_pkthdr.rcvif);
 again:
 	nh = fib6_lookup(M_GETFIB(m), &dst.sin6_addr, dst.sin6_scope_id,
 	    NHR_REF, m->m_pkthdr.flowid);
 	if (nh == NULL) {
 		IP6STAT_INC(ip6s_noroute);
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
 		if (mcopy) {
 			icmp6_error(mcopy, ICMP6_DST_UNREACH,
 			    ICMP6_DST_UNREACH_NOROUTE, 0);
 		}
 		goto bad;
 	}
 
 	if (nh->nh_flags & (NHF_BLACKHOLE | NHF_REJECT)) {
 		IP6STAT_INC(ip6s_cantforward);
 		if ((nh->nh_flags & NHF_REJECT) && (mcopy != NULL)) {
 			icmp6_error(mcopy, ICMP6_DST_UNREACH,
 			    ICMP6_DST_UNREACH_REJECT, 0);
 		} else {
 			/* No error is sent (blackhole): release the copy. */
 			m_freem(mcopy);
 		}
 		goto bad;
 	}
 
 	/*
 	 * Source scope check: if a packet can't be delivered to its
 	 * destination for the reason that the destination is beyond the scope
 	 * of the source address, discard the packet and return an icmp6
 	 * destination unreachable error with Code 2 (beyond scope of source
 	 * address).
 	 * [draft-ietf-ipngwg-icmp-v3-04.txt, Section 3.1]
 	 */
 	outzone = in6_get_unicast_scopeid(&ip6->ip6_src, nh->nh_ifp);
 	inzone = in6_get_unicast_scopeid(&ip6->ip6_src, m->m_pkthdr.rcvif);
 	if (inzone != outzone) {
 		IP6STAT_INC(ip6s_cantforward);
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(nh->nh_ifp, ifs6_in_discard);
 
 		if (V_ip6_log_time + V_ip6_log_interval < time_uptime) {
 			V_ip6_log_time = time_uptime;
 			log(LOG_DEBUG,
 			    "cannot forward "
 			    "src %s, dst %s, nxt %d, rcvif %s, outif %s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ip6->ip6_nxt,
 			    if_name(m->m_pkthdr.rcvif), if_name(nh->nh_ifp));
 		}
 		if (mcopy)
 			icmp6_error(mcopy, ICMP6_DST_UNREACH,
 				    ICMP6_DST_UNREACH_BEYONDSCOPE, 0);
 		goto bad;
 	}
 
 	/*
 	 * Destination scope check: if a packet is going to break the scope
 	 * zone of packet's destination address, discard it.  This case should
 	 * usually be prevented by appropriately-configured routing table, but
 	 * we need an explicit check because we may mistakenly forward the
 	 * packet to a different zone by (e.g.) a default route.
 	 */
 	inzone = in6_get_unicast_scopeid(&ip6->ip6_dst, m->m_pkthdr.rcvif);
 	outzone = in6_get_unicast_scopeid(&ip6->ip6_dst, nh->nh_ifp);
 
 	if (inzone != outzone) {
 		IP6STAT_INC(ip6s_cantforward);
 		IP6STAT_INC(ip6s_badscope);
 		/* Silent discard: no ICMPv6 error, so free the copy here. */
 		m_freem(mcopy);
 		goto bad;
 	}
 
 	if (nh->nh_flags & NHF_GATEWAY) {
 		/* Store gateway address in deembedded form */
 		dst.sin6_addr = nh->gw6_sa.sin6_addr;
 		dst.sin6_scope_id = ntohs(in6_getscope(&dst.sin6_addr));
 		in6_clearscope(&dst.sin6_addr);
 	}
 
 	/*
 	 * If we are to forward the packet using the same interface
 	 * as one we got the packet from, perhaps we should send a redirect
 	 * to sender to shortcut a hop.
 	 * Only send redirect if source is sending directly to us,
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a route
 	 * modified by a redirect.
 	 */
 	if (V_ip6_sendredirects && nh->nh_ifp == m->m_pkthdr.rcvif && !srcrt &&
 	    (nh->nh_flags & NHF_REDIRECT) == 0)
 		type = ND_REDIRECT;
 
 	/*
 	 * Fake scoped addresses. Note that even link-local source or
 	 * destination can appear, if the originating node just sends the
 	 * packet to us (without address resolution for the destination).
 	 * Since both icmp6_error and icmp6_redirect_output fill the embedded
 	 * link identifiers, we can do this stuff after making a copy for
 	 * returning an error.
 	 */
 	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
 		/*
 		 * See corresponding comments in ip6_output.
 		 * XXX: but is it possible that ip6_forward() sends a packet
 		 *      to a loopback interface? I don't think so, and thus
 		 *      I bark here. (jinmei@kame.net)
 		 * XXX: it is common to route invalid packets to loopback.
 		 *	also, the codepath will be visited on use of ::1 in
 		 *	rthdr. (itojun)
 		 */
 #if 1
 		if (0)
 #else
 		if ((rt->rt_flags & (RTF_BLACKHOLE|RTF_REJECT)) == 0)
 #endif
 		{
 			printf("ip6_forward: outgoing interface is loopback. "
 			       "src %s, dst %s, nxt %d, rcvif %s, outif %s\n",
 			       ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			       ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			       ip6->ip6_nxt, if_name(m->m_pkthdr.rcvif),
 			       if_name(nh->nh_ifp));
 		}
 
 		/* we can just use rcvif in forwarding. */
 		origifp = m->m_pkthdr.rcvif;
 	}
 	else
 		origifp = nh->nh_ifp;
 	/*
 	 * clear embedded scope identifiers if necessary.
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head))
 		goto pass;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for forwarded packets. */
 	if (pfil_mbuf_out(V_inet6_pfil_head, &m, nh->nh_ifp,
 	    NULL) != PFIL_PASS)
 		goto freecopy;
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst))
 			m->m_flags |= M_FASTFWD_OURS;
 		else {
 			NH_FREE(nh);
 
 			/* Update address and scopeid. Assume scope is embedded */
 			dst.sin6_scope_id = ntohs(in6_getscope(&ip6->ip6_dst));
 			dst.sin6_addr = ip6->ip6_dst;
 			in6_clearscope(&dst.sin6_addr);
 			goto again;	/* Redo the routing table lookup. */
 		}
 	}
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		/* The packet was re-queued; the saved copy is not needed. */
 		goto freecopy;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in6 *gw6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 
 		/* Update address and scopeid. Assume scope is embedded */
 		dst.sin6_scope_id = ntohs(in6_getscope(&gw6->sin6_addr));
 		dst.sin6_addr = gw6->sin6_addr;
 		in6_clearscope(&dst.sin6_addr);
 
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		NH_FREE(nh);
 		goto again;
 	}
 
 pass:
 	/* See if the size was changed by the packet filter. */
 	/* TODO: change to nh->nh_mtu */
 	if (m->m_pkthdr.len > IN6_LINKMTU(nh->nh_ifp)) {
 		in6_ifstat_inc(nh->nh_ifp, ifs6_in_toobig);
 		if (mcopy)
 			icmp6_error(mcopy, ICMP6_PACKET_TOO_BIG, 0,
 			    IN6_LINKMTU(nh->nh_ifp));
 		goto bad;
 	}
 
 	/* Currently LLE layer stores embedded IPv6 addresses */
 	if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6_addr)) {
 		in6_set_unicast_scopeid(&dst.sin6_addr, dst.sin6_scope_id);
 		dst.sin6_scope_id = 0;
 	}
 	error = nd6_output_ifp(nh->nh_ifp, origifp, m, &dst, NULL);
 	if (error) {
 		in6_ifstat_inc(nh->nh_ifp, ifs6_out_discard);
 		IP6STAT_INC(ip6s_cantforward);
 	} else {
 		IP6STAT_INC(ip6s_forward);
 		in6_ifstat_inc(nh->nh_ifp, ifs6_out_forward);
 		if (type)
 			IP6STAT_INC(ip6s_redirectsent);
 		else {
 			if (mcopy)
 				goto freecopy;
 		}
 	}
 
 	if (mcopy == NULL)
 		goto out;
 	switch (error) {
 	case 0:
 		if (type == ND_REDIRECT) {
 			icmp6_redirect_output(mcopy, nh);
 			goto out;
 		}
 		goto freecopy;
 
 	case EMSGSIZE:
 		/* xxx MTU is constant in PPP? */
 		goto freecopy;
 
 	case ENOBUFS:
 		/* Tell source to slow down like source quench in IP? */
 		goto freecopy;
 
 	case ENETUNREACH:	/* shouldn't happen, checked above */
 	case EHOSTUNREACH:
 	case ENETDOWN:
 	case EHOSTDOWN:
 	default:
 		type = ICMP6_DST_UNREACH;
 		code = ICMP6_DST_UNREACH_ADDR;
 		break;
 	}
 	icmp6_error(mcopy, type, code, 0);
 	goto out;
 
 	/* Release the saved copy; the original mbuf is already gone. */
  freecopy:
 	m_freem(mcopy);
 	goto out;
 	/* Release the original mbuf; mcopy was dealt with before the jump. */
 bad:
 	m_freem(m);
 out:
 	if (nh != NULL)
 		NH_FREE(nh);
 }
diff --git a/sys/netinet6/ip6_gre.c b/sys/netinet6/ip6_gre.c
index 2bae6d754a03..8c9f7f5f668c 100644
--- a/sys/netinet6/ip6_gre.c
+++ b/sys/netinet6/ip6_gre.c
@@ -1,579 +1,580 @@
 /*-
  * Copyright (c) 2014, 2018 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #ifdef INET
 #include <net/ethernet.h>
 #include <netinet/ip.h>
 #endif
 #include <netinet/in_pcb.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/scope6_var.h>
 #include <net/if_gre.h>
 
 /* Per-VNET hop limit value, exported as the "grehlim" sysctl below. */
 VNET_DEFINE(int, ip6_gre_hlim) = IPV6_DEFHLIM;
 #define	V_ip6_gre_hlim		VNET(ip6_gre_hlim)
 
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_INT(_net_inet6_ip6, OID_AUTO, grehlim, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip6_gre_hlim), 0, "Default hop limit for encapsulated packets");
 
 /*
  * A gre_socket extended with the IPv6 address it is looked up by
  * (see in6_gre_lookup_socket()).
  */
 struct in6_gre_socket {
 	struct gre_socket	base;
 	struct in6_addr		addr; /* scope zone id is embedded */
 };
 /*
  * Per-VNET hash tables.  All start out NULL; several functions in this
  * file treat a NULL V_ipv6_hashtbl as "VNET not ready yet".
  */
 VNET_DEFINE_STATIC(struct gre_sockets *, ipv6_sockets) = NULL;
 VNET_DEFINE_STATIC(struct gre_list *, ipv6_hashtbl) = NULL;
 VNET_DEFINE_STATIC(struct gre_list *, ipv6_srchashtbl) = NULL;
 #define	V_ipv6_sockets		VNET(ipv6_sockets)
 #define	V_ipv6_hashtbl		VNET(ipv6_hashtbl)
 #define	V_ipv6_srchashtbl	VNET(ipv6_srchashtbl)
 /*
  * Bucket selectors.  Each hashes its address argument(s) and masks the
  * result with GRE_HASH_SIZE - 1 to index the corresponding table:
  *   GRE_HASH      - (src, dst) pair into V_ipv6_hashtbl
  *   GRE_SRCHASH   - source address into V_ipv6_srchashtbl
  *   GRE_SOCKHASH  - source address into V_ipv6_sockets
  *   GRE_HASH_SC   - GRE_HASH for a softc's outer source/destination
  */
 #define	GRE_HASH(src, dst)	(V_ipv6_hashtbl[\
     in6_gre_hashval((src), (dst)) & (GRE_HASH_SIZE - 1)])
 #define	GRE_SRCHASH(src)	(V_ipv6_srchashtbl[\
     fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)])
 #define	GRE_SOCKHASH(src)	(V_ipv6_sockets[\
     fnv_32_buf((src), sizeof(*src), FNV1_32_INIT) & (GRE_HASH_SIZE - 1)])
 #define	GRE_HASH_SC(sc)		GRE_HASH(&(sc)->gre_oip6.ip6_src,\
     &(sc)->gre_oip6.ip6_dst)
 
 /*
  * Hash a (src, dst) address pair: the hash of src seeds the hash of dst,
  * so swapping the two arguments generally yields a different value.
  */
 static uint32_t
 in6_gre_hashval(const struct in6_addr *src, const struct in6_addr *dst)
 {
 
 	return (fnv_32_buf(dst, sizeof(*dst),
 	    fnv_32_buf(src, sizeof(*src), FNV1_32_INIT)));
 }
 
 /*
  * Find the socket entry whose stored address equals addr.
  * Returns the entry, or NULL when the hash bucket holds no match.
  */
 static struct gre_socket*
 in6_gre_lookup_socket(const struct in6_addr *addr)
 {
 	struct in6_gre_socket *entry;
 	struct gre_socket *cur;
 
 	CK_LIST_FOREACH(cur, &GRE_SOCKHASH(addr), chain) {
 		entry = __containerof(cur, struct in6_gre_socket, base);
 		if (IN6_ARE_ADDR_EQUAL(&entry->addr, addr))
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Check whether configuring sc with the given outer addresses and options
  * would duplicate an existing configuration.
  *
  * Returns EEXIST when sc itself already has exactly this setup,
  * EADDRNOTAVAIL when another tunnel on the relevant list uses the same
  * address pair, and 0 otherwise.
  */
 static int
 in6_gre_checkdup(const struct gre_softc *sc, const struct in6_addr *src,
     const struct in6_addr *dst, uint32_t opts)
 {
 	struct gre_list *bucket;
 	struct gre_softc *other;
 	struct gre_socket *sock;
 
 	if (sc->gre_family == AF_INET6 &&
 	    IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_src, src) &&
 	    IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, dst) &&
 	    (sc->gre_options & GRE_UDPENCAP) == (opts & GRE_UDPENCAP))
 		return (EEXIST);
 
 	/* Pick the list to scan: plain tunnels hash, UDP ones hang off a socket. */
 	if ((opts & GRE_UDPENCAP) == 0) {
 		bucket = &GRE_HASH(src, dst);
 	} else {
 		sock = in6_gre_lookup_socket(src);
 		if (sock == NULL)
 			return (0);
 		bucket = &sock->list;
 	}
 
 	CK_LIST_FOREACH(other, bucket, chain) {
 		if (other != sc &&
 		    IN6_ARE_ADDR_EQUAL(&other->gre_oip6.ip6_src, src) &&
 		    IN6_ARE_ADDR_EQUAL(&other->gre_oip6.ip6_dst, dst))
 			return (EADDRNOTAVAIL);
 	}
 	return (0);
 }
 
 /*
  * Find the tunnel matching an inbound packet's outer addresses.
  *
  * On a match whose interface is IFF_UP, stores the softc in *arg and
  * returns ENCAP_DRV_LOOKUP.  Returns 0 when the hash table is not set up,
  * when nothing matches, or when the matching interface is down.
  */
 static int
 in6_gre_lookup(const struct mbuf *m, int off, int proto, void **arg)
 {
 	const struct ip6_hdr *hdr;
 	struct gre_softc *tun;
 
 	if (V_ipv6_hashtbl == NULL)
 		return (0);
 
 	NET_EPOCH_ASSERT();
 	hdr = mtod(m, const struct ip6_hdr *);
 	/*
 	 * This is an inbound packet, its ip6_dst is source address
 	 * in softc.
 	 */
 	CK_LIST_FOREACH(tun, &GRE_HASH(&hdr->ip6_dst, &hdr->ip6_src), chain) {
 		if (!IN6_ARE_ADDR_EQUAL(&tun->gre_oip6.ip6_src,
 		    &hdr->ip6_dst) ||
 		    !IN6_ARE_ADDR_EQUAL(&tun->gre_oip6.ip6_dst,
 		    &hdr->ip6_src))
 			continue;
 		if ((GRE2IFP(tun)->if_flags & IFF_UP) == 0)
 			return (0);
 		*arg = tun;
 		return (ENCAP_DRV_LOOKUP);
 	}
 	return (0);
 }
 
 /*
  * Set IFF_DRV_RUNNING on the tunnel interface when in6_localip() accepts
  * the tunnel's outer source address, and clear it otherwise.
  */
 static void
 in6_gre_set_running(struct gre_softc *sc)
 {
 
 	if (!in6_localip(&sc->gre_oip6.ip6_src)) {
 		GRE2IFP(sc)->if_drv_flags &= ~IFF_DRV_RUNNING;
 		return;
 	}
 	GRE2IFP(sc)->if_drv_flags |= IFF_DRV_RUNNING;
 }
 
 /*
  * ifaddr_event handler.
  * Re-evaluate IFF_DRV_RUNNING for every tunnel whose outer source address
  * equals the address in sa, so that the flag is cleared when the ingress
  * address disappears (prevents source address spoofing).
  */
 static void
 in6_gre_srcaddr(void *arg __unused, const struct sockaddr *sa,
     int event __unused)
 {
 	const struct sockaddr_in6 *sin6;
 	struct gre_softc *tun;
 
 	/* Check that VNET is ready */
 	if (V_ipv6_hashtbl == NULL)
 		return;
 
 	NET_EPOCH_ASSERT();
 	sin6 = (const struct sockaddr_in6 *)sa;
 	CK_LIST_FOREACH(tun, &GRE_SRCHASH(&sin6->sin6_addr), srchash) {
 		if (IN6_ARE_ADDR_EQUAL(&tun->gre_oip6.ip6_src,
 		    &sin6->sin6_addr))
 			in6_gre_set_running(tun);
 	}
 }
 
 static bool
 in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct gre_socket *gs;
 	struct gre_softc *sc;
 	struct sockaddr_in6 dst;
 
 	NET_EPOCH_ASSERT();
 
 	gs = (struct gre_socket *)ctx;
 	dst = *(const struct sockaddr_in6 *)sa;
 	if (sa6_embedscope(&dst, 0)) {
 		m_freem(m);
 		return (true);
 	}
 	CK_LIST_FOREACH(sc, &gs->list, chain) {
 		if (IN6_ARE_ADDR_EQUAL(&sc->gre_oip6.ip6_dst, &dst.sin6_addr))
 			break;
 	}
 	if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){
 		gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc);
 		return (true);
 	}
 	m_freem(m);
 
 	return (true);
 }
 
 /*
  * Attach sc to the UDP encapsulation socket for its outer source address,
  * creating, configuring and binding that socket when none exists yet.
  *
  * Returns 0 on success, otherwise the error from socreate(),
  * udp_set_kernel_tunneling(), sosetopt(), sa6_recoverscope() or sobind().
  * On failure a socket created here is closed and its entry freed.
  */
 static int
 in6_gre_setup_socket(struct gre_softc *sc)
 {
 	struct sockopt sopt;
 	struct sockaddr_in6 sin6;
 	struct in6_gre_socket *s;
 	struct gre_socket *gs;
 	int error, value;
 
 	/*
 	 * NOTE: we are protected with gre_ioctl_sx lock.
 	 *
 	 * First check that socket is already configured.
 	 * If so, check that source address was not changed.
 	 * If address is different, check that there are no other tunnels
 	 * and close socket.
 	 */
 	gs = sc->gre_so;
 	if (gs != NULL) {
 		s = __containerof(gs, struct in6_gre_socket, base);
 		if (!IN6_ARE_ADDR_EQUAL(&s->addr, &sc->gre_oip6.ip6_src)) {
 			if (CK_LIST_EMPTY(&gs->list)) {
 				CK_LIST_REMOVE(gs, chain);
 				soclose(gs->so);
 				/* Entry is released via gre_sofree, deferred with NET_EPOCH_CALL. */
 				NET_EPOCH_CALL(gre_sofree, &gs->epoch_ctx);
 			}
 			gs = sc->gre_so = NULL;
 		}
 	}
 
 	if (gs == NULL) {
 		/*
 		 * Check that socket for given address is already
 		 * configured.
 		 */
 		gs = in6_gre_lookup_socket(&sc->gre_oip6.ip6_src);
 		if (gs == NULL) {
 			s = malloc(sizeof(*s), M_GRE, M_WAITOK | M_ZERO);
 			s->addr = sc->gre_oip6.ip6_src;
 			gs = &s->base;
 
 			error = socreate(sc->gre_family, &gs->so,
 			    SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred,
 			    curthread);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot create socket: %d\n", error);
 				/* No socket to close yet, so don't go through fail:. */
 				free(s, M_GRE);
 				return (error);
 			}
 
 			/* Route datagrams received on this socket to in6_gre_udp_input(). */
 			error = udp_set_kernel_tunneling(gs->so,
 			    in6_gre_udp_input, NULL, gs);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot set UDP tunneling: %d\n", error);
 				goto fail;
 			}
 
 			/* Enable IPV6_BINDANY on the socket before binding. */
 			memset(&sopt, 0, sizeof(sopt));
 			sopt.sopt_dir = SOPT_SET;
 			sopt.sopt_level = IPPROTO_IPV6;
 			sopt.sopt_name = IPV6_BINDANY;
 			sopt.sopt_val = &value;
 			sopt.sopt_valsize = sizeof(value);
 			value = 1;
 			error = sosetopt(gs->so, &sopt);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot set IPV6_BINDANY opt: %d\n",
 				    error);
 				goto fail;
 			}
 
 			/* Bind to the outer source address on GRE_UDPPORT. */
 			memset(&sin6, 0, sizeof(sin6));
 			sin6.sin6_family = AF_INET6;
 			sin6.sin6_len = sizeof(sin6);
 			sin6.sin6_addr = sc->gre_oip6.ip6_src;
 			sin6.sin6_port = htons(GRE_UDPPORT);
 			error = sa6_recoverscope(&sin6);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot determine scope zone id: %d\n",
 				    error);
 				goto fail;
 			}
 			error = sobind(gs->so, (struct sockaddr *)&sin6,
 			    curthread);
 			if (error != 0) {
 				if_printf(GRE2IFP(sc),
 				    "cannot bind socket: %d\n", error);
 				goto fail;
 			}
 			/* Add socket to the chain */
 			CK_LIST_INSERT_HEAD(
 			    &GRE_SOCKHASH(&sc->gre_oip6.ip6_src), gs, chain);
 		}
 	}
 
 	/* Add softc to the socket's list */
 	CK_LIST_INSERT_HEAD(&gs->list, sc, chain);
 	sc->gre_so = gs;
 	return (0);
 	/*
 	 * Only reached from the branch above that allocated s and created
 	 * gs->so; the entry has not been linked into any list yet.
 	 */
 fail:
 	soclose(gs->so);
 	free(s, M_GRE);
 	return (error);
 }
 
 /*
  * Prepare the outer header for the configured encapsulation and link the
  * softc into the lookup lists.
  *
  * With GRE_UDPENCAP set the tunnel is attached to a UDP socket through
  * in6_gre_setup_socket(); otherwise it is inserted into the GRE hash.
  * In both cases it is also added to the source address hash.
  *
  * Returns 0 on success.  A non-zero return means the softc is not linked
  * and the caller has to reset gre_family and free sc->gre_hdr.
  */
 static int
 in6_gre_attach(struct gre_softc *sc)
 {
 	struct grehdr *hdr;
 	int rc;
 
 	if ((sc->gre_options & GRE_UDPENCAP) == 0) {
 		/* GRE directly on top of IPv6. */
 		sc->gre_hlen = sizeof(struct greip6);
 		sc->gre_oip6.ip6_nxt = IPPROTO_GRE;
 		hdr = &sc->gre_ip6hdr->gi6_gre;
 	} else {
 		/* GRE carried in UDP. */
 		sc->gre_csumflags = CSUM_UDP_IPV6;
 		sc->gre_hlen = sizeof(struct greudp6);
 		sc->gre_oip6.ip6_nxt = IPPROTO_UDP;
 		hdr = &sc->gre_udp6hdr->gi6_gre;
 		gre_update_udphdr(sc, &sc->gre_udp6,
 		    in6_cksum_pseudo(&sc->gre_oip6, 0, 0, 0));
 	}
 	sc->gre_oip6.ip6_vfc = IPV6_VERSION;
 	gre_update_hdr(sc, hdr);
 
 	if ((sc->gre_options & GRE_UDPENCAP) == 0) {
 		CK_LIST_INSERT_HEAD(&GRE_HASH_SC(sc), sc, chain);
 	} else {
 		rc = in6_gre_setup_socket(sc);
 		if (rc != 0)
 			return (rc);
 	}
 	CK_LIST_INSERT_HEAD(&GRE_SRCHASH(&sc->gre_oip6.ip6_src), sc, srchash);
 
 	/* Let in6_gre_set_running() decide about IFF_DRV_RUNNING. */
 	in6_gre_set_running(sc);
 	return (0);
 }
 
 /*
  * Apply a new key (GRESKEY), option set (GRESOPTS) or port (GRESPORT) to
  * an IPv6 tunnel and re-attach it.
  *
  * The caller holds gre_ioctl_sx.  Returns EEXIST when switching the
  * encapsulation protocol would duplicate another tunnel, otherwise the
  * result of in6_gre_attach().  If re-attaching fails, gre_family is reset
  * and sc->gre_hdr is freed.
  */
 int
 in6_gre_setopts(struct gre_softc *sc, u_long cmd, uint32_t value)
 {
 	int rc;
 
 	MPASS(cmd == GRESKEY || cmd == GRESOPTS || cmd == GRESPORT);
 	MPASS(sc->gre_family == AF_INET6);
 
 	/*
 	 * A change of the encapsulation protocol must not create a
 	 * duplicate tunnel.  Report EEXIST so as not to confuse the user.
 	 */
 	if (cmd == GRESOPTS) {
 		if ((sc->gre_options & GRE_UDPENCAP) !=
 		    (value & GRE_UDPENCAP) &&
 		    in6_gre_checkdup(sc, &sc->gre_oip6.ip6_src,
 			&sc->gre_oip6.ip6_dst, value) == EADDRNOTAVAIL)
 			return (EEXIST);
 	}
 
 	/* Unlink the tunnel and call GRE_WAIT() before modifying it. */
 	CK_LIST_REMOVE(sc, chain);
 	CK_LIST_REMOVE(sc, srchash);
 	GRE_WAIT();
 
 	if (cmd == GRESKEY)
 		sc->gre_key = value;
 	else if (cmd == GRESOPTS)
 		sc->gre_options = value;
 	else if (cmd == GRESPORT)
 		sc->gre_port = value;
 
 	rc = in6_gre_attach(sc);
 	if (rc == 0)
 		return (0);
 
 	/* The tunnel could not be linked again: drop its configuration. */
 	sc->gre_family = 0;
 	free(sc->gre_hdr, M_GRE);
 	return (rc);
 }
 
 /*
  * IPv6 specific part of the tunnel ioctl handling.
  *
  * SIOCSIFPHYADDR_IN6 validates the new outer source and destination
  * addresses, allocates a fresh header, detaches a previously configured
  * tunnel and attaches the softc with the new addresses.
  * SIOCGIFPSRCADDR_IN6 and SIOCGIFPDSTADDR_IN6 copy the outer source or
  * destination address into the request.
  *
  * Returns 0 on success, EINVAL for unhandled commands or malformed
  * addresses, EADDRNOTAVAIL for unspecified addresses or when the tunnel is
  * not an IPv6 one, or the error of one of the called helpers.
  */
 int
 in6_gre_ioctl(struct gre_softc *sc, u_long cmd, caddr_t data)
 {
 	struct in6_ifreq *ifr = (struct in6_ifreq *)data;
 	struct sockaddr_in6 *dst, *src;
 	struct ip6_hdr *ip6;
 	int error;
 
 	/* NOTE: we are protected with gre_ioctl_sx lock */
 	error = EINVAL;
 	switch (cmd) {
 	case SIOCSIFPHYADDR_IN6:
 		src = &((struct in6_aliasreq *)data)->ifra_addr;
 		dst = &((struct in6_aliasreq *)data)->ifra_dstaddr;
 
 		/* sanity checks */
 		if (src->sin6_family != dst->sin6_family ||
 		    src->sin6_family != AF_INET6 ||
 		    src->sin6_len != dst->sin6_len ||
 		    src->sin6_len != sizeof(*src))
 			break;
 		if (IN6_IS_ADDR_UNSPECIFIED(&src->sin6_addr) ||
 		    IN6_IS_ADDR_UNSPECIFIED(&dst->sin6_addr)) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		/*
 		 * Check validity of the scope zone ID of the
 		 * addresses, and convert it into the kernel
 		 * internal form if necessary.
 		 */
 		if ((error = sa6_embedscope(src, 0)) != 0 ||
 		    (error = sa6_embedscope(dst, 0)) != 0)
 			break;
 
 		/* The hash tables are created on first use. */
 		if (V_ipv6_hashtbl == NULL) {
 			V_ipv6_hashtbl = gre_hashinit();
 			V_ipv6_srchashtbl = gre_hashinit();
 			V_ipv6_sockets = (struct gre_sockets *)gre_hashinit();
 		}
 		error = in6_gre_checkdup(sc, &src->sin6_addr,
 		    &dst->sin6_addr, sc->gre_options);
 		if (error == EADDRNOTAVAIL)
 			break;
 		if (error == EEXIST) {
 			/* Addresses are the same. Just return. */
 			error = 0;
 			break;
 		}
 		/*
 		 * The buffer is sized for struct greudp6 plus three
 		 * additional 32-bit words.
 		 */
 		ip6 = malloc(sizeof(struct greudp6) + 3 * sizeof(uint32_t),
 		    M_GRE, M_WAITOK | M_ZERO);
 		ip6->ip6_src = src->sin6_addr;
 		ip6->ip6_dst = dst->sin6_addr;
 		if (sc->gre_family != 0) {
 			/* Detach existing tunnel first */
 			CK_LIST_REMOVE(sc, chain);
 			CK_LIST_REMOVE(sc, srchash);
 			GRE_WAIT();
 			free(sc->gre_hdr, M_GRE);
 			/* XXX: should we notify about link state change? */
 		}
 		sc->gre_family = AF_INET6;
 		sc->gre_hdr = ip6;
 		sc->gre_oseq = 0;
 		sc->gre_iseq = UINT32_MAX;
 		error = in6_gre_attach(sc);
 		if (error != 0) {
 			/* Not linked: reset the family and release the header. */
 			sc->gre_family = 0;
 			free(sc->gre_hdr, M_GRE);
 		}
 		break;
 	case SIOCGIFPSRCADDR_IN6:
 	case SIOCGIFPDSTADDR_IN6:
 		if (sc->gre_family != AF_INET6) {
 			error = EADDRNOTAVAIL;
 			break;
 		}
 		src = (struct sockaddr_in6 *)&ifr->ifr_addr;
 		memset(src, 0, sizeof(*src));
 		src->sin6_family = AF_INET6;
 		src->sin6_len = sizeof(*src);
 		src->sin6_addr = (cmd == SIOCGIFPSRCADDR_IN6) ?
 		    sc->gre_oip6.ip6_src: sc->gre_oip6.ip6_dst;
 		error = prison_if(curthread->td_ucred, (struct sockaddr *)src);
 		if (error == 0)
 			error = sa6_recoverscope(src);
 		/* Do not hand back a partially filled address on error. */
 		if (error != 0)
 			memset(src, 0, sizeof(*src));
 		break;
 	}
 	return (error);
 }
 
 /*
  * Finish the outer IPv6 header at the front of the mbuf and pass the
  * packet to ip6_output().
  *
  * The hop limit is taken from V_ip6_gre_hlim and the flow label bits of
  * flowid are OR-ed into the header's flow field.  The af and hlen
  * arguments are unused.  Returns the result of ip6_output().
  */
 int
 in6_gre_output(struct mbuf *m, int af __unused, int hlen __unused,
     uint32_t flowid)
 {
 	struct greip6 *hdr = mtod(m, struct greip6 *);
 	uint32_t label = flowid & IPV6_FLOWLABEL_MASK;
 
 	hdr->gi6_ip6.ip6_hlim = V_ip6_gre_hlim;
 	hdr->gi6_ip6.ip6_flow |= label;
 
 	return (ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL, NULL));
 }
 
 /*
  * Cookies returned by ip6_encap_register_srcaddr() and ip6_encap_attach()
  * in in6_gre_init(); they are handed back in in6_gre_uninit().
  */
 static const struct srcaddrtab *ipv6_srcaddrtab = NULL;
 static const struct encaptab *ecookie = NULL;
 /*
  * Encapsulation configuration for GRE over IPv6.  The minimum length is
  * the outer GRE/IPv6 header plus struct ip when INET is compiled in,
  * struct ip6_hdr otherwise.
  */
 static const struct encap_config ipv6_encap_cfg = {
 	.proto = IPPROTO_GRE,
 	.min_length = sizeof(struct greip6) +
 #ifdef INET
 	    sizeof(struct ip),
 #else
 	    sizeof(struct ip6_hdr),
 #endif
 	.exact_match = ENCAP_DRV_LOOKUP,
 	.lookup = in6_gre_lookup,
 	.input = gre_input
 };
 
 /*
  * Register the source address callback and the encapsulation handler.
  * Registration happens only when running in the default VNET; for any
  * other VNET the function does nothing.
  */
 void
 in6_gre_init(void)
 {
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ipv6_srcaddrtab = ip6_encap_register_srcaddr(in6_gre_srcaddr,
 		    NULL, M_WAITOK);
 		ecookie = ip6_encap_attach(&ipv6_encap_cfg, NULL, M_WAITOK);
 	}
 }
 
 /*
  * Undo in6_gre_init() and release the per-VNET hash tables.
  *
  * The encapsulation handler and source address callback are removed only
  * in the default VNET.  The hash tables are destroyed only if they were
  * ever allocated.
  */
 void
 in6_gre_uninit(void)
 {
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		ip6_encap_detach(ecookie);
 		ip6_encap_unregister_srcaddr(ipv6_srcaddrtab);
 	}
 
 	/* Nothing else to do when no tunnel was ever configured. */
 	if (V_ipv6_hashtbl == NULL)
 		return;
 
 	gre_hashdestroy(V_ipv6_hashtbl);
 	V_ipv6_hashtbl = NULL;
 	GRE_WAIT();
 	gre_hashdestroy(V_ipv6_srchashtbl);
 	gre_hashdestroy((struct gre_list *)V_ipv6_sockets);
 }
diff --git a/sys/netinet6/ip6_mroute.c b/sys/netinet6/ip6_mroute.c
index 9465c6662018..e690cb64894f 100644
--- a/sys/netinet6/ip6_mroute.c
+++ b/sys/netinet6/ip6_mroute.c
@@ -1,1945 +1,1946 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_mroute.c,v 1.58 2001/12/18 02:36:31 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1989 Stephen Deering
  * Copyright (c) 1992, 1993
  *      The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_mroute.c	8.2 (Berkeley) 11/15/93
  *	BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp
  */
 
 /*
  * IP multicast forwarding procedures
  *
  * Written by David Waitzman, BBN Labs, August 1988.
  * Modified by Steve Deering, Stanford, February 1989.
  * Modified by Mark J. Steiglitz, Stanford, May, 1991
  * Modified by Van Jacobson, LBL, January 1993
  * Modified by Ajit Thyagarajan, PARC, August 1993
  * Modified by Bill Fenner, PARC, April 1994
  *
  * MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/callout.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/time.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_encap.h>
 
 #include <netinet/ip6.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/pim6.h>
 #include <netinet6/pim6_var.h>
 
 /* Malloc type used for forwarding cache entries and their stalled packets. */
 static MALLOC_DEFINE(M_MRTABLE6, "mf6c", "multicast forwarding cache entry");
 
 /* Forward declarations of file-local helpers. */
 static int	ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *);
 static void	phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
 static int	register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
 static int	set_pim6(int *);
 static int	socket_send(struct socket *, struct mbuf *,
 		    struct sockaddr_in6 *);
 
 extern int in6_mcast_loop;
 extern struct domain inet6domain;
 
 /* Encapsulation hook for PIM: check and input callbacks plus its cookie. */
 static const struct encaptab *pim6_encap_cookie;
 static int pim6_encapcheck(const struct mbuf *, int, int, void *);
 static int pim6_input(struct mbuf *, int, int, void *);
 
 static const struct encap_config ipv6_encap_cfg = {
 	.proto = IPPROTO_PIM,
 	.min_length = sizeof(struct ip6_hdr) + PIM_MINLEN,
 	.exact_match = 8,
 	.check = pim6_encapcheck,
 	.input = pim6_input
 };
 
 /* Socket option name given to ip6_mrouter_init(); 0 while disabled. */
 VNET_DEFINE_STATIC(int, ip6_mrouter_ver) = 0;
 #define	V_ip6_mrouter_ver	VNET(ip6_mrouter_ver)
 
 SYSCTL_DECL(_net_inet6);
 SYSCTL_DECL(_net_inet6_ip6);
 static SYSCTL_NODE(_net_inet6, IPPROTO_PIM, pim,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "PIM");
 
 /* Multicast routing counters, exported read-write through sysctl. */
 static struct mrt6stat mrt6stat;
 SYSCTL_STRUCT(_net_inet6_ip6, OID_AUTO, mrt6stat, CTLFLAG_RW,
     &mrt6stat, mrt6stat,
     "Multicast Routing Statistics (struct mrt6stat, netinet6/ip6_mroute.h)");
 
 /* Plain (non-atomic) increment of one mrt6stat counter. */
 #define	MRT6STAT_INC(name)	mrt6stat.name += 1
 #define NO_RTE_FOUND	0x1
 #define RTE_FOUND	0x2
 
 /* sx lock taken by the MROUTER6_LOCK() family of macros. */
 static struct sx mrouter6_mtx;
 /*
  * Accessors for the multicast router sx lock.  MROUTER6_LOCK() takes the
  * lock exclusively and MROUTER6_LOCK_ASSERT() asserts that it is held
  * exclusively.
  */
 #define	MROUTER6_LOCKPTR()	(&mrouter6_mtx)
 #define	MROUTER6_LOCK()		sx_xlock(MROUTER6_LOCKPTR())
 #define	MROUTER6_UNLOCK()	sx_xunlock(MROUTER6_LOCKPTR())
 #define	MROUTER6_LOCK_ASSERT()	sx_assert(MROUTER6_LOCKPTR(), SA_XLOCKED)
 #define	MROUTER6_LOCK_INIT()	sx_init(MROUTER6_LOCKPTR(), "mrouter6")
 #define	MROUTER6_LOCK_DESTROY()	sx_destroy(MROUTER6_LOCKPTR())
 
 /*
  * Multicast forwarding cache: MF6CTBLSIZ hash buckets, each a singly
  * linked chain through mf6c_next.  Exported read-only through sysctl.
  */
 static struct mf6c *mf6ctable[MF6CTBLSIZ];
 SYSCTL_OPAQUE(_net_inet6_ip6, OID_AUTO, mf6ctable, CTLFLAG_RD,
     &mf6ctable, sizeof(mf6ctable), "S,*mf6ctable[MF6CTBLSIZ]",
     "IPv6 Multicast Forwarding Table (struct *mf6ctable[MF6CTBLSIZ], "
     "netinet6/ip6_mroute.h)");
 
 /* Mutex taken by the MFC6_LOCK() family of macros. */
 static struct mtx mfc6_mtx;
 #define	MFC6_LOCKPTR()		(&mfc6_mtx)
 #define	MFC6_LOCK()		mtx_lock(MFC6_LOCKPTR())
 #define	MFC6_UNLOCK()		mtx_unlock(MFC6_LOCKPTR())
 #define	MFC6_LOCK_ASSERT()	mtx_assert(MFC6_LOCKPTR(), MA_OWNED)
 #define	MFC6_LOCK_INIT()	mtx_init(MFC6_LOCKPTR(),		\
 				    "IPv6 multicast forwarding cache",	\
 				    NULL, MTX_DEF)
 #define	MFC6_LOCK_DESTROY()	mtx_destroy(MFC6_LOCKPTR())
 
 /* Per hash bucket counter, decremented when an entry stops expiring. */
 static u_char n6expire[MF6CTBLSIZ];
 static struct mif6 mif6table[MAXMIFS];
 static int
 sysctl_mif6table(SYSCTL_HANDLER_ARGS)
 {
 	struct mif6_sctl *out;
 	int error;
 
 	out = malloc(sizeof(struct mif6_sctl) * MAXMIFS, M_TEMP,
 	    M_WAITOK | M_ZERO);
 	for (int i = 0; i < MAXMIFS; i++) {
 		out[i].m6_flags		= mif6table[i].m6_flags;
 		out[i].m6_rate_limit	= mif6table[i].m6_rate_limit;
 		out[i].m6_lcl_addr	= mif6table[i].m6_lcl_addr;
 		if (mif6table[i].m6_ifp != NULL)
 			out[i].m6_ifp	= mif6table[i].m6_ifp->if_index;
 		else
 			out[i].m6_ifp	= 0;
 		out[i].m6_pkt_in	= mif6table[i].m6_pkt_in;
 		out[i].m6_pkt_out	= mif6table[i].m6_pkt_out;
 		out[i].m6_bytes_in	= mif6table[i].m6_bytes_in;
 		out[i].m6_bytes_out	= mif6table[i].m6_bytes_out;
 	}
 	error = SYSCTL_OUT(req, out, sizeof(struct mif6_sctl) * MAXMIFS);
 	free(out, M_TEMP);
 	return (error);
 }
 SYSCTL_PROC(_net_inet6_ip6, OID_AUTO, mif6table,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_mif6table, "S,mif6_sctl[MAXMIFS]",
     "IPv6 Multicast Interfaces (struct mif6_sctl[MAXMIFS], "
     "netinet6/ip6_mroute.h)");
 
 /* Mutex taken by the MIF6_LOCK() family of macros. */
 static struct mtx mif6_mtx;
 #define	MIF6_LOCKPTR()		(&mif6_mtx)
 #define	MIF6_LOCK()		mtx_lock(MIF6_LOCKPTR())
 #define	MIF6_UNLOCK()		mtx_unlock(MIF6_LOCKPTR())
 #define	MIF6_LOCK_ASSERT()	mtx_assert(MIF6_LOCKPTR(), MA_OWNED)
 #define	MIF6_LOCK_INIT()	\
 	mtx_init(MIF6_LOCKPTR(), "IPv6 multicast interfaces", NULL, MTX_DEF)
 #define	MIF6_LOCK_DESTROY()	mtx_destroy(MIF6_LOCKPTR())
 
 /*
  * Debug logging.  With MRT6DEBUG defined, MRT6_DLOG(m, fmt, ...) logs the
  * message, prefixed with the calling function's name, when any bit of m
  * is set in V_mrt6debug; messages carrying DEBUG_ERR use LOG_ERR, all
  * others LOG_DEBUG.  Without MRT6DEBUG the macro expands to nothing.
  *
  * The debug variant is wrapped in do { } while (0) so that it behaves as
  * a single statement, e.g. in an unbraced if/else.
  */
 #ifdef MRT6DEBUG
 VNET_DEFINE_STATIC(u_int, mrt6debug) = 0;	/* debug level */
 #define	V_mrt6debug		VNET(mrt6debug)
 #define DEBUG_MFC	0x02
 #define DEBUG_FORWARD	0x04
 #define DEBUG_EXPIRE	0x08
 #define DEBUG_XMIT	0x10
 #define DEBUG_REG	0x20
 #define DEBUG_PIM	0x40
 #define	DEBUG_ERR	0x80
 #define	DEBUG_ANY	0x7f
 #define	MRT6_DLOG(m, fmt, ...)	do {					\
 	if (V_mrt6debug & (m))						\
 		log(((m) & DEBUG_ERR) ? LOG_ERR: LOG_DEBUG,		\
 		    "%s: " fmt "\n", __func__, ##__VA_ARGS__);		\
 } while (/*CONSTCOND*/ 0)
 #else
 #define	MRT6_DLOG(m, fmt, ...)
 #endif
 
 /* Callout handler and its timing parameters. */
 static void	expire_upcalls(void *);
 #define	EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
 #define	UPCALL_EXPIRE	6		/* number of timeouts */
 
 /*
  * XXX TODO: maintain a count of if_allmulti() calls in struct ifnet.
  */
 
 /*
  * 'Interfaces' associated with the decapsulator (so we can tell
  * packets that went through it from ones that get reflected
  * by a broken gateway).  Unlike the IPv4 register_if, these
  * interfaces are linked into the system ifnet list, because
  * per-interface IPv6 statistics are maintained in
  * ifp->if_afdata.  No routes point to them, however, i.e.
  * packets can't be sent this way.  They only exist as a
  * placeholder for multicast source verification.
  */
 static struct ifnet *multicast_register_if6;
 
 #define ENCAP_HOPS 64
 
 /*
  * Private variables.
  */
 /* One more than the highest mif index in use; 0 when the table is empty. */
 static mifi_t nummifs = 0;
 /* Index of the register mif, or (mifi_t)-1 when there is none. */
 static mifi_t reg_mif_num = (mifi_t)-1;
 
 static struct pim6stat pim6stat;
 SYSCTL_STRUCT(_net_inet6_pim, PIM6CTL_STATS, stats, CTLFLAG_RW,
     &pim6stat, pim6stat,
     "PIM Statistics (struct pim6stat, netinet6/pim6_var.h)");
 
 /* Plain (non-atomic) increment of one pim6stat counter. */
 #define	PIM6STAT_INC(name)	pim6stat.name += 1
 /* Set through MRT6_PIM, restricted to 0 or 1 by set_pim6(). */
 VNET_DEFINE_STATIC(int, pim6);
 #define	V_pim6		VNET(pim6)
 
 /*
  * Hash function for a source, group entry: XOR of the eight 32-bit words
  * of both addresses, reduced by MF6CHASHMOD().
  */
 #define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \
 				   (a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \
 				   (g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \
 				   (g).s6_addr32[2] ^ (g).s6_addr32[3])
 
 /*
  * Find a route for a given origin IPv6 address and Multicast group address.
  *
  * o and g are struct in6_addr lvalues and rt is a struct mf6c * variable
  * that receives the result.  Only entries without queued (stalled)
  * packets match.  rt is set to NULL when nothing matches, in which case
  * mrt6s_mfc_misses is incremented.
  */
 #define MF6CFIND(o, g, rt) do { \
 	struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \
 	rt = NULL; \
 	while (_rt) { \
 		if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \
 		    IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \
 		    (_rt->mf6c_stall == NULL)) { \
 			rt = _rt; \
 			break; \
 		} \
 		_rt = _rt->mf6c_next; \
 	} \
 	if (rt == NULL) { \
 		MRT6STAT_INC(mrt6s_mfc_misses); \
 	} \
 } while (/*CONSTCOND*/ 0)
 
 /*
  * Macros to compute elapsed time efficiently
  * Borrowed from Van Jacobson's scheduling code
  * XXX: replace with timersub() ?
  *
  * TV_DELTA(a, b, delta) stores a - b in microseconds in delta.  The
  * seconds difference is held in an int, with differences of one and two
  * seconds handled without a multiplication.
  */
 #define TV_DELTA(a, b, delta) do { \
 	    int xxs; \
 		\
 	    delta = (a).tv_usec - (b).tv_usec; \
 	    if ((xxs = (a).tv_sec - (b).tv_sec)) { \
 	       switch (xxs) { \
 		      case 2: \
 			  delta += 1000000; \
 			      /* FALLTHROUGH */ \
 		      case 1: \
 			  delta += 1000000; \
 			  break; \
 		      default: \
 			  delta += (1000000 * xxs); \
 	       } \
 	    } \
 } while (/*CONSTCOND*/ 0)
 
 /* XXX: replace with timercmp(a, b, <) ? */
 /* True when time a is strictly earlier than time b. */
 #define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
 	      (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
 
 #ifdef UPCALL_TIMING
 /*
  * Upcall delay histogram: slot i counts delays of i * 1024 microseconds,
  * the last slot collects everything at or beyond UPCALL_MAX.
  */
 #define UPCALL_MAX	50
 static u_long upcall_data[UPCALL_MAX + 1];
 /* Declared with its real parameter list instead of an empty one. */
 static void collate(struct timeval *);
 #endif /* UPCALL_TIMING */
 
 /* File-local helpers behind the MRT6_* socket options and the ioctls. */
 static int ip6_mrouter_init(struct socket *, int, int);
 static int add_m6fc(struct mf6cctl *);
 static int add_m6if(struct mif6ctl *);
 static int del_m6fc(struct mf6cctl *);
 static int del_m6if(mifi_t *);
 static int del_m6if_locked(mifi_t *);
 static int get_mif6_cnt(struct sioc_mif_req6 *);
 static int get_sg_cnt(struct sioc_sg_req6 *);
 
 /* Callout armed by ip6_mrouter_init(), drained by X_ip6_mrouter_done(). */
 static struct callout expire_upcalls_ch;
 
 /* Non-static entry points of this file. */
 int X_ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
 int X_ip6_mrouter_done(void);
 int X_ip6_mrouter_set(struct socket *, struct sockopt *);
 int X_ip6_mrouter_get(struct socket *, struct sockopt *);
 int X_mrt6_ioctl(u_long, caddr_t);
 
 /*
  * Dispatch the MRT6 setsockopt commands that modify the multicast routing
  * state.
  *
  * Every command except MRT6_INIT must be issued on the socket registered
  * as V_ip6_mrouter; otherwise EPERM is returned.  Commands taking an
  * argument copy it in with sooptcopyin() first and run only if the copy
  * succeeded.  Unknown commands yield EOPNOTSUPP.
  */
 int
 X_ip6_mrouter_set(struct socket *so, struct sockopt *sopt)
 {
 	struct mf6cctl mfcc;
 	struct mif6ctl mifc;
 	mifi_t mifi;
 	int optval;
 	int error;
 
 	if (sopt->sopt_name != MRT6_INIT && so != V_ip6_mrouter)
 		return (EPERM);
 
 	switch (sopt->sopt_name) {
 	case MRT6_INIT:
 #ifdef MRT6_OINIT
 	case MRT6_OINIT:
 #endif
 		error = sooptcopyin(sopt, &optval, sizeof(optval),
 		    sizeof(optval));
 		if (error == 0)
 			error = ip6_mrouter_init(so, optval, sopt->sopt_name);
 		break;
 	case MRT6_DONE:
 		error = X_ip6_mrouter_done();
 		break;
 	case MRT6_ADD_MIF:
 		error = sooptcopyin(sopt, &mifc, sizeof(mifc), sizeof(mifc));
 		if (error == 0)
 			error = add_m6if(&mifc);
 		break;
 	case MRT6_ADD_MFC:
 		error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc));
 		if (error == 0)
 			error = add_m6fc(&mfcc);
 		break;
 	case MRT6_DEL_MFC:
 		error = sooptcopyin(sopt, &mfcc, sizeof(mfcc), sizeof(mfcc));
 		if (error == 0)
 			error = del_m6fc(&mfcc);
 		break;
 	case MRT6_DEL_MIF:
 		error = sooptcopyin(sopt, &mifi, sizeof(mifi), sizeof(mifi));
 		if (error == 0)
 			error = del_m6if(&mifi);
 		break;
 	case MRT6_PIM:
 		error = sooptcopyin(sopt, &optval, sizeof(optval),
 		    sizeof(optval));
 		if (error == 0)
 			error = set_pim6(&optval);
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Handle the MRT6 getsockopt commands.
  *
  * Only the socket registered as V_ip6_mrouter may query; any other socket
  * gets EACCES.  MRT6_PIM copies out the current V_pim6 value; every other
  * option name is accepted and returns 0 without copying anything.
  */
 int
 X_ip6_mrouter_get(struct socket *so, struct sockopt *sopt)
 {
 
 	if (so != V_ip6_mrouter)
 		return (EACCES);
 
 	if (sopt->sopt_name != MRT6_PIM)
 		return (0);
 
 	return (sooptcopyout(sopt, &V_pim6, sizeof(V_pim6)));
 }
 
 /*
  * Handle the ioctl commands that read counters from the cache.
  *
  * SIOCGETSGCNT_IN6 is served by get_sg_cnt() and SIOCGETMIFCNT_IN6 by
  * get_mif6_cnt(); any other command returns EINVAL.
  */
 int
 X_mrt6_ioctl(u_long cmd, caddr_t data)
 {
 
 	if (cmd == SIOCGETSGCNT_IN6)
 		return (get_sg_cnt((struct sioc_sg_req6 *)data));
 
 	if (cmd == SIOCGETMIFCNT_IN6)
 		return (get_mif6_cnt((struct sioc_mif_req6 *)data));
 
 	return (EINVAL);
 }
 
 /*
  * returns the packet, byte, rpf-failure count for the source group provided
  */
 static int
 get_sg_cnt(struct sioc_sg_req6 *req)
 {
 	struct mf6c *rt;
 	int ret;
 
 	ret = 0;
 
 	MFC6_LOCK();
 
 	MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt);
 	if (rt == NULL) {
 		ret = ESRCH;
 	} else {
 		req->pktcnt = rt->mf6c_pkt_cnt;
 		req->bytecnt = rt->mf6c_byte_cnt;
 		req->wrong_if = rt->mf6c_wrong_if;
 	}
 
 	MFC6_UNLOCK();
 
 	return (ret);
 }
 
 /*
  * returns the input and output packet and byte counts on the mif provided
  */
 static int
 get_mif6_cnt(struct sioc_mif_req6 *req)
 {
 	mifi_t mifi;
 	int ret;
 
 	ret = 0;
 	mifi = req->mifi;
 
 	MIF6_LOCK();
 
 	if (mifi >= nummifs) {
 		ret = EINVAL;
 	} else {
 		req->icount = mif6table[mifi].m6_pkt_in;
 		req->ocount = mif6table[mifi].m6_pkt_out;
 		req->ibytes = mif6table[mifi].m6_bytes_in;
 		req->obytes = mif6table[mifi].m6_bytes_out;
 	}
 
 	MIF6_UNLOCK();
 
 	return (ret);
 }
 
 /*
  * Store the requested PIM setting in V_pim6.
  * Only 0 and 1 are accepted; any other value returns EINVAL and leaves
  * V_pim6 untouched.
  */
 static int
 set_pim6(int *i)
 {
 	const int enable = *i;
 
 	if (enable != 0 && enable != 1)
 		return (EINVAL);
 
 	V_pim6 = enable;
 	return (0);
 }
 
 /*
  * Enable multicast routing
  *
  * Registers so as the multicast routing socket, remembers the socket
  * option name used (cmd), clears the forwarding cache and expiry tables,
  * and arms the expire_upcalls callout.
  *
  * Returns ENOPROTOOPT unless v is 1, EADDRINUSE when a routing socket is
  * already registered, and 0 on success.
  */
 static int
 ip6_mrouter_init(struct socket *so, int v, int cmd)
 {
 
 	/* MRT6_DLOG() already prefixes the message with __func__. */
 	MRT6_DLOG(DEBUG_ANY, "socket %p", so);
 
 	if (v != 1)
 		return (ENOPROTOOPT);
 
 	MROUTER6_LOCK();
 
 	if (V_ip6_mrouter != NULL) {
 		MROUTER6_UNLOCK();
 		return (EADDRINUSE);
 	}
 
 	V_ip6_mrouter = so;
 	V_ip6_mrouter_ver = cmd;
 
 	bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
 	bzero((caddr_t)n6expire, sizeof(n6expire));
 
 	V_pim6 = 0;/* used for stubbing out/in pim stuff */
 
 	/* The callout is associated with the MFC6 mutex. */
 	callout_init_mtx(&expire_upcalls_ch, MFC6_LOCKPTR(), 0);
 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 	    expire_upcalls, NULL);
 
 	MROUTER6_UNLOCK();
 
 	MRT6_DLOG(DEBUG_ANY, "finished");
 
 	return (0);
 }
 
 /*
  * Disable IPv6 multicast forwarding.
  *
  * Under the MROUTER6 lock this turns off all-multicast reception on every
  * physical mif, clears the mif table, frees all forwarding cache entries
  * together with their queued packets, drains the expiry callout, destroys
  * the register interface and finally unregisters the routing socket.
  *
  * Returns EINVAL when no routing socket is registered, 0 otherwise.
  */
 int
 X_ip6_mrouter_done(void)
 {
 	mifi_t mifi;
 	u_long i;
 	struct mf6c *rt;
 	struct rtdetq *rte;
 
 	MROUTER6_LOCK();
 
 	if (V_ip6_mrouter == NULL) {
 		MROUTER6_UNLOCK();
 		return (EINVAL);
 	}
 
 	/*
 	 * For each phyint in use, disable promiscuous reception of all IPv6
 	 * multicasts.
 	 */
 	for (mifi = 0; mifi < nummifs; mifi++) {
 		if (mif6table[mifi].m6_ifp &&
 		    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
 			if_allmulti(mif6table[mifi].m6_ifp, 0);
 		}
 	}
 	bzero((caddr_t)mif6table, sizeof(mif6table));
 	nummifs = 0;
 
 	V_pim6 = 0; /* used to stub out/in pim specific code */
 
 	/*
 	 * Free all multicast forwarding cache entries.
 	 */
 	MFC6_LOCK();
 	for (i = 0; i < MF6CTBLSIZ; i++) {
 		rt = mf6ctable[i];
 		while (rt) {
 			struct mf6c *frt;
 
 			/* Drop the packets still queued on this entry. */
 			for (rte = rt->mf6c_stall; rte != NULL; ) {
 				struct rtdetq *n = rte->next;
 
 				m_freem(rte->m);
 				free(rte, M_MRTABLE6);
 				rte = n;
 			}
 			/* Advance before freeing the current entry. */
 			frt = rt;
 			rt = rt->mf6c_next;
 			free(frt, M_MRTABLE6);
 		}
 	}
 	bzero((caddr_t)mf6ctable, sizeof(mf6ctable));
 	MFC6_UNLOCK();
 
 	/* Drained only after the MFC6 lock has been released. */
 	callout_drain(&expire_upcalls_ch);
 
 	/*
 	 * Reset register interface
 	 */
 	if (reg_mif_num != (mifi_t)-1 && multicast_register_if6 != NULL) {
 		if_detach(multicast_register_if6);
 		if_free(multicast_register_if6);
 		reg_mif_num = (mifi_t)-1;
 		multicast_register_if6 = NULL;
 	}
 
 	V_ip6_mrouter = NULL;
 	V_ip6_mrouter_ver = 0;
 
 	MROUTER6_UNLOCK();
 	MRT6_DLOG(DEBUG_ANY, "finished");
 
 	return (0);
 }
 
 /*
  * Template IPv6 socket address.  The positional initializer presumably
  * fills sin6_len and sin6_family -- confirm against the member order of
  * struct sockaddr_in6.
  */
 static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };
 
 /*
  * Add a mif to the mif table
  *
  * The slot is selected by mifcp->mif6c_mifi and must be unused.  For a
  * MIFF_REGISTER mif the register interface is created on first use and
  * mif6c_pifi is overwritten with its interface index; for any other mif
  * the interface found through mif6c_pifi must support multicast and is
  * switched to all-multicast reception.
  *
  * Returns 0 on success, EINVAL for an out-of-range slot, EADDRINUSE for
  * an occupied slot, ENXIO when mif6c_pifi names no interface, EOPNOTSUPP
  * for an interface without IFF_MULTICAST, or the if_allmulti() error.
  */
 static int
 add_m6if(struct mif6ctl *mifcp)
 {
 	struct epoch_tracker et;
 	struct mif6 *mifp;
 	struct ifnet *ifp;
 	int error;
 
 	MIF6_LOCK();
 
 	if (mifcp->mif6c_mifi >= MAXMIFS) {
 		MIF6_UNLOCK();
 		return (EINVAL);
 	}
 	mifp = mif6table + mifcp->mif6c_mifi;
 	if (mifp->m6_ifp != NULL) {
 		MIF6_UNLOCK();
 		return (EADDRINUSE); /* XXX: is it appropriate? */
 	}
 
 	/* The interface lookup is performed for register mifs as well. */
 	NET_EPOCH_ENTER(et);
 	if ((ifp = ifnet_byindex(mifcp->mif6c_pifi)) == NULL) {
 		NET_EPOCH_EXIT(et);
 		MIF6_UNLOCK();
 		return (ENXIO);
 	}
 	NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 
 	if (mifcp->mif6c_flags & MIFF_REGISTER) {
 		if (reg_mif_num == (mifi_t)-1) {
 			/* First register mif: create its interface. */
 			ifp = if_alloc(IFT_OTHER);
 
 			if_initname(ifp, "register_mif", 0);
 			ifp->if_flags |= IFF_LOOPBACK;
 			if_attach(ifp);
 			multicast_register_if6 = ifp;
 			reg_mif_num = mifcp->mif6c_mifi;
 			/*
 			 * it is impossible to guess the ifindex of the
 			 * register interface.  So mif6c_pifi is automatically
 			 * calculated.
 			 */
 			mifcp->mif6c_pifi = ifp->if_index;
 		} else {
 			/* Reuse the register interface that already exists. */
 			ifp = multicast_register_if6;
 		}
 	} else {
 		/* Make sure the interface supports multicast */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 			MIF6_UNLOCK();
 			return (EOPNOTSUPP);
 		}
 
 		error = if_allmulti(ifp, 1);
 		if (error) {
 			MIF6_UNLOCK();
 			return (error);
 		}
 	}
 
 	mifp->m6_flags     = mifcp->mif6c_flags;
 	mifp->m6_ifp       = ifp;
 
 	/* initialize per mif pkt counters */
 	mifp->m6_pkt_in    = 0;
 	mifp->m6_pkt_out   = 0;
 	mifp->m6_bytes_in  = 0;
 	mifp->m6_bytes_out = 0;
 
 	/* Adjust nummifs up if the mifi is higher than nummifs */
 	if (nummifs <= mifcp->mif6c_mifi)
 		nummifs = mifcp->mif6c_mifi + 1;
 
 	MIF6_UNLOCK();
 	MRT6_DLOG(DEBUG_ANY, "mif #%d, phyint %s", mifcp->mif6c_mifi,
 	    if_name(ifp));
 
 	return (0);
 }
 
 /*
  * Delete a mif from the mif table
  *
  * The caller must hold the MIF6 lock.  A physical mif has all-multicast
  * reception switched off; deleting a register mif detaches and frees the
  * register interface.  The slot is then cleared and nummifs is lowered to
  * one past the highest slot still in use.
  *
  * Returns 0 on success, or EINVAL when *mifip is not below nummifs or the
  * slot is unused.
  */
 static int
 del_m6if_locked(mifi_t *mifip)
 {
 	struct mif6 *mifp;
 	mifi_t mifi;
 
 	MIF6_LOCK_ASSERT();
 
 	if (*mifip >= nummifs)
 		return (EINVAL);
 
 	/* Form the slot pointer only once the index is known to be valid. */
 	mifp = &mif6table[*mifip];
 	if (mifp->m6_ifp == NULL)
 		return (EINVAL);
 
 	if (!(mifp->m6_flags & MIFF_REGISTER)) {
 		/* XXX: TODO: Maintain an ALLMULTI refcount in struct ifnet. */
 		if_allmulti(mifp->m6_ifp, 0);
 	} else {
 		if (reg_mif_num != (mifi_t)-1 &&
 		    multicast_register_if6 != NULL) {
 			if_detach(multicast_register_if6);
 			if_free(multicast_register_if6);
 			reg_mif_num = (mifi_t)-1;
 			multicast_register_if6 = NULL;
 		}
 	}
 
 	bzero((caddr_t)mifp, sizeof(*mifp));
 
 	/* Adjust nummifs down */
 	for (mifi = nummifs; mifi > 0; mifi--)
 		if (mif6table[mifi - 1].m6_ifp)
 			break;
 	nummifs = mifi;
 	MRT6_DLOG(DEBUG_ANY, "mif %d, nummifs %d", *mifip, nummifs);
 
 	return (0);
 }
 
 /*
  * Locking wrapper: run del_m6if_locked() with the MIF6 lock held and
  * return its result.
  */
 static int
 del_m6if(mifi_t *mifip)
 {
 	int error;
 
 	MIF6_LOCK();
 	error = del_m6if_locked(mifip);
 	MIF6_UNLOCK();
 	return (error);
 }
 
 /*
  * Add an mfc entry
  *
  * Runs under the MFC6 lock in three steps:
  *  1. If a resolved entry (no stalled packets) exists for the origin and
  *     group, only its parent and interface set are updated.
  *  2. Otherwise every matching entry that still has stalled packets is
  *     filled in, its queued packets are passed to ip6_mdq() and freed.
  *  3. If no such entry was found either, a new entry is allocated and
  *     linked at the head of its hash chain.
  *
  * Returns 0 on success or ENOBUFS when the allocation fails.
  */
 static int
 add_m6fc(struct mf6cctl *mfccp)
 {
 	struct mf6c *rt;
 	u_long hash;
 	struct rtdetq *rte;
 	u_short nstl;
 	char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
 
 	MFC6_LOCK();
 
 	MF6CFIND(mfccp->mf6cc_origin.sin6_addr,
 		 mfccp->mf6cc_mcastgrp.sin6_addr, rt);
 
 	/* If an entry already exists, just update the fields */
 	if (rt) {
 		MRT6_DLOG(DEBUG_MFC, "no upcall o %s g %s p %x",
 		    ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr),
 		    ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr),
 		    mfccp->mf6cc_parent);
 
 		rt->mf6c_parent = mfccp->mf6cc_parent;
 		rt->mf6c_ifset = mfccp->mf6cc_ifset;
 
 		MFC6_UNLOCK();
 		return (0);
 	}
 
 	/*
 	 * Find the entry for which the upcall was made and update
 	 */
 	hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr,
 			mfccp->mf6cc_mcastgrp.sin6_addr);
 	for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) {
 		if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
 				       &mfccp->mf6cc_origin.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
 				       &mfccp->mf6cc_mcastgrp.sin6_addr) &&
 		    (rt->mf6c_stall != NULL)) {
 			/* More than one stalled entry is reported as an error. */
 			if (nstl++)
 				log(LOG_ERR,
 				    "add_m6fc: %s o %s g %s p %x dbx %p\n",
 				    "multiple kernel entries",
 				    ip6_sprintf(ip6bufo,
 					    &mfccp->mf6cc_origin.sin6_addr),
 				    ip6_sprintf(ip6bufg,
 					    &mfccp->mf6cc_mcastgrp.sin6_addr),
 				    mfccp->mf6cc_parent, rt->mf6c_stall);
 
 			MRT6_DLOG(DEBUG_MFC, "o %s g %s p %x dbg %p",
 			    ip6_sprintf(ip6bufo,
 			    &mfccp->mf6cc_origin.sin6_addr),
 			    ip6_sprintf(ip6bufg,
 				&mfccp->mf6cc_mcastgrp.sin6_addr),
 			    mfccp->mf6cc_parent, rt->mf6c_stall);
 
 			rt->mf6c_origin     = mfccp->mf6cc_origin;
 			rt->mf6c_mcastgrp   = mfccp->mf6cc_mcastgrp;
 			rt->mf6c_parent     = mfccp->mf6cc_parent;
 			rt->mf6c_ifset	    = mfccp->mf6cc_ifset;
 			/* initialize pkt counters per src-grp */
 			rt->mf6c_pkt_cnt    = 0;
 			rt->mf6c_byte_cnt   = 0;
 			rt->mf6c_wrong_if   = 0;
 
 			rt->mf6c_expire = 0;	/* Don't clean this guy up */
 			n6expire[hash]--;
 
 			/* free packets Qed at the end of this entry */
 			for (rte = rt->mf6c_stall; rte != NULL; ) {
 				struct rtdetq *n = rte->next;
 				/* Pass the packet to ip6_mdq(), then free it. */
 				ip6_mdq(rte->m, rte->ifp, rt);
 				m_freem(rte->m);
 #ifdef UPCALL_TIMING
 				collate(&(rte->t));
 #endif /* UPCALL_TIMING */
 				free(rte, M_MRTABLE6);
 				rte = n;
 			}
 			rt->mf6c_stall = NULL;
 		}
 	}
 
 	/*
 	 * It is possible that an entry is being inserted without an upcall
 	 */
 	if (nstl == 0) {
 		MRT6_DLOG(DEBUG_MFC, "no upcall h %lu o %s g %s p %x", hash,
 		    ip6_sprintf(ip6bufo, &mfccp->mf6cc_origin.sin6_addr),
 		    ip6_sprintf(ip6bufg, &mfccp->mf6cc_mcastgrp.sin6_addr),
 		    mfccp->mf6cc_parent);
 
 		/*
 		 * NOTE(review): this loop has no break, so rt is always NULL
 		 * when it ends and a new entry is allocated below even if
 		 * an entry was updated here -- confirm this is intended.
 		 */
 		for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
 			if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
 					       &mfccp->mf6cc_origin.sin6_addr)&&
 			    IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
 					       &mfccp->mf6cc_mcastgrp.sin6_addr)) {
 				rt->mf6c_origin     = mfccp->mf6cc_origin;
 				rt->mf6c_mcastgrp   = mfccp->mf6cc_mcastgrp;
 				rt->mf6c_parent     = mfccp->mf6cc_parent;
 				rt->mf6c_ifset	    = mfccp->mf6cc_ifset;
 				/* initialize pkt counters per src-grp */
 				rt->mf6c_pkt_cnt    = 0;
 				rt->mf6c_byte_cnt   = 0;
 				rt->mf6c_wrong_if   = 0;
 
 				if (rt->mf6c_expire)
 					n6expire[hash]--;
 				rt->mf6c_expire	   = 0;
 			}
 		}
 		if (rt == NULL) {
 			/* no upcall, so make a new entry */
 			rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6,
 						  M_NOWAIT);
 			if (rt == NULL) {
 				MFC6_UNLOCK();
 				return (ENOBUFS);
 			}
 
 			/* insert new entry at head of hash chain */
 			rt->mf6c_origin     = mfccp->mf6cc_origin;
 			rt->mf6c_mcastgrp   = mfccp->mf6cc_mcastgrp;
 			rt->mf6c_parent     = mfccp->mf6cc_parent;
 			rt->mf6c_ifset	    = mfccp->mf6cc_ifset;
 			/* initialize pkt counters per src-grp */
 			rt->mf6c_pkt_cnt    = 0;
 			rt->mf6c_byte_cnt   = 0;
 			rt->mf6c_wrong_if   = 0;
 			rt->mf6c_expire     = 0;
 			rt->mf6c_stall = NULL;
 
 			/* link into table */
 			rt->mf6c_next  = mf6ctable[hash];
 			mf6ctable[hash] = rt;
 		}
 	}
 
 	MFC6_UNLOCK();
 	return (0);
 }
 
 #ifdef UPCALL_TIMING
 /*
  * collect delay statistics on the upcalls
  */
 static void
 collate(struct timeval *t)
 {
 	u_long d;
 	struct timeval tp;
 	u_long delta;
 
 	GET_TIME(tp);
 
 	if (TV_LT(*t, tp))
 	{
 		TV_DELTA(tp, *t, delta);
 
 		d = delta >> 10;
 		if (d > UPCALL_MAX)
 			d = UPCALL_MAX;
 
 		++upcall_data[d];
 	}
 }
 #endif /* UPCALL_TIMING */
 
 /*
  * Delete an mfc entry
  */
 static int
 del_m6fc(struct mf6cctl *mfccp)
 {
 #ifdef MRT6DEBUG
 	char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
 #endif
 	struct sockaddr_in6	origin;
 	struct sockaddr_in6	mcastgrp;
 	struct mf6c		*rt;
 	struct mf6c		**nptr;
 	u_long		hash;
 
 	origin = mfccp->mf6cc_origin;
 	mcastgrp = mfccp->mf6cc_mcastgrp;
 	hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr);
 
 	MRT6_DLOG(DEBUG_MFC, "orig %s mcastgrp %s",
 	    ip6_sprintf(ip6bufo, &origin.sin6_addr),
 	    ip6_sprintf(ip6bufg, &mcastgrp.sin6_addr));
 
 	MFC6_LOCK();
 
 	nptr = &mf6ctable[hash];
 	while ((rt = *nptr) != NULL) {
 		if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr,
 				       &rt->mf6c_origin.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr,
 				       &rt->mf6c_mcastgrp.sin6_addr) &&
 		    rt->mf6c_stall == NULL)
 			break;
 
 		nptr = &rt->mf6c_next;
 	}
 	if (rt == NULL) {
 		MFC6_UNLOCK();
 		return (EADDRNOTAVAIL);
 	}
 
 	*nptr = rt->mf6c_next;
 	free(rt, M_MRTABLE6);
 
 	MFC6_UNLOCK();
 
 	return (0);
 }
 
 static int
 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src)
 {
 
 	if (s) {
 		if (sbappendaddr(&s->so_rcv,
 				 (struct sockaddr *)src,
 				 mm, (struct mbuf *)0) != 0) {
 			sorwakeup(s);
 			return (0);
 		} else
 			soroverflow(s);
 	}
 	m_freem(mm);
 	return (-1);
 }
 
 /*
  * IPv6 multicast forwarding function. This function assumes that the packet
  * pointed to by "ip6" has arrived on (or is about to be sent to) the interface
  * pointed to by "ifp", and the packet is to be relayed to other networks
  * that have members of the packet's destination IPv6 multicast group.
  *
  * The mbuf chain "m" itself is left to the caller in every case; when the
  * packet has to wait for the routing daemon, a private copy is queued.
  * Note that the hop limit in "ip6" is decremented once the packet passes
  * the initial hop-limit/scope check.
  *
  * Return values:
  *   0        - nothing to forward (hop limit <= 1, interface-local or
  *              link-local group, unspecified source), the packet was queued
  *              behind a pending upcall, or the stall queue overflowed and
  *              the copy was dropped;
  *   ENOBUFS  - a memory/mbuf allocation failed or the daemon socket
  *              rejected the upcall;
  *   EINVAL   - V_ip6_mrouter_ver is not a known API version;
  *   otherwise the result of ip6_mdq() when a cache entry already exists.
  *
  * NOTE: this implementation assumes that m->m_pkthdr.rcvif is NULL iff
  * this function is called in the originating context (i.e., not when
  * forwarding a packet from other node).  ip6_output(), which is currently the
  * only function that calls this function is called in the originating context,
  * explicitly ensures this condition.  Any future caller that invokes this
  * function in the originating context must ensure the same condition.
  */
 int
 X_ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m)
 {
 	struct rtdetq *rte;
 	struct mbuf *mb0;
 	struct mf6c *rt;
 	struct mif6 *mifp;
 	struct mbuf *mm;
 	u_long hash;
 	mifi_t mifi;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 #ifdef UPCALL_TIMING
 	struct timeval tp;
 
 	GET_TIME(tp);
 #endif /* UPCALL_TIMING */
 
 	MRT6_DLOG(DEBUG_FORWARD, "src %s, dst %s, ifindex %d",
 	    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 	    ip6_sprintf(ip6bufd, &ip6->ip6_dst), ifp->if_index);
 
 	/*
 	 * Don't forward a packet with Hop limit of zero or one,
 	 * or a packet destined to a local-only group.
 	 */
 	if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) ||
 	    IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
 		return (0);
 	ip6->ip6_hlim--;
 
 	/*
 	 * Source address check: do not forward packets with unspecified
 	 * source. It was discussed in July 2000, on ipngwg mailing list.
 	 * This is rather more serious than unicast cases, because some
 	 * MLD packets can be sent with the unspecified source address
 	 * (although such packets must normally set 1 to the hop limit field).
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 		IP6STAT_INC(ip6s_cantforward);
 		/* Rate-limit the log message to one per log interval. */
 		if (V_ip6_log_time + V_ip6_log_interval < time_uptime) {
 			V_ip6_log_time = time_uptime;
 			log(LOG_DEBUG,
 			    "cannot forward "
 			    "from %s to %s nxt %d received on %s\n",
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ip6->ip6_nxt,
 			    if_name(m->m_pkthdr.rcvif));
 		}
 		return (0);
 	}
 
 	MFC6_LOCK();
 
 	/*
 	 * Determine forwarding mifs from the forwarding cache table
 	 */
 	MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
 	MRT6STAT_INC(mrt6s_mfc_lookups);
 
 	/* Entry exists, so forward if necessary */
 	if (rt) {
 		MFC6_UNLOCK();
 		return (ip6_mdq(m, ifp, rt));
 	}
 
 	/*
 	 * If we don't have a route for packet's origin,
 	 * Make a copy of the packet & send message to routing daemon.
 	 */
 	MRT6STAT_INC(mrt6s_no_route);
 	MRT6_DLOG(DEBUG_FORWARD | DEBUG_MFC, "no rte s %s g %s",
 	    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 	    ip6_sprintf(ip6bufd, &ip6->ip6_dst));
 
 	/*
 	 * Allocate mbufs early so that we don't do extra work if we
 	 * are just going to fail anyway.
 	 */
 	rte = (struct rtdetq *)malloc(sizeof(*rte), M_MRTABLE6, M_NOWAIT);
 	if (rte == NULL) {
 		MFC6_UNLOCK();
 		return (ENOBUFS);
 	}
 	mb0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 	/*
 	 * Pullup packet header if needed before storing it,
 	 * as other references may modify it in the meantime.
 	 */
 	if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < sizeof(struct ip6_hdr)))
 		mb0 = m_pullup(mb0, sizeof(struct ip6_hdr));
 	if (mb0 == NULL) {
 		free(rte, M_MRTABLE6);
 		MFC6_UNLOCK();
 		return (ENOBUFS);
 	}
 
 	/* is there an upcall waiting for this packet? */
 	hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst);
 	for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
 		if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
 		    &rt->mf6c_origin.sin6_addr) &&
 		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
 		    &rt->mf6c_mcastgrp.sin6_addr) && (rt->mf6c_stall != NULL))
 			break;
 	}
 
 	if (rt == NULL) {
 		struct mrt6msg *im;
 #ifdef MRT6_OINIT
 		struct omrt6msg *oim;
 #endif
 		/* no upcall, so make a new entry */
 		rt = (struct mf6c *)malloc(sizeof(*rt), M_MRTABLE6, M_NOWAIT);
 		if (rt == NULL) {
 			free(rte, M_MRTABLE6);
 			m_freem(mb0);
 			MFC6_UNLOCK();
 			return (ENOBUFS);
 		}
 		/*
 		 * Make a copy of the header to send to the user
 		 * level process
 		 */
 		mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_NOWAIT);
 		if (mm == NULL) {
 			free(rte, M_MRTABLE6);
 			m_freem(mb0);
 			free(rt, M_MRTABLE6);
 			MFC6_UNLOCK();
 			return (ENOBUFS);
 		}
 
 		/*
 		 * Send message to routing daemon
 		 */
 		sin6.sin6_addr = ip6->ip6_src;
 		im = NULL;
 #ifdef MRT6_OINIT
 		oim = NULL;
 #endif
 		switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 		case MRT6_OINIT:
 			oim = mtod(mm, struct omrt6msg *);
 			oim->im6_msgtype = MRT6MSG_NOCACHE;
 			oim->im6_mbz = 0;
 			break;
 #endif
 		case MRT6_INIT:
 			im = mtod(mm, struct mrt6msg *);
 			im->im6_msgtype = MRT6MSG_NOCACHE;
 			im->im6_mbz = 0;
 			break;
 		default:
 			/*
 			 * Unknown API version.  The header copy mm has not
 			 * been handed to socket_send() yet, so it must be
 			 * released here together with the other allocations.
 			 */
 			m_freem(mm);
 			free(rte, M_MRTABLE6);
 			m_freem(mb0);
 			free(rt, M_MRTABLE6);
 			MFC6_UNLOCK();
 			return (EINVAL);
 		}
 
 		MRT6_DLOG(DEBUG_FORWARD, "getting the iif info in the kernel");
 		/* Find the mif whose interface is ifp (nummifs if none). */
 		for (mifp = mif6table, mifi = 0;
 		    mifi < nummifs && mifp->m6_ifp != ifp; mifp++, mifi++)
 				;
 
 		switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 		case MRT6_OINIT:
 			oim->im6_mif = mifi;
 			break;
 #endif
 		case MRT6_INIT:
 			im->im6_mif = mifi;
 			break;
 		}
 
 		/* socket_send() frees mm itself when it fails. */
 		if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) {
 			log(LOG_WARNING, "ip6_mforward: ip6_mrouter "
 			    "socket queue full\n");
 			MRT6STAT_INC(mrt6s_upq_sockfull);
 			free(rte, M_MRTABLE6);
 			m_freem(mb0);
 			free(rt, M_MRTABLE6);
 			MFC6_UNLOCK();
 			return (ENOBUFS);
 		}
 
 		MRT6STAT_INC(mrt6s_upcalls);
 
 		/* insert new entry at head of hash chain */
 		bzero(rt, sizeof(*rt));
 		rt->mf6c_origin.sin6_family = AF_INET6;
 		rt->mf6c_origin.sin6_len = sizeof(struct sockaddr_in6);
 		rt->mf6c_origin.sin6_addr = ip6->ip6_src;
 		rt->mf6c_mcastgrp.sin6_family = AF_INET6;
 		rt->mf6c_mcastgrp.sin6_len = sizeof(struct sockaddr_in6);
 		rt->mf6c_mcastgrp.sin6_addr = ip6->ip6_dst;
 		rt->mf6c_expire = UPCALL_EXPIRE;
 		n6expire[hash]++;
 		rt->mf6c_parent = MF6C_INCOMPLETE_PARENT;
 
 		/* link into table */
 		rt->mf6c_next  = mf6ctable[hash];
 		mf6ctable[hash] = rt;
 		/* Add this entry to the end of the queue */
 		rt->mf6c_stall = rte;
 	} else {
 		/* determine if q has overflowed */
 		struct rtdetq **p;
 		int npkts = 0;
 
 		for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next)
 			if (++npkts > MAX_UPQ6) {
 				MRT6STAT_INC(mrt6s_upq_ovflw);
 				free(rte, M_MRTABLE6);
 				m_freem(mb0);
 				MFC6_UNLOCK();
 				return (0);
 			}
 
 		/* Add this entry to the end of the queue */
 		*p = rte;
 	}
 
 	/* Fill in the queue element that now hangs off the cache entry. */
 	rte->next = NULL;
 	rte->m = mb0;
 	rte->ifp = ifp;
 #ifdef UPCALL_TIMING
 	rte->t = tp;
 #endif /* UPCALL_TIMING */
 
 	MFC6_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Callout handler that reaps cache entries whose upcall was never
  * answered.  Each entry that still has stalled packets and a non-zero
  * expiry counter has the counter decremented; when it reaches zero the
  * queued packets and the entry itself are freed.  The handler re-arms
  * the expire_upcalls_ch callout with EXPIRE_TIMEOUT before returning.
  * MFC6_LOCK_ASSERT() is checked on entry.
  */
 static void
 expire_upcalls(void *unused)
 {
 #ifdef MRT6DEBUG
 	char ip6bufo[INET6_ADDRSTRLEN], ip6bufg[INET6_ADDRSTRLEN];
 #endif
 	struct rtdetq *pkt, *nextpkt;
 	struct mf6c **linkp, *ent;
 	u_long bucket;
 
 	MFC6_LOCK_ASSERT();
 
 	for (bucket = 0; bucket < MF6CTBLSIZ; bucket++) {
 		/* Nothing in this chain is waiting to expire. */
 		if (n6expire[bucket] == 0)
 			continue;
 
 		linkp = &mf6ctable[bucket];
 		while ((ent = *linkp) != NULL) {
 			pkt = ent->mf6c_stall;
 			/*
 			 * Keep the entry when it is a real cache entry (no
 			 * stalled packets), when it is marked not to expire,
 			 * or when its counter has not run out yet.
 			 */
 			if (pkt == NULL ||
 			    ent->mf6c_expire == 0 ||
 			    --ent->mf6c_expire != 0) {
 				linkp = &ent->mf6c_next;
 				continue;
 			}
 
 			MRT6_DLOG(DEBUG_EXPIRE, "expiring (%s %s)",
 			    ip6_sprintf(ip6bufo, &ent->mf6c_origin.sin6_addr),
 			    ip6_sprintf(ip6bufg, &ent->mf6c_mcastgrp.sin6_addr));
 
 			/* Drop every stalled packet and its queue element. */
 			for (; pkt != NULL; pkt = nextpkt) {
 				nextpkt = pkt->next;
 				m_freem(pkt->m);
 				free(pkt, M_MRTABLE6);
 			}
 			MRT6STAT_INC(mrt6s_cache_cleanups);
 			n6expire[bucket]--;
 
 			/* Unlink; linkp already references the successor. */
 			*linkp = ent->mf6c_next;
 			free(ent, M_MRTABLE6);
 		}
 	}
 	callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
 	    expire_upcalls, NULL);
 }
 
 /*
  * Packet forwarding routine once entry in the cache is made.
  *
  * m is the packet, ifp the interface it is associated with and rt the
  * forwarding cache entry that matched it.  If ifp is not the interface of
  * the entry's parent mif, the packet is not forwarded; instead the
  * wrong-interface counters are bumped and, when V_pim6 is set and the
  * packet is not marked M_LOOP, a MRT6MSG_WRONGMIF upcall carrying a copy
  * of the IPv6 header is sent to the routing daemon.  Otherwise a copy of
  * the packet is sent on every mif set in rt->mf6c_ifset, subject to the
  * scope-zone check below.
  *
  * Returns 0 on success (including the "wrong interface" case), ENOBUFS
  * when the upcall could not be allocated or queued, EINVAL for an unknown
  * V_ip6_mrouter_ver, or the error from in6_setscope() on the incoming
  * interface.  The mbuf m is never freed here.
  */
 static int
 ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt)
 {
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	mifi_t mifi, iif;
 	struct mif6 *mifp;
 	int plen = m->m_pkthdr.len;
 	struct in6_addr src0, dst0; /* copies for local work */
 	u_int32_t iszone, idzone, oszone, odzone;
 	int error = 0;
 
 	/*
 	 * Don't forward if it didn't arrive from the parent mif
 	 * for its origin.
 	 */
 	mifi = rt->mf6c_parent;
 	if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) {
 		/* came in the wrong interface */
 		/*
 		 * This branch is also taken when mifi is out of range or
 		 * the mif slot has no interface, so the parent's ifnet may
 		 * only be dereferenced after checking both; 0 is logged
 		 * when there is no such interface.
 		 */
 		MRT6_DLOG(DEBUG_FORWARD,
 		    "wrong if: ifid %d mifi %d mififid %x", ifp->if_index,
 		    mifi,
 		    (mifi < nummifs && mif6table[mifi].m6_ifp != NULL) ?
 		    mif6table[mifi].m6_ifp->if_index : 0);
 		MRT6STAT_INC(mrt6s_wrong_if);
 		rt->mf6c_wrong_if++;
 		/*
 		 * If we are doing PIM processing, and we are forwarding
 		 * packets on this interface, send a message to the
 		 * routing daemon.
 		 */
 		/* have to make sure this is a valid mif */
 		if (mifi < nummifs && mif6table[mifi].m6_ifp)
 			if (V_pim6 && (m->m_flags & M_LOOP) == 0) {
 				/*
 				 * Check the M_LOOP flag to avoid an
 				 * unnecessary PIM assert.
 				 * XXX: M_LOOP is an ad-hoc hack...
 				 */
 				static struct sockaddr_in6 sin6 =
 				{ sizeof(sin6), AF_INET6 };
 
 				struct mbuf *mm;
 				struct mrt6msg *im;
 #ifdef MRT6_OINIT
 				struct omrt6msg *oim;
 #endif
 
 				/* Private, contiguous copy of the header. */
 				mm = m_copym(m, 0, sizeof(struct ip6_hdr),
 				    M_NOWAIT);
 				if (mm &&
 				    (!M_WRITABLE(mm) ||
 				     mm->m_len < sizeof(struct ip6_hdr)))
 					mm = m_pullup(mm, sizeof(struct ip6_hdr));
 				if (mm == NULL)
 					return (ENOBUFS);
 
 #ifdef MRT6_OINIT
 				oim = NULL;
 #endif
 				im = NULL;
 				switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 				case MRT6_OINIT:
 					oim = mtod(mm, struct omrt6msg *);
 					oim->im6_msgtype = MRT6MSG_WRONGMIF;
 					oim->im6_mbz = 0;
 					break;
 #endif
 				case MRT6_INIT:
 					im = mtod(mm, struct mrt6msg *);
 					im->im6_msgtype = MRT6MSG_WRONGMIF;
 					im->im6_mbz = 0;
 					break;
 				default:
 					m_freem(mm);
 					return (EINVAL);
 				}
 
 				/* Find the mif the packet really came in on. */
 				for (mifp = mif6table, iif = 0;
 				     iif < nummifs && mifp &&
 					     mifp->m6_ifp != ifp;
 				     mifp++, iif++)
 					;
 
 				switch (V_ip6_mrouter_ver) {
 #ifdef MRT6_OINIT
 				case MRT6_OINIT:
 					oim->im6_mif = iif;
 					sin6.sin6_addr = oim->im6_src;
 					break;
 #endif
 				case MRT6_INIT:
 					im->im6_mif = iif;
 					sin6.sin6_addr = im->im6_src;
 					break;
 				}
 
 				MRT6STAT_INC(mrt6s_upcalls);
 
 				/* socket_send() frees mm when it fails. */
 				if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) {
 					MRT6_DLOG(DEBUG_ANY,
 					    "ip6_mrouter socket queue full");
 					MRT6STAT_INC(mrt6s_upq_sockfull);
 					return (ENOBUFS);
 				}	/* if socket Q full */
 			}		/* if PIM */
 		return (0);
 	}			/* if wrong iif */
 
 	/* If I sourced this packet, it counts as output, else it was input. */
 	if (m->m_pkthdr.rcvif == NULL) {
 		/* XXX: is rcvif really NULL when output?? */
 		mif6table[mifi].m6_pkt_out++;
 		mif6table[mifi].m6_bytes_out += plen;
 	} else {
 		mif6table[mifi].m6_pkt_in++;
 		mif6table[mifi].m6_bytes_in += plen;
 	}
 	rt->mf6c_pkt_cnt++;
 	rt->mf6c_byte_cnt += plen;
 
 	/*
 	 * For each mif, forward a copy of the packet if there are group
 	 * members downstream on the interface.
 	 */
 	src0 = ip6->ip6_src;
 	dst0 = ip6->ip6_dst;
 	if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 ||
 	    (error = in6_setscope(&dst0, ifp, &idzone)) != 0) {
 		IP6STAT_INC(ip6s_badscope);
 		return (error);
 	}
 	for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) {
 		if (IF_ISSET(mifi, &rt->mf6c_ifset)) {
 			/*
 			 * check if the outgoing packet is going to break
 			 * a scope boundary.
 			 * XXX For packets through PIM register tunnel
 			 * interface, we believe a routing daemon.
 			 */
 			if (!(mif6table[rt->mf6c_parent].m6_flags &
 			      MIFF_REGISTER) &&
 			    !(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
 				if (in6_setscope(&src0, mif6table[mifi].m6_ifp,
 				    &oszone) ||
 				    in6_setscope(&dst0, mif6table[mifi].m6_ifp,
 				    &odzone) ||
 				    iszone != oszone ||
 				    idzone != odzone) {
 					IP6STAT_INC(ip6s_badscope);
 					continue;
 				}
 			}
 
 			mifp->m6_pkt_out++;
 			mifp->m6_bytes_out += plen;
 			if (mifp->m6_flags & MIFF_REGISTER)
 				register_send(ip6, mifp, m);
 			else
 				phyint_send(ip6, mifp, m);
 		}
 	}
 	return (0);
 }
 
 static void
 phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m)
 {
 #ifdef MRT6DEBUG
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 #endif
 	struct mbuf *mb_copy;
 	struct ifnet *ifp = mifp->m6_ifp;
 	int error __unused = 0;
 	u_long linkmtu;
 
 	/*
 	 * Make a new reference to the packet; make sure that
 	 * the IPv6 header is actually copied, not just referenced,
 	 * so that ip6_output() only scribbles on the copy.
 	 */
 	mb_copy = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 	if (mb_copy &&
 	    (!M_WRITABLE(mb_copy) || mb_copy->m_len < sizeof(struct ip6_hdr)))
 		mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr));
 	if (mb_copy == NULL) {
 		return;
 	}
 	/* set MCAST flag to the outgoing packet */
 	mb_copy->m_flags |= M_MCAST;
 
 	/*
 	 * If we sourced the packet, call ip6_output since we may devide
 	 * the packet into fragments when the packet is too big for the
 	 * outgoing interface.
 	 * Otherwise, we can simply send the packet to the interface
 	 * sending queue.
 	 */
 	if (m->m_pkthdr.rcvif == NULL) {
 		struct ip6_moptions im6o;
 		struct epoch_tracker et;
 
 		im6o.im6o_multicast_ifp = ifp;
 		/* XXX: ip6_output will override ip6->ip6_hlim */
 		im6o.im6o_multicast_hlim = ip6->ip6_hlim;
 		im6o.im6o_multicast_loop = 1;
 		NET_EPOCH_ENTER(et);
 		error = ip6_output(mb_copy, NULL, NULL, IPV6_FORWARDING, &im6o,
 		    NULL, NULL);
 		NET_EPOCH_EXIT(et);
 
 		MRT6_DLOG(DEBUG_XMIT, "mif %u err %d",
 		    (uint16_t)(mifp - mif6table), error);
 		return;
 	}
 
 	/*
 	 * If configured to loop back multicasts by default,
 	 * loop back a copy now.
 	 */
 	if (in6_mcast_loop)
 		ip6_mloopback(ifp, m);
 
 	/*
 	 * Put the packet into the sending queue of the outgoing interface
 	 * if it would fit in the MTU of the interface.
 	 */
 	linkmtu = IN6_LINKMTU(ifp);
 	if (mb_copy->m_pkthdr.len <= linkmtu || linkmtu < IPV6_MMTU) {
 		struct sockaddr_in6 dst6;
 
 		bzero(&dst6, sizeof(dst6));
 		dst6.sin6_len = sizeof(struct sockaddr_in6);
 		dst6.sin6_family = AF_INET6;
 		dst6.sin6_addr = ip6->ip6_dst;
 
 		IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6);
 		/*
 		 * We just call if_output instead of nd6_output here, since
 		 * we need no ND for a multicast forwarded packet...right?
 		 */
 		m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 		error = (*ifp->if_output)(ifp, mb_copy,
 		    (struct sockaddr *)&dst6, NULL);
 		MRT6_DLOG(DEBUG_XMIT, "mif %u err %d",
 		    (uint16_t)(mifp - mif6table), error);
 	} else {
 		/*
 		 * pMTU discovery is intentionally disabled by default, since
 		 * various router may notify pMTU in multicast, which can be
 		 * a DDoS to a router
 		 */
 		if (V_ip6_mcast_pmtu)
 			icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0, linkmtu);
 		else {
 			MRT6_DLOG(DEBUG_XMIT, " packet too big on %s o %s "
 			    "g %s size %d (discarded)", if_name(ifp),
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    mb_copy->m_pkthdr.len);
 			m_freem(mb_copy); /* simply discard the packet */
 		}
 	}
 }
 
 static int
 register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m)
 {
 #ifdef MRT6DEBUG
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 #endif
 	struct mbuf *mm;
 	int i, len = m->m_pkthdr.len;
 	static struct sockaddr_in6 sin6 = { sizeof(sin6), AF_INET6 };
 	struct mrt6msg *im6;
 
 	MRT6_DLOG(DEBUG_ANY, "src %s dst %s",
 	    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 	    ip6_sprintf(ip6bufd, &ip6->ip6_dst));
 	PIM6STAT_INC(pim6s_snd_registers);
 
 	/* Make a copy of the packet to send to the user level process. */
 	mm = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mm == NULL)
 		return (ENOBUFS);
 	mm->m_data += max_linkhdr;
 	mm->m_len = sizeof(struct ip6_hdr);
 
 	if ((mm->m_next = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) {
 		m_freem(mm);
 		return (ENOBUFS);
 	}
 	i = MHLEN - M_LEADINGSPACE(mm);
 	if (i > len)
 		i = len;
 	mm = m_pullup(mm, i);
 	if (mm == NULL)
 		return (ENOBUFS);
 /* TODO: check it! */
 	mm->m_pkthdr.len = len + sizeof(struct ip6_hdr);
 
 	/*
 	 * Send message to routing daemon
 	 */
 	sin6.sin6_addr = ip6->ip6_src;
 
 	im6 = mtod(mm, struct mrt6msg *);
 	im6->im6_msgtype      = MRT6MSG_WHOLEPKT;
 	im6->im6_mbz          = 0;
 
 	im6->im6_mif = mif - mif6table;
 
 	/* iif info is not given for reg. encap.n */
 	MRT6STAT_INC(mrt6s_upcalls);
 
 	if (socket_send(V_ip6_mrouter, mm, &sin6) < 0) {
 		MRT6_DLOG(DEBUG_ANY, "ip6_mrouter socket queue full");
 		MRT6STAT_INC(mrt6s_upq_sockfull);
 		return (ENOBUFS);
 	}
 	return (0);
 }
 
 /*
  * Encapsulation check used by the encap6_input() path to decide at
  * runtime whether a packet is for PIM, which allows PIM to be loaded
  * into the kernel dynamically.  Every packet offered is claimed; a
  * KASSERT verifies that the protocol is IPPROTO_PIM.
  */
 static int
 pim6_encapcheck(const struct mbuf *m __unused, int off __unused,
     int proto __unused, void *arg __unused)
 {
 	/* Value returned to claim the datagram. */
 	const int claim = 8;
 
 	KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
 	return (claim);
 }
 
 /*
  * PIM sparse mode hook
  * Receives the pim control messages, and passes them up to the listening
  * socket, using rip6_input.
  * The only message processed is the REGISTER pim message; the pim header
  * is stripped off, and the inner packet is looped back through the
  * register mif's interface with if_simloop().
  *
  * m is the received packet, off the offset of the PIM header within it
  * and proto the protocol number handed on to rip6_input().
  *
  * Returns IPPROTO_DONE when the packet was dropped here (too short, bad
  * checksum, bad version, malformed register, allocation failure);
  * otherwise returns whatever rip6_input() returns.
  */
 static int
 pim6_input(struct mbuf *m, int off, int proto, void *arg __unused)
 {
 	struct pim *pim; /* pointer to a pim struct */
 	struct ip6_hdr *ip6;
 	int pimlen;
 	int minlen;
 
 	PIM6STAT_INC(pim6s_rcv_total);
 
 	/*
 	 * Validate lengths
 	 */
 	pimlen = m->m_pkthdr.len - off;
 	if (pimlen < PIM_MINLEN) {
 		PIM6STAT_INC(pim6s_rcv_tooshort);
 		MRT6_DLOG(DEBUG_PIM, "PIM packet too short");
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * if the packet is at least as big as a REGISTER, go ahead
 	 * and grab the PIM REGISTER header size, to avoid another
 	 * possible m_pullup() later.
 	 *
 	 * PIM_MINLEN       == pimhdr + u_int32 == 8
 	 * PIM6_REG_MINLEN   == pimhdr + reghdr + eip6hdr == 4 + 4 + 40
 	 */
 	minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN;
 
 	/*
 	 * Make sure that the IP6 and PIM headers in contiguous memory, and
 	 * possibly the PIM REGISTER header
 	 */
 	if (m->m_len < off + minlen) {
 		m = m_pullup(m, off + minlen);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return (IPPROTO_DONE);
 		}
 	}
 	/* Derive header pointers only after the pullup, which may replace m. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	pim = (struct pim *)((caddr_t)ip6 + off);
 
 #define PIM6_CHECKSUM
 #ifdef PIM6_CHECKSUM
 	{
 		int cksumlen;
 
 		/*
 		 * Validate checksum.
 		 * If PIM REGISTER, exclude the data packet
 		 */
 		if (pim->pim_type == PIM_REGISTER)
 			cksumlen = PIM_MINLEN;
 		else
 			cksumlen = pimlen;
 
 		if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) {
 			PIM6STAT_INC(pim6s_rcv_badsum);
 			MRT6_DLOG(DEBUG_PIM, "invalid checksum");
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 #endif /* PIM6_CHECKSUM */
 
 	/* PIM version check */
 	if (pim->pim_ver != PIM_VERSION) {
 		PIM6STAT_INC(pim6s_rcv_badversion);
 		MRT6_DLOG(DEBUG_ANY | DEBUG_ERR,
 		    "incorrect version %d, expecting %d",
 		    pim->pim_ver, PIM_VERSION);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	if (pim->pim_type == PIM_REGISTER) {
 		/*
 		 * since this is a REGISTER, we'll make a copy of the register
 		 * headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the
 		 * routing daemon.
 		 */
 		static struct sockaddr_in6 dst = { sizeof(dst), AF_INET6 };
 
 		struct mbuf *mcp;
 		struct ip6_hdr *eip6;
 		u_int32_t *reghdr;
 #ifdef MRT6DEBUG
 		char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 #endif
 
 		PIM6STAT_INC(pim6s_rcv_registers);
 
 		/* A register can only be processed if a register mif exists. */
 		if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) {
 			MRT6_DLOG(DEBUG_PIM, "register mif not set: %d",
 			    reg_mif_num);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/* The 32-bit register header directly follows the PIM header. */
 		reghdr = (u_int32_t *)(pim + 1);
 
 		/* Null registers carry no data to forward; pass them up as is. */
 		if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
 			goto pim6_input_to_daemon;
 
 		/*
 		 * Validate length
 		 */
 		if (pimlen < PIM6_REG_MINLEN) {
 			PIM6STAT_INC(pim6s_rcv_tooshort);
 			PIM6STAT_INC(pim6s_rcv_badregisters);
 			MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "register packet "
 			    "size too small %d from %s",
 			    pimlen, ip6_sprintf(ip6bufs, &ip6->ip6_src));
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/* The encapsulated IPv6 header follows the register header. */
 		eip6 = (struct ip6_hdr *) (reghdr + 1);
 		MRT6_DLOG(DEBUG_PIM, "eip6: %s -> %s, eip6 plen %d",
 		    ip6_sprintf(ip6bufs, &eip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &eip6->ip6_dst),
 		    ntohs(eip6->ip6_plen));
 
 		/* verify the version number of the inner packet */
 		if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
 			PIM6STAT_INC(pim6s_rcv_badregisters);
 			MRT6_DLOG(DEBUG_ANY, "invalid IP version (%d) "
 			    "of the inner packet",
 			    (eip6->ip6_vfc & IPV6_VERSION));
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/* verify the inner packet is destined to a mcast group */
 		if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) {
 			PIM6STAT_INC(pim6s_rcv_badregisters);
 			MRT6_DLOG(DEBUG_PIM, "inner packet of register "
 			    "is not multicast %s",
 			    ip6_sprintf(ip6bufd, &eip6->ip6_dst));
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/*
 		 * make a copy of the whole header to pass to the daemon later.
 		 */
 		mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_NOWAIT);
 		if (mcp == NULL) {
 			MRT6_DLOG(DEBUG_ANY | DEBUG_ERR, "pim register: "
 			    "could not copy register head");
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 
 		/*
 		 * forward the inner ip6 packet; point m_data at the inner ip6.
 		 */
 		m_adj(m, off + PIM_MINLEN);
 		MRT6_DLOG(DEBUG_PIM, "forwarding decapsulated register: "
 		    "src %s, dst %s, mif %d",
 		    ip6_sprintf(ip6bufs, &eip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &eip6->ip6_dst), reg_mif_num);
 
 		/* Hand the inner packet to the register mif's interface. */
 		if_simloop(mif6table[reg_mif_num].m6_ifp, m,
 				dst.sin6_family, 0);
 
 		/* prepare the register head to send to the mrouting daemon */
 		m = mcp;
 	}
 
 	/*
 	 * Pass the PIM message up to the daemon; if it is a register message
 	 * pass the 'head' only up to the daemon. This includes the
 	 * encapsulator ip6 header, pim header, register header and the
 	 * encapsulated ip6 header.
 	 */
   pim6_input_to_daemon:
 	return (rip6_input(&m, &off, proto));
 }
 
 /*
  * Module event handler for ip6_mroute.
  *
  * MOD_LOAD initialises the three locks, attaches the PIM encapsulation
  * handler and installs the X_* entry points into the ip6_mforward,
  * ip6_mrouter_* and mrt6_ioctl hooks.  MOD_UNLOAD is refused with EINVAL
  * while V_ip6_mrouter is set; otherwise it detaches the encapsulation
  * handler, runs X_ip6_mrouter_done(), clears the hooks and destroys the
  * locks.  Any other event type yields EOPNOTSUPP.
  */
 static int
 ip6_mroute_modevent(module_t mod, int type, void *unused)
 {
 
 	if (type == MOD_LOAD) {
 		MROUTER6_LOCK_INIT();
 		MFC6_LOCK_INIT();
 		MIF6_LOCK_INIT();
 
 		pim6_encap_cookie = ip6_encap_attach(&ipv6_encap_cfg,
 		    NULL, M_WAITOK);
 		if (pim6_encap_cookie == NULL) {
 			/* Undo the lock setup before failing the load. */
 			printf("ip6_mroute: unable to attach pim6 encap\n");
 			MIF6_LOCK_DESTROY();
 			MFC6_LOCK_DESTROY();
 			MROUTER6_LOCK_DESTROY();
 			return (EINVAL);
 		}
 
 		ip6_mforward = X_ip6_mforward;
 		ip6_mrouter_done = X_ip6_mrouter_done;
 		ip6_mrouter_get = X_ip6_mrouter_get;
 		ip6_mrouter_set = X_ip6_mrouter_set;
 		mrt6_ioctl = X_mrt6_ioctl;
 		return (0);
 	}
 
 	if (type != MOD_UNLOAD)
 		return (EOPNOTSUPP);
 
 	/* Refuse to unload while a multicast router socket is registered. */
 	if (V_ip6_mrouter != NULL)
 		return (EINVAL);
 
 	if (pim6_encap_cookie != NULL) {
 		ip6_encap_detach(pim6_encap_cookie);
 		pim6_encap_cookie = NULL;
 	}
 	X_ip6_mrouter_done();
 	ip6_mforward = NULL;
 	ip6_mrouter_done = NULL;
 	ip6_mrouter_get = NULL;
 	ip6_mrouter_set = NULL;
 	mrt6_ioctl = NULL;
 
 	MIF6_LOCK_DESTROY();
 	MFC6_LOCK_DESTROY();
 	MROUTER6_LOCK_DESTROY();
 
 	return (0);
 }
 
 /*
  * Module descriptor: the module name, its event handler
  * (ip6_mroute_modevent) and a third member initialised to 0.
  */
 static moduledata_t ip6_mroutemod = {
 	"ip6_mroute",
 	ip6_mroute_modevent,
 	0
 };
 
 /* Register the module at SI_SUB_PROTO_MC with order SI_ORDER_ANY. */
 DECLARE_MODULE(ip6_mroute, ip6_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_ANY);
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index ebb7f7bc6ed1..bce5ed846227 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -1,3387 +1,3388 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 
 #include <machine/in_cksum.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/ethernet.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/pfil.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_rss.h>
 
 #include <netipsec/ipsec_support.h>
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netinet6/scope6_var.h>
 
 extern int in6_mcast_loop;
 
 /*
  * The mbufs ip6_output() juggles while composing the extension header
  * chain of an outgoing packet.  ip6_output() zeroes the structure before
  * use, so a NULL member means that header is not present.
  */
 struct ip6_exthdrs {
 	struct mbuf *ip6e_ip6;		/* mbuf holding the IPv6 header */
 	struct mbuf *ip6e_hbh;		/* Hop-by-Hop options header */
 	struct mbuf *ip6e_dest1;	/* Destination options (1st part) */
 	struct mbuf *ip6e_rthdr;	/* Routing header */
 	struct mbuf *ip6e_dest2;	/* Destination options (2nd part) */
 };
 
 /* malloc(9) type M_IP6OPT: short name "ip6opt", description "IPv6 options". */
 static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options");
 
 /* Forward declarations of file-local (static) helpers. */
 static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
 			   struct ucred *, int);
 static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *,
 	struct socket *, struct sockopt *);
 static int ip6_getpcbopt(struct inpcb *, int, struct sockopt *);
 static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *,
 	struct ucred *, int, int, int);
 
 static int ip6_copyexthdr(struct mbuf **, caddr_t, int);
 static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
 	struct ip6_frag **);
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu(struct route_in6 *, int,
 	struct ifnet *, const struct in6_addr *, u_long *, int *, u_int,
 	u_int);
 static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long,
 	u_long *, int *, u_int);
 static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *);
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 /*
  * Make an extension header from option data.
  *
  * hp  - the source option header; when it is NULL the macro does nothing.
  *       Note that hp is evaluated more than once.
  * mp  - address of the mbuf pointer passed to ip6_copyexthdr() as the
  *       destination.
  * _ol - running option length; the m_len of the mbuf found in *mp after
  *       the call is added to it.
  *
  * The byte count given to ip6_copyexthdr() is (ip6e_len + 1) << 3, read
  * from the header itself.  The expansion site must provide an "error"
  * variable and a "freehdrs" label: a non-zero result of ip6_copyexthdr()
  * is left in "error" and control jumps to "freehdrs".
  */
 #define	MAKE_EXTHDR(hp, mp, _ol)					\
     do {								\
 	if (hp) {							\
 		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
 		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
 		    ((eh)->ip6e_len + 1) << 3);				\
 		if (error)						\
 			goto freehdrs;					\
 		(_ol) += (*(mp))->m_len;				\
 	}								\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Form a chain of extension headers.
  *
  * m  - the extension header mbuf to insert; when NULL nothing is done.
  * mp - the previous mbuf in the chain; m is linked right after it and
  *      mp is then advanced to m.
  * p  - pointer to the "next header" byte of the preceding header.  Its
  *      old value is stored in the first byte of m, *p is overwritten
  *      with i, and p is then moved to the first byte of m.
  * i  - the type of option, i.e. the protocol value written to *p.
  *
  * The expansion site must provide "hdrsplit" and "exthdrs"; the macro
  * panics if m is set while hdrsplit is false.
  */
 #define MAKE_CHAIN(m, mp, p, i)\
     do {\
 	if (m) {\
 		if (!hdrsplit) \
 			panic("%s:%d: assumption failed: "\
 			    "hdr not split: hdrsplit %d exthdrs %p",\
 			    __func__, __LINE__, hdrsplit, &exthdrs);\
 		*mtod((m), u_char *) = *(p);\
 		*(p) = (i);\
 		p = mtod((m), u_char *);\
 		(m)->m_next = (mp)->m_next;\
 		(mp)->m_next = (m);\
 		(mp) = (m);\
 	}\
     } while (/*CONSTCOND*/ 0)
 
 /*
  * Compute a checksum that was deferred for packet m and store it in
  * the packet.
  *
  * in_cksum_skip() is called with a length of offset + plen and a skip of
  * offset.  For packets flagged CSUM_UDP_IPV6 a result of zero is replaced
  * by 0xffff.  The checksum is written at offset plus
  * m->m_pkthdr.csum_data: directly when both bytes lie within the first
  * mbuf, through m_copyback() otherwise.
  */
 void
 in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset)
 {
 	u_short sum, where;
 
 	sum = in_cksum_skip(m, offset + plen, offset);
 	if (sum == 0 && (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6) != 0)
 		sum = 0xffff;
 
 	/* csum_data holds the checksum field's offset past "offset". */
 	where = offset + m->m_pkthdr.csum_data;
 
 	if (where + sizeof(sum) <= m->m_len) {
 		/* Field is entirely inside the leading mbuf. */
 		*(u_short *)mtodo(m, where) = sum;
 		return;
 	}
 	m_copyback(m, where, sizeof(sum), (caddr_t)&sum);
 }
 
 static void
 ip6_output_delayed_csum(struct mbuf *m, struct ifnet *ifp, int csum_flags,
     int plen, int optlen)
 {
 
 	KASSERT((plen >= optlen), ("%s:%d: plen %d < optlen %d, m %p, ifp %p "
 	    "csum_flags %#x",
 	    __func__, __LINE__, plen, optlen, m, ifp, csum_flags));
 
 	if (csum_flags & CSUM_DELAY_DATA_IPV6) {
 		in6_delayed_cksum(m, plen - optlen,
 		    sizeof(struct ip6_hdr) + optlen);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (csum_flags & CSUM_SCTP_IPV6) {
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr) + optlen);
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 	}
 #endif
 }
 
 /*
  * Build the fragments of the IPv6 packet m0.
  *
  * ifp       - interface whose ifs6_out_fragcreat counter is incremented
  *             for every fragment created.
  * m0        - the original packet.  The fragments are linked, in order,
  *             starting at m0->m_nextpkt; m0 itself is not freed here.
  * hlen      - offset in m0 at which payload copying starts; also passed
  *             to ip6_insertfraghdr().
  * nextproto - value stored in each fragment header's ip6f_nxt.
  * fraglen   - payload bytes per fragment; asserted to be a multiple of 8.
  * id        - value stored as-is in each fragment header's ip6f_ident.
  *
  * Returns 0 on success.  On failure ip6s_odropped is incremented and
  * ENOBUFS (mbuf allocation, packet header duplication or payload copy
  * failed) or the error from ip6_insertfraghdr() is returned.  Fragments
  * already linked at that point stay on the m0->m_nextpkt chain --
  * presumably for the caller to release; confirm against callers.
  */
 int
 ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto,
     int fraglen , uint32_t id)
 {
 	struct mbuf *m, **mnext, *m_frgpart;
 	struct ip6_hdr *ip6, *mhip6;
 	struct ip6_frag *ip6f;
 	int off;
 	int error;
 	int tlen = m0->m_pkthdr.len;
 
 	KASSERT((fraglen % 8 == 0), ("Fragment length must be a multiple of 8"));
 
 	m = m0;
 	ip6 = mtod(m, struct ip6_hdr *);
 	/* The first fragment will be hung off the original packet. */
 	mnext = &m->m_nextpkt;
 
 	for (off = hlen; off < tlen; off += fraglen) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (!m) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			/* Not linked yet, so it has to be freed here. */
 			m_free(m);
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 
 		/* Append the new fragment to the m_nextpkt chain. */
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 		/* Skip max_linkhdr bytes at the front of the new mbuf. */
 		m->m_data += max_linkhdr;
 		/* Each fragment starts from a copy of the IPv6 header. */
 		mhip6 = mtod(m, struct ip6_hdr *);
 		*mhip6 = *ip6;
 		m->m_len = sizeof(*mhip6);
 		error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
 		if (error) {
 			IP6STAT_INC(ip6s_odropped);
 			return (error);
 		}
 		/* Offset is relative to hlen, with the low 3 bits cleared. */
 		ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7));
 		if (off + fraglen >= tlen)
 			/* Last fragment: carry whatever is left. */
 			fraglen = tlen - off;
 		else
 			ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
 		/* Payload length counts everything after the IPv6 header. */
 		mhip6->ip6_plen = htons((u_short)(fraglen + hlen +
 		    sizeof(*ip6f) - sizeof(struct ip6_hdr)));
 		if ((m_frgpart = m_copym(m0, off, fraglen, M_NOWAIT)) == NULL) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 		m_cat(m, m_frgpart);
 		m->m_pkthdr.len = fraglen + hlen + sizeof(*ip6f);
 		ip6f->ip6f_reserved = 0;
 		ip6f->ip6f_ident = id;
 		ip6f->ip6f_nxt = nextproto;
 		IP6STAT_INC(ip6s_ofragments);
 		in6_ifstat_inc(ifp, ifs6_out_fragcreat);
 	}
 
 	return (0);
 }
 
 /*
  * Hand one packet to nd6_output_ifp(), optionally stamping a send tag
  * on it first.
  *
  * inp       - PCB the packet belongs to, or NULL.  Its send tag is
  *             considered only under RATELIMIT; it is also passed to the
  *             EAGAIN handlers.
  * ifp       - transmit interface.  A send tag whose ifp differs makes
  *             this function drop the packet with EAGAIN.
  * origifp, dst, ro - passed through to nd6_output_ifp().
  * m         - the packet; asserted not to carry CSUM_SND_TAG yet.
  * stamp_tag - whether a send tag found here is attached to m; forced to
  *             true when m->m_next carries a TLS session (KERN_TLS).
  *
  * Returns the result of nd6_output_ifp(), or EAGAIN when the packet was
  * freed here.  For packets with a TLS session an EAGAIN result is
  * replaced by the return value of ktls_output_eagain().
  */
 static int
 ip6_output_send(struct inpcb *inp, struct ifnet *ifp, struct ifnet *origifp,
     struct mbuf *m, struct sockaddr_in6 *dst, struct route_in6 *ro,
     bool stamp_tag)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	struct m_snd_tag *mst;
 	int error;
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mst = NULL;
 
 #ifdef KERN_TLS
 	/*
 	 * If this is an unencrypted TLS record, save a reference to
 	 * the record.  This local reference is used to call
 	 * ktls_output_eagain after the mbuf has been freed (thus
 	 * dropping the mbuf's reference) in if_output.
 	 */
 	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
 		tls = ktls_hold(m->m_next->m_epg_tls);
 		mst = tls->snd_tag;
 
 		/*
 		 * If a TLS session doesn't have a valid tag, it must
 		 * have had an earlier ifp mismatch, so drop this
 		 * packet.
 		 */
 		if (mst == NULL) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 		/*
 		 * Always stamp tags that include NIC ktls.
 		 */
 		stamp_tag = true;
 	}
 #endif
 #ifdef RATELIMIT
 	/* No TLS tag chosen above: fall back to the PCB's send tag. */
 	if (inp != NULL && mst == NULL) {
 		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
 		    (inp->inp_snd_tag != NULL &&
 		    inp->inp_snd_tag->ifp != ifp))
 			in_pcboutput_txrtlmt(inp, ifp, m);
 
 		if (inp->inp_snd_tag != NULL)
 			mst = inp->inp_snd_tag;
 	}
 #endif
 	if (stamp_tag && mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		/* A tag for another interface cannot be used: drop. */
 		if (mst->ifp != ifp) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 
 		/* stamp send tag on mbuf */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
 		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	}
 
 	error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro);
 
 done:
 	/* Check for route change invalidating send tags. */
 #ifdef KERN_TLS
 	if (tls != NULL) {
 		if (error == EAGAIN)
 			error = ktls_output_eagain(inp, tls);
 		/* Drop the reference taken by ktls_hold() above. */
 		ktls_free(tls);
 	}
 #endif
 #ifdef RATELIMIT
 	if (error == EAGAIN)
 		in_pcboutput_eagain(inp);
 #endif
 	return (error);
 }
 
 /*
  * IP6 output.
  * The packet in mbuf chain m contains a skeletal IP6 header (with pri, len,
  * nxt, hlim, src, dst).
  * This function may modify ver and hlim only.
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route_in6 ro is present and has ro_nh initialized, route lookup would be
  * skipped and ro->ro_nh would be used. If ro is present but ro->ro_nh is NULL,
  * then result of route lookup is stored in ro->ro_nh.
  *
  * Type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and nd_ifinfo.linkmtu
  * is uint32_t.  So we use u_long to hold largest one, which is rt_mtu.
  *
  * ifpp - XXX: just for statistics
  */
 int
 ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
     struct route_in6 *ro, int flags, struct ip6_moptions *im6o,
     struct ifnet **ifpp, struct inpcb *inp)
 {
 	struct ip6_hdr *ip6;
 	struct ifnet *ifp, *origifp;
 	struct mbuf *m = m0;
 	struct mbuf *mprev;
 	struct route_in6 *ro_pmtu;
 	struct nhop_object *nh;
 	struct sockaddr_in6 *dst, sin6, src_sa, dst_sa;
 	struct in6_addr odst;
 	u_char *nexthdrp;
 	int tlen, len;
 	int error = 0;
 	int vlan_pcp = -1;
 	struct in6_ifaddr *ia = NULL;
 	u_long mtu;
 	int alwaysfrag, dontfrag;
 	u_int32_t optlen, plen = 0, unfragpartlen;
 	struct ip6_exthdrs exthdrs;
 	struct in6_addr src0, dst0;
 	u_int32_t zone;
 	bool hdrsplit;
 	int sw_csum, tso;
 	int needfiblookup;
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
 
 	NET_EPOCH_ASSERT();
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			/* Unconditionally set flowid. */
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 		if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
 			vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
 			    INP_2PCP_SHIFT;
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/*
 	 * IPSec checking which handles several cases.
 	 * FAST IPSEC: We re-injected the packet.
 	 * XXX: need scope argument.
 	 */
 	if (IPSEC_ENABLED(ipv6)) {
 		if ((error = IPSEC_OUTPUT(ipv6, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 #endif /* IPSEC */
 
 	/* Source address validation. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
 	    (flags & IPV6_UNSPECSRC) == 0) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
 		error = EOPNOTSUPP;
 		IP6STAT_INC(ip6s_badscope);
 		goto bad;
 	}
 
 	/*
 	 * If we are given packet options to add extension headers prepare them.
 	 * Calculate the total length of the extension header chain.
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	bzero(&exthdrs, sizeof(exthdrs));
 	optlen = 0;
 	unfragpartlen = sizeof(struct ip6_hdr);
 	if (opt) {
 		/* Hop-by-Hop options header. */
 		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen);
 
 		/* Destination options header (1st part). */
 		if (opt->ip6po_rthdr) {
 #ifndef RTHDR_SUPPORT_IMPLEMENTED
 			/*
 			 * If there is a routing header, discard the packet
 			 * right away here. RH0/1 are obsolete and we do not
 			 * currently support RH2/3/4.
 			 * People trying to use RH253/254 may want to disable
 			 * this check.
 			 * The moment we do support any routing header (again)
 			 * this block should check the routing type more
 			 * selectively.
 			 */
 			error = EINVAL;
 			goto bad;
 #endif
 
 			/*
 			 * Destination options header (1st part).
 			 * This only makes sense with a routing header.
 			 * See Section 9.2 of RFC 3542.
 			 * Disabling this part just for MIP6 convenience is
 			 * a bad idea.  We need to think carefully about a
 			 * way to make the advanced API coexist with MIP6
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
 			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1,
 			    optlen);
 		}
 		/* Routing header. */
 		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen);
 
 		unfragpartlen += optlen;
 
 		/*
 		 * NOTE: we don't add AH/ESP length here (done in
 		 * ip6_ipsec_output()).
 		 */
 
 		/* Destination options header (2nd part). */
 		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen);
 	}
 
 	/*
 	 * If there is at least one extension header,
 	 * separate IP6 header from the payload.
 	 */
 	hdrsplit = false;
 	if (optlen) {
 		if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 			m = NULL;
 			goto freehdrs;
 		}
 		m = exthdrs.ip6e_ip6;
 		ip6 = mtod(m, struct ip6_hdr *);
 		hdrsplit = true;
 	}
 
 	/* Adjust mbuf packet header length. */
 	m->m_pkthdr.len += optlen;
 	plen = m->m_pkthdr.len - sizeof(*ip6);
 
 	/* If this is a jumbo payload, insert a jumbo payload option. */
 	if (plen > IPV6_MAXPACKET) {
 		if (!hdrsplit) {
 			if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
 				m = NULL;
 				goto freehdrs;
 			}
 			m = exthdrs.ip6e_ip6;
 			ip6 = mtod(m, struct ip6_hdr *);
 			hdrsplit = true;
 		}
 		if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0)
 			goto freehdrs;
 		ip6->ip6_plen = 0;
 	} else
 		ip6->ip6_plen = htons(plen);
 	nexthdrp = &ip6->ip6_nxt;
 
 	if (optlen) {
 		/*
 		 * Concatenate headers and fill in next header fields.
 		 * Here we have, on "m"
 		 *	IPv6 payload
 		 * and we insert headers accordingly.
 		 * Finally, we should be getting:
 		 *	IPv6 hbh dest1 rthdr ah* [esp* dest2 payload].
 		 *
 		 * During the header composing process "m" points to IPv6
 		 * header.  "mprev" points to an extension header prior to esp.
 		 */
 		mprev = m;
 
 		/*
 		 * We treat dest2 specially.  This makes IPsec processing
 		 * much easier.  The goal here is to make mprev point the
 		 * mbuf prior to dest2.
 		 *
 		 * Result: IPv6 dest2 payload.
 		 * m and mprev will point to IPv6 header.
 		 */
 		if (exthdrs.ip6e_dest2) {
 			if (!hdrsplit)
 				panic("%s:%d: assumption failed: "
 				    "hdr not split: hdrsplit %d exthdrs %p",
 				    __func__, __LINE__, hdrsplit, &exthdrs);
 			exthdrs.ip6e_dest2->m_next = m->m_next;
 			m->m_next = exthdrs.ip6e_dest2;
 			*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_DSTOPTS;
 		}
 
 		/*
 		 * Result: IPv6 hbh dest1 rthdr dest2 payload.
 		 * m will point to IPv6 header.  mprev will point to the
 		 * extension header prior to dest2 (rthdr in the above case).
 		 */
 		MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
 		MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
 			   IPPROTO_DSTOPTS);
 		MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
 			   IPPROTO_ROUTING);
 	}
 
 	IP6STAT_INC(ip6s_localout);
 
 	/* Route packet. */
 	ro_pmtu = ro;
 	if (opt && opt->ip6po_rthdr)
 		ro = &opt->ip6po_route;
 	if (ro != NULL)
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
 	else
 		dst = &sin6;
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 
 again:
 	/*
 	 * If specified, try to fill in the traffic class field.
 	 * Do not override if a non-zero value is already set.
 	 * We check the diffserv field and the ECN field separately.
 	 */
 	if (opt && opt->ip6po_tclass >= 0) {
 		int mask = 0;
 
 		if (IPV6_DSCP(ip6) == 0)
 			mask |= 0xfc;
 		if (IPV6_ECN(ip6) == 0)
 			mask |= 0x03;
 		if (mask != 0)
 			ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
 	}
 
 	/* Fill in or override the hop limit field, if necessary. */
 	if (opt && opt->ip6po_hlim != -1)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
 			ip6->ip6_hlim = im6o->im6o_multicast_hlim;
 		else
 			ip6->ip6_hlim = V_ip6_defmcasthlim;
 	}
 
 	if (ro == NULL || ro->ro_nh == NULL) {
 		bzero(dst, sizeof(*dst));
 		dst->sin6_family = AF_INET6;
 		dst->sin6_len = sizeof(*dst);
 		dst->sin6_addr = ip6->ip6_dst;
 	} 
 	/*
 	 * Validate route against routing table changes.
 	 * Make sure that the address family is set in route.
 	 */
 	nh = NULL;
 	ifp = NULL;
 	mtu = 0;
 	if (ro != NULL) {
 		if (ro->ro_nh != NULL && inp != NULL) {
 			ro->ro_dst.sin6_family = AF_INET6; /* XXX KASSERT? */
 			NH_VALIDATE((struct route *)ro, &inp->inp_rt_cookie,
 			    fibnum);
 		}
 		if (ro->ro_nh != NULL && fwd_tag == NULL &&
 		    (!NH_IS_VALID(ro->ro_nh) ||
 		    ro->ro_dst.sin6_family != AF_INET6 ||
 		    !IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)))
 			RO_INVALIDATE_CACHE(ro);
 
 		if (ro->ro_nh != NULL && fwd_tag == NULL &&
 		    ro->ro_dst.sin6_family == AF_INET6 &&
 		    IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) {
 			/* Nexthop is valid and contains valid ifp */
 			nh = ro->ro_nh;
 		} else {
 			if (ro->ro_lle)
 				LLE_FREE(ro->ro_lle);	/* zeros ro_lle */
 			ro->ro_lle = NULL;
 			if (fwd_tag == NULL) {
 				bzero(&dst_sa, sizeof(dst_sa));
 				dst_sa.sin6_family = AF_INET6;
 				dst_sa.sin6_len = sizeof(dst_sa);
 				dst_sa.sin6_addr = ip6->ip6_dst;
 			}
 			error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp,
 			    &nh, fibnum, m->m_pkthdr.flowid);
 			if (error != 0) {
 				IP6STAT_INC(ip6s_noroute);
 				if (ifp != NULL)
 					in6_ifstat_inc(ifp, ifs6_out_discard);
 				goto bad;
 			}
 			/*
 			 * At this point at least @ifp is not NULL
 			 * Can be the case when dst is multicast, link-local or
 			 * interface is explicitly specified by the caller.
 			 */
 		}
 		if (nh == NULL) {
 			/*
 			 * If in6_selectroute() does not return a nexthop
 			 * dst may not have been updated.
 			 */
 			*dst = dst_sa;	/* XXX */
 			origifp = ifp;
 			mtu = ifp->if_mtu;
 		} else {
 			ifp = nh->nh_ifp;
 			origifp = nh->nh_aifp;
 			ia = (struct in6_ifaddr *)(nh->nh_ifa);
 			counter_u64_add(nh->nh_pksent, 1);
 		}
 	} else {
 		struct nhop_object *nh;
 		struct in6_addr kdst;
 		uint32_t scopeid;
 
 		if (fwd_tag == NULL) {
 			bzero(&dst_sa, sizeof(dst_sa));
 			dst_sa.sin6_family = AF_INET6;
 			dst_sa.sin6_len = sizeof(dst_sa);
 			dst_sa.sin6_addr = ip6->ip6_dst;
 		}
 
 		if (IN6_IS_ADDR_MULTICAST(&dst_sa.sin6_addr) &&
 		    im6o != NULL &&
 		    (ifp = im6o->im6o_multicast_ifp) != NULL) {
 			/* We do not need a route lookup. */
 			*dst = dst_sa;	/* XXX */
 			origifp = ifp;
 			goto nonh6lookup;
 		}
 
 		in6_splitscope(&dst_sa.sin6_addr, &kdst, &scopeid);
 
 		if (IN6_IS_ADDR_MC_LINKLOCAL(&dst_sa.sin6_addr) ||
 		    IN6_IS_ADDR_MC_NODELOCAL(&dst_sa.sin6_addr)) {
 			if (scopeid > 0) {
 				ifp = in6_getlinkifnet(scopeid);
 				if (ifp == NULL) {
 					error = EHOSTUNREACH;
 					goto bad;
 				}
 				*dst = dst_sa;	/* XXX */
 				origifp = ifp;
 				goto nonh6lookup;
 			}
 		}
 
 		nh = fib6_lookup(fibnum, &kdst, scopeid, NHR_NONE,
 		    m->m_pkthdr.flowid);
 		if (nh == NULL) {
 			IP6STAT_INC(ip6s_noroute);
 			/* No ifp in6_ifstat_inc(ifp, ifs6_out_discard); */
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 
 		ifp = nh->nh_ifp;
 		origifp = nh->nh_aifp;
 		ia = ifatoia6(nh->nh_ifa);
 		if (nh->nh_flags & NHF_GATEWAY)
 			dst->sin6_addr = nh->gw6_sa.sin6_addr;
 		else if (fwd_tag != NULL)
 			dst->sin6_addr = dst_sa.sin6_addr;
 nonh6lookup:
 		;
 	}
 	/*
 	 * At this point ifp MUST be pointing to the valid transmit ifp.
 	 * origifp MUST be valid and pointing to either the same ifp or,
 	 * in case of loopback output, to the interface which ip6_src
 	 * belongs to.
 	 * Examples:
 	 *  fe80::1%em0 -> fe80::2%em0 -> ifp=em0, origifp=em0
 	 *  fe80::1%em0 -> fe80::1%em0 -> ifp=lo0, origifp=em0
 	 *  ::1 -> ::1 -> ifp=lo0, origifp=lo0
 	 *
 	 * mtu can be 0 and will be refined later.
 	 */
 	KASSERT((ifp != NULL), ("output interface must not be NULL"));
 	KASSERT((origifp != NULL), ("output address interface must not be NULL"));
 
 	if ((flags & IPV6_FORWARDING) == 0) {
 		/* XXX: the FORWARDING flag can be set for mrouting. */
 		in6_ifstat_inc(ifp, ifs6_out_request);
 	}
 
 	/* Setup data structures for scope ID checks. */
 	src0 = ip6->ip6_src;
 	bzero(&src_sa, sizeof(src_sa));
 	src_sa.sin6_family = AF_INET6;
 	src_sa.sin6_len = sizeof(src_sa);
 	src_sa.sin6_addr = ip6->ip6_src;
 
 	dst0 = ip6->ip6_dst;
 	/* Re-initialize to be sure. */
 	bzero(&dst_sa, sizeof(dst_sa));
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
 
 	/* Check for valid scope ID. */
 	if (in6_setscope(&src0, origifp, &zone) == 0 &&
 	    sa6_recoverscope(&src_sa) == 0 && zone == src_sa.sin6_scope_id &&
 	    in6_setscope(&dst0, origifp, &zone) == 0 &&
 	    sa6_recoverscope(&dst_sa) == 0 && zone == dst_sa.sin6_scope_id) {
 		/*
 		 * The outgoing interface is in the zone of the source
 		 * and destination addresses.
 		 *
 		 */
 	} else if ((origifp->if_flags & IFF_LOOPBACK) == 0 ||
 	    sa6_recoverscope(&src_sa) != 0 ||
 	    sa6_recoverscope(&dst_sa) != 0 ||
 	    dst_sa.sin6_scope_id == 0 ||
 	    (src_sa.sin6_scope_id != 0 &&
 	    src_sa.sin6_scope_id != dst_sa.sin6_scope_id) ||
 	    ifnet_byindex(dst_sa.sin6_scope_id) == NULL) {
 		/*
 		 * If the destination network interface is not a
 		 * loopback interface, or the destination network
 		 * address has no scope ID, or the source address has
 		 * a scope ID set which is different from the
 		 * destination address one, or there is no network
 		 * interface representing this scope ID, the address
 		 * pair is considered invalid.
 		 */
 		IP6STAT_INC(ip6s_badscope);
 		in6_ifstat_inc(origifp, ifs6_out_discard);
 		if (error == 0)
 			error = EHOSTUNREACH; /* XXX */
 		goto bad;
 	}
 	/* All scope ID checks are successful. */
 
 	if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (opt && opt->ip6po_nextroute.ro_nh) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
 			 * address.
 			 */
 			dst = (struct sockaddr_in6 *)opt->ip6po_nexthop;
 		}
 		else if ((nh->nh_flags & NHF_GATEWAY))
 			dst = &nh->gw6_sa;
 	}
 
 	if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		m->m_flags &= ~(M_BCAST | M_MCAST); /* Just in case. */
 	} else {
 		m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
 		in6_ifstat_inc(ifp, ifs6_out_mcast);
 
 		/* Confirm that the outgoing interface supports multicast. */
 		if (!(ifp->if_flags & IFF_MULTICAST)) {
 			IP6STAT_INC(ip6s_noroute);
 			in6_ifstat_inc(ifp, ifs6_out_discard);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		if ((im6o == NULL && in6_mcast_loop) ||
 		    (im6o && im6o->im6o_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we have not joined
 			 * the address; protocols will filter it later,
 			 * thus deferring a hash lookup and lock acquisition
 			 * at the expense of an m_copym().
 			 */
 			ip6_mloopback(ifp, m);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IPV6_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip6_mloopback(),
 			 * above, will be forwarded by the ip6_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
 				/*
 				 * XXX: ip6_mforward expects that rcvif is NULL
 				 * when it is called from the originating path.
 				 * However, it may not always be the case.
 				 */
 				m->m_pkthdr.rcvif = NULL;
 				if (ip6_mforward(ip6, ifp, m) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 		/*
 		 * Multicasts with a hoplimit of zero may be looped back,
 		 * above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip6_mloopback() will
 		 * loop back a copy if this host actually belongs to the
 		 * destination group on the loopback interface.
 		 */
 		if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
 		    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
 			m_freem(m);
 			goto done;
 		}
 	}
 
 	/*
 	 * Fill the outgoing interface to tell the upper layer
 	 * to increment per-interface statistics.
 	 */
 	if (ifpp)
 		*ifpp = ifp;
 
 	/* Determine path MTU. */
 	if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst,
 		    &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0)
 		goto bad;
 	KASSERT(mtu > 0, ("%s:%d: mtu %ld, ro_pmtu %p ro %p ifp %p "
 	    "alwaysfrag %d fibnum %u\n", __func__, __LINE__, mtu, ro_pmtu, ro,
 	    ifp, alwaysfrag, fibnum));
 
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
 	 * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
 	 * setting.  The logic is a bit complicated; by default, unicast
 	 * packets will follow path MTU while multicast packets will be sent at
 	 * the minimum MTU.  If IP6PO_MINMTU_ALL is specified, all packets
 	 * including unicast ones will be sent at the minimum MTU.  Multicast
 	 * packets will always be sent at the minimum MTU unless
 	 * IP6PO_MINMTU_DISABLE is explicitly specified.
 	 * See RFC 3542 for more details.
 	 */
 	if (mtu > IPV6_MMTU) {
 		if ((flags & IPV6_MINMTU))
 			mtu = IPV6_MMTU;
 		else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
 			mtu = IPV6_MMTU;
 		else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 			 (opt == NULL ||
 			  opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
 			mtu = IPV6_MMTU;
 		}
 	}
 
 	/*
 	 * Clear embedded scope identifiers if necessary.
 	 * in6_clearscope() will touch the addresses only when necessary.
 	 */
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/*
 	 * If the outgoing packet contains a hop-by-hop options header,
 	 * it must be examined and processed even by the source node.
 	 * (RFC 2460, section 4.)
 	 */
 	if (exthdrs.ip6e_hbh) {
 		struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *);
 		u_int32_t dummy; /* XXX unused */
 		u_int32_t plen = 0; /* XXX: ip6_process will check the value */
 
 #ifdef DIAGNOSTIC
 		if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len)
 			panic("ip6e_hbh is not contiguous");
 #endif
 		/*
 		 *  XXX: if we have to send an ICMPv6 error to the sender,
 		 *       we need the M_LOOP flag since icmp6_error() expects
 		 *       the IPv6 and the hop-by-hop options header are
 		 *       contiguous unless the flag is set.
 		 */
 		m->m_flags |= M_LOOP;
 		m->m_pkthdr.rcvif = ifp;
 		if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1),
 		    ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh),
 		    &dummy, &plen) < 0) {
 			/* m was already freed at this point. */
 			error = EINVAL;/* better error? */
 			goto done;
 		}
 		m->m_flags &= ~M_LOOP; /* XXX */
 		m->m_pkthdr.rcvif = NULL;
 	}
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (!PFIL_HOOKED_OUT(V_inet6_pfil_head))
 		goto passout;
 
 	odst = ip6->ip6_dst;
 	/* Run through list of hooks for output packets. */
 	switch (pfil_mbuf_out(V_inet6_pfil_head, &m, ifp, inp)) {
 	case PFIL_PASS:
 		ip6 = mtod(m, struct ip6_hdr *);
 		break;
 	case PFIL_DROPPED:
 		error = EACCES;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		goto done;
 	}
 
 	needfiblookup = 0;
 	/* See if destination IP address was changed by packet filter. */
 	if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip6_input(). */
 		if (in6_localip(&ip6->ip6_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 				m->m_pkthdr.csum_flags |=
 				    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			error = netisr_queue(NETISR_IPV6, m);
 			goto done;
 		} else {
 			if (ro != NULL)
 				RO_INVALIDATE_CACHE(ro);
 			needfiblookup = 1; /* Redo the routing table lookup. */
 		}
 	}
 	/* See if fib was changed by packet filter. */
 	if (fibnum != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		fibnum = M_GETFIB(m);
 		if (ro != NULL)
 			RO_INVALIDATE_CACHE(ro);
 		needfiblookup = 1;
 	}
 	if (needfiblookup)
 		goto again;
 
 	/* See if local, if yes, send it to netisr. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 			m->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		error = netisr_queue(NETISR_IPV6, m);
 		goto done;
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		if (ro != NULL)
 			dst = (struct sockaddr_in6 *)&ro->ro_dst;
 		else
 			dst = &sin6;
 		bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP6_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 		goto again;
 	}
 
 passout:
 	if (vlan_pcp > -1)
 		EVL_APPLY_PRI(m, vlan_pcp);
 
 	/* Ensure the packet data is mapped if the interface requires it. */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_odropped);
 			return (ENOBUFS);
 		}
 	}
 
 	/*
 	 * Send the packet to the outgoing interface.
 	 * If necessary, do IPv6 fragmentation before sending.
 	 *
 	 * The logic here is rather complex:
 	 * 1: normal case (dontfrag == 0, alwaysfrag == 0)
 	 * 1-a:	send as is if tlen <= path mtu
 	 * 1-b:	fragment if tlen > path mtu
 	 *
 	 * 2: if user asks us not to fragment (dontfrag == 1)
 	 * 2-a:	send as is if tlen <= interface mtu
 	 * 2-b:	error if tlen > interface mtu
 	 *
 	 * 3: if we always need to attach fragment header (alwaysfrag == 1)
 	 *	always fragment
 	 *
 	 * 4: if dontfrag == 1 && alwaysfrag == 1
 	 *	error, as we cannot handle this conflicting request.
 	 */
 	sw_csum = m->m_pkthdr.csum_flags;
 	if (!hdrsplit) {
 		tso = ((sw_csum & ifp->if_hwassist &
 		    (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
 		sw_csum &= ~ifp->if_hwassist;
 	} else
 		tso = 0;
 	/*
 	 * If we added extension headers, we will not do TSO and calculate the
 	 * checksums ourselves for now.
 	 * XXX-BZ  Need a framework to know when the NIC can handle it, even
 	 * with ext. hdrs.
 	 */
 	ip6_output_delayed_csum(m, ifp, sw_csum, plen, optlen);
 	/* XXX-BZ m->m_pkthdr.csum_flags &= ~ifp->if_hwassist; */
 	tlen = m->m_pkthdr.len;
 
 	if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso)
 		dontfrag = 1;
 	else
 		dontfrag = 0;
 	if (dontfrag && alwaysfrag) {	/* Case 4. */
 		/* Conflicting request - can't transmit. */
 		error = EMSGSIZE;
 		goto bad;
 	}
 	if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) {	/* Case 2-b. */
 		/*
 		 * Even if the DONTFRAG option is specified, we cannot send the
 		 * packet when the data length is larger than the MTU of the
 		 * outgoing interface.
 		 * Notify the error by sending IPV6_PATHMTU ancillary data if
 		 * application wanted to know the MTU value. Also return an
 		 * error code (this is not described in the API spec).
 		 */
 		if (inp != NULL)
 			ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu);
 		error = EMSGSIZE;
 		goto bad;
 	}
 
 	/* Transmit packet without fragmentation. */
 	if (dontfrag || (!alwaysfrag && tlen <= mtu)) {	/* Cases 1-a and 2-a. */
 		struct in6_ifaddr *ia6;
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
 		if (ia6) {
 			/* Record statistics for this interface address. */
 			counter_u64_add(ia6->ia_ifa.ifa_opackets, 1);
 			counter_u64_add(ia6->ia_ifa.ifa_obytes,
 			    m->m_pkthdr.len);
 		}
 		error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
 		    (flags & IP_NO_SND_TAG_RL) ? false : true);
 		goto done;
 	}
 
 	/* Try to fragment the packet.  Cases 1-b and 3. */
 	if (mtu < IPV6_MMTU) {
 		/* Path MTU cannot be less than IPV6_MMTU. */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else if (ip6->ip6_plen == 0) {
 		/* Jumbo payload cannot be fragmented. */
 		error = EMSGSIZE;
 		in6_ifstat_inc(ifp, ifs6_out_fragfail);
 		goto bad;
 	} else {
 		u_char nextproto;
 
 		/*
 		 * Too large for the destination or interface;
 		 * fragment if possible.
 		 * Must be able to put at least 8 bytes per fragment.
 		 */
 		if (mtu > IPV6_MAXPACKET)
 			mtu = IPV6_MAXPACKET;
 
 		len = (mtu - unfragpartlen - sizeof(struct ip6_frag)) & ~7;
 		if (len < 8) {
 			error = EMSGSIZE;
 			in6_ifstat_inc(ifp, ifs6_out_fragfail);
 			goto bad;
 		}
 
 		/*
 		 * If the interface will not calculate checksums on
 		 * fragmented packets, then do it here.
 		 * XXX-BZ handle the hw offloading case.  Need flags.
 		 */
 		ip6_output_delayed_csum(m, ifp, m->m_pkthdr.csum_flags, plen,
 		    optlen);
 
 		/*
 		 * Change the next header field of the last header in the
 		 * unfragmentable part.
 		 */
 		if (exthdrs.ip6e_rthdr) {
 			nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
 			*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_dest1) {
 			nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
 			*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
 		} else if (exthdrs.ip6e_hbh) {
 			nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
 			*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
 		} else {
 			ip6 = mtod(m, struct ip6_hdr *);
 			nextproto = ip6->ip6_nxt;
 			ip6->ip6_nxt = IPPROTO_FRAGMENT;
 		}
 
 		/*
 		 * Loop through length of segment after first fragment,
 		 * make new header and copy data of each part and link onto
 		 * chain.
 		 */
 		m0 = m;
 		id = htonl(ip6_randomid());
 		error = ip6_fragment(ifp, m, unfragpartlen, nextproto,len, id);
 		if (error != 0)
 			goto sendorfree;
 
 		in6_ifstat_inc(ifp, ifs6_out_fragok);
 	}
 
 	/* Remove leading garbage. */
 sendorfree:
 	m = m0->m_nextpkt;
 	m0->m_nextpkt = 0;
 	m_freem(m0);
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			if (vlan_pcp > -1)
 				EVL_APPLY_PRI(m, vlan_pcp);
 			error = ip6_output_send(inp, ifp, origifp, m, dst, ro,
 			    true);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IP6STAT_INC(ip6s_fragmented);
 
 done:
 	return (error);
 
 freehdrs:
 	m_freem(exthdrs.ip6e_hbh);	/* m_freem() checks if mbuf is NULL. */
 	m_freem(exthdrs.ip6e_dest1);
 	m_freem(exthdrs.ip6e_rthdr);
 	m_freem(exthdrs.ip6e_dest2);
 	/* FALLTHROUGH */
 bad:
 	if (m)
 		m_freem(m);
 	goto done;
 }
 
 /*
  * Copy an extension header of hlen bytes from hdr into a newly allocated
  * mbuf and hand the mbuf back through *mp.
  *
  * m_get() is used when hlen does not exceed MLEN, m_getcl() otherwise.
  * When hdr is NULL the mbuf is still allocated and its length set, but
  * nothing is copied into it.
  *
  * Returns 0 on success, or ENOBUFS when hlen exceeds MCLBYTES or the
  * allocation fails.  *mp is written only on success.
  */
 static int
 ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen)
 {
 	struct mbuf *copy;
 
 	/* XXX: headers larger than MCLBYTES are simply refused. */
 	if (hlen > MCLBYTES)
 		return (ENOBUFS);
 
 	copy = (hlen <= MLEN) ? m_get(M_NOWAIT, MT_DATA) :
 	    m_getcl(M_NOWAIT, MT_DATA, 0);
 	if (copy == NULL)
 		return (ENOBUFS);
 
 	copy->m_len = hlen;
 	if (hdr != NULL)
 		bcopy(hdr, mtod(copy, caddr_t), hlen);
 	*mp = copy;
 
 	return (0);
 }
 
 /*
  * Insert a Jumbo Payload option into the hop-by-hop options header kept
  * in exthdrs->ip6e_hbh.
  *
  * JUMBOOPTLEN (8) bytes are added in total: two leading bytes followed by
  * the 6-byte option (type, length 4, 32-bit value).  The value stored is
  * plen + JUMBOOPTLEN in network byte order.
  *
  *  - If no hop-by-hop header exists yet, a new mbuf is allocated; the two
  *    leading bytes are then the extension header's own first two bytes.
  *    Only the length byte is set here (to 0); byte 0 is left untouched,
  *    presumably to be filled in when the caller chains the headers.
  *  - If one exists, the two leading bytes are a PadN option and the
  *    header's ip6h_len is increased by one 8-byte unit.  The bytes are
  *    appended in place when the mbuf has room; otherwise the old contents
  *    are copied into a fresh cluster and the old mbuf is freed.
  *
  * On success the packet header length of exthdrs->ip6e_ip6 is increased
  * by JUMBOOPTLEN and 0 is returned.  ENOBUFS is returned when an
  * allocation fails or the grown header would not fit in a cluster; in
  * that case exthdrs is left unchanged.
  */
 static int
 ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
 {
 	struct mbuf *mopt;
 	u_char *optbuf;
 	u_int32_t v;
 
 #define JUMBOOPTLEN	8	/* length of jumbo payload option and padding */
 
 	/*
 	 * If there is no hop-by-hop options header, allocate new one.
 	 * If there is one but it doesn't have enough space to store the
 	 * jumbo payload option, allocate a cluster to store the whole options.
 	 * Otherwise, use it to store the options.
 	 */
 	if (exthdrs->ip6e_hbh == NULL) {
 		mopt = m_get(M_NOWAIT, MT_DATA);
 		if (mopt == NULL)
 			return (ENOBUFS);
 		mopt->m_len = JUMBOOPTLEN;
 		optbuf = mtod(mopt, u_char *);
 		optbuf[1] = 0;	/* = ((JUMBOOPTLEN) >> 3) - 1 */
 		exthdrs->ip6e_hbh = mopt;
 	} else {
 		struct ip6_hbh *hbh;
 
 		mopt = exthdrs->ip6e_hbh;
 		if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
 			/*
 			 * XXX assumption:
 			 * - exthdrs->ip6e_hbh is not referenced from places
 			 *   other than exthdrs.
 			 * - exthdrs->ip6e_hbh is not an mbuf chain.
 			 */
 			int oldoptlen = mopt->m_len;
 			struct mbuf *n;
 
 			/*
 			 * XXX: give up if the whole (new) hbh header does
 			 * not fit even in an mbuf cluster.
 			 */
 			if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
 				return (ENOBUFS);
 
 			/*
 			 * As a consequence, we must always prepare a cluster
 			 * at this point.
 			 */
 			n = m_getcl(M_NOWAIT, MT_DATA, 0);
 			if (n == NULL)
 				return (ENOBUFS);
 			n->m_len = oldoptlen + JUMBOOPTLEN;
 			bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t),
 			    oldoptlen);
 			optbuf = mtod(n, u_char *) + oldoptlen;
 			m_freem(mopt);
 			mopt = exthdrs->ip6e_hbh = n;
 		} else {
 			optbuf = mtod(mopt, u_char *) + mopt->m_len;
 			mopt->m_len += JUMBOOPTLEN;
 		}
 
 		/*
 		 * Two octets of padding in front of the jumbo option.  A PadN
 		 * option covering N octets carries Opt Data Len N - 2, so a
 		 * two-octet pad has length 0.  A length of 1 would make the
 		 * pad extend over the jumbo option's type byte at optbuf[2].
 		 */
 		optbuf[0] = IP6OPT_PADN;
 		optbuf[1] = 0;
 
 		/*
 		 * Adjust the header length according to the pad and
 		 * the jumbo payload option.
 		 */
 		hbh = mtod(mopt, struct ip6_hbh *);
 		hbh->ip6h_len += (JUMBOOPTLEN >> 3);
 	}
 
 	/* fill in the option. */
 	optbuf[2] = IP6OPT_JUMBO;
 	optbuf[3] = 4;
 	v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
 	bcopy(&v, &optbuf[4], sizeof(u_int32_t));
 
 	/* finally, adjust the packet header length */
 	exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
 
 	return (0);
 #undef JUMBOOPTLEN
 }
 
 /*
  * Give fragment m its copy of the unfragmentable headers and reserve
  * room for a fragment header behind them.
  *
  * When hlen exceeds the size of the fixed IPv6 header, the bytes of m0
  * between the fixed header and hlen are copied with m_copym() and linked
  * after m.  The fragment header is then placed at the end of the
  * resulting chain: in the trailing space of the last mbuf if that mbuf is
  * writable and large enough (m's packet header length is bumped in that
  * case only), otherwise in a newly allocated mbuf appended to the chain.
  *
  * *frghdrp is set to point at the reserved (uninitialized) fragment
  * header.  Returns 0 on success or ENOBUFS if an allocation fails.
  */
 static int
 ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
     struct ip6_frag **frghdrp)
 {
 	struct mbuf *tail, *mfrg;
 
 	tail = m;
 	if (hlen > sizeof(struct ip6_hdr)) {
 		/* Duplicate the extension headers that follow the IPv6 header. */
 		tail = m_copym(m0, sizeof(struct ip6_hdr),
 		    hlen - sizeof(struct ip6_hdr), M_NOWAIT);
 		if (tail == NULL)
 			return (ENOBUFS);
 		m->m_next = tail;
 	}
 
 	/* Walk to the last mbuf of the unfragmentable part. */
 	while (tail->m_next != NULL)
 		tail = tail->m_next;
 
 	if (!M_WRITABLE(tail) ||
 	    M_TRAILINGSPACE(tail) < sizeof(struct ip6_frag)) {
 		/* No usable room at the tail: use a separate mbuf. */
 		mfrg = m_get(M_NOWAIT, MT_DATA);
 		if (mfrg == NULL)
 			return (ENOBUFS);
 		mfrg->m_len = sizeof(struct ip6_frag);
 		*frghdrp = mtod(mfrg, struct ip6_frag *);
 		tail->m_next = mfrg;
 		return (0);
 	}
 
 	/* Carve the fragment header out of the tail's trailing space. */
 	*frghdrp = (struct ip6_frag *)(mtod(tail, caddr_t) + tail->m_len);
 	tail->m_len += sizeof(struct ip6_frag);
 	m->m_pkthdr.len += sizeof(struct ip6_frag);
 
 	return (0);
 }
 
 /*
  * Look up the IPv6 path MTU towards @dst in FIB @fibnum and store it
  * in @mtup.
  *
  * The route lookup and the MTU calculation run inside a network epoch
  * section entered here.
  *
  * Returns 0 on success.  EHOSTUNREACH is returned, and @mtup is left
  * untouched, when no route to @dst is found.
  */
 static int
 ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup)
 {
 	struct epoch_tracker et;
 	struct nhop_object *hop;
 	struct in6_addr addr;
 	uint32_t zone;
 	int rc = EHOSTUNREACH;
 
 	/* Separate the address from its scope id for the lookup. */
 	in6_splitscope(dst, &addr, &zone);
 
 	NET_EPOCH_ENTER(et);
 	hop = fib6_lookup(fibnum, &addr, zone, NHR_NONE, 0);
 	if (hop != NULL)
 		rc = ip6_calcmtu(hop->nh_ifp, dst, hop->nh_mtu, mtup, NULL, 0);
 	NET_EPOCH_EXIT(et);
 
 	return (rc);
 }
 
 /*
  * Determine the IPv6 path MTU for @dst when transmitting on @ifp, using
  * (and refreshing) the data cached in @ro_pmtu.
  *
  * A route lookup is considered only when @ro_pmtu is NULL or @do_lookup
  * is set.  In that case the MTU cached in @ro_pmtu is reused if it is
  * non-zero and was recorded for the same destination; otherwise the
  * destination is (re)recorded, the FIB is consulted, and the MTU of a
  * successful lookup is saved in @ro_pmtu so that later calls (e.g. after
  * packet filter processing) can skip the lookup.
  *
  * If @ro_pmtu carries a next hop (ro_nh), that next hop's MTU takes
  * precedence over whatever was found above.
  *
  * The final MTU and always-fragment indication are computed by
  * ip6_calcmtu() and stored into @mtup and @alwaysfragp.
  * Returns 0 on success.  Must be called within a network epoch section.
  */
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup,
     struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup,
     int *alwaysfragp, u_int fibnum, u_int proto)
 {
 	struct sockaddr_in6 scratch, *cached_dst;
 	struct nhop_object *hop;
 	struct in6_addr addr;
 	uint32_t zone;
 	u_long route_mtu = 0;
 
 	NET_EPOCH_ASSERT();
 
 	if (ro_pmtu == NULL || do_lookup) {
 		/*
 		 * ro_pmtu holds the final destination, whereas the route
 		 * used for transmission might describe an intermediate
 		 * one.  The MTU may differ, so key on ro_pmtu's address.
 		 */
 		if (ro_pmtu == NULL) {
 			cached_dst = &scratch;
 		} else {
 			cached_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst;
 			/* A cached MTU for another destination is stale. */
 			if (!IN6_ARE_ADDR_EQUAL(&cached_dst->sin6_addr, dst))
 				ro_pmtu->ro_mtu = 0;
 		}
 
 		if (ro_pmtu != NULL && ro_pmtu->ro_mtu != 0) {
 			route_mtu = ro_pmtu->ro_mtu;
 		} else {
 			bzero(cached_dst, sizeof(*cached_dst));
 			cached_dst->sin6_family = AF_INET6;
 			cached_dst->sin6_len = sizeof(struct sockaddr_in6);
 			cached_dst->sin6_addr = *dst;
 
 			in6_splitscope(dst, &addr, &zone);
 			hop = fib6_lookup(fibnum, &addr, zone, NHR_NONE, 0);
 			if (hop != NULL) {
 				route_mtu = hop->nh_mtu;
 				if (ro_pmtu != NULL)
 					ro_pmtu->ro_mtu = route_mtu;
 			}
 		}
 	}
 
 	/* A next hop attached to ro_pmtu overrides the value found above. */
 	if (ro_pmtu != NULL && ro_pmtu->ro_nh != NULL)
 		route_mtu = ro_pmtu->ro_nh->nh_mtu;
 
 	return (ip6_calcmtu(ifp, dst, route_mtu, mtup, alwaysfragp, proto));
 }
 
 /*
  * Compute the MTU to use towards @dst from the transmit interface @ifp,
  * the route MTU @rt_mtu and the TCP host cache entry for @dst.
  *
  * With a non-zero @rt_mtu, the host cache value (not consulted when
  * @proto is IPPROTO_TCP) is combined with @rt_mtu by taking the smaller
  * of the two; a result below IPV6_MMTU is raised to IPV6_MMTU and the
  * always-fragment indication is set.  With a zero @rt_mtu the link MTU
  * of @ifp is used, or EHOSTUNREACH is returned if @ifp is NULL.
  *
  * The MTU is stored into @mtup (0 on error) and the always-fragment
  * value into @alwaysfragp when that pointer is non-NULL.
  *
  * Returns 0 on success.
  */
 static int
 ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu,
     u_long *mtup, int *alwaysfragp, u_int proto)
 {
 	struct in_conninfo inc;
 	u_int32_t linkmtu;
 	u_long result = 0;
 	int force_frag = 0;
 	int error = 0;
 
 	if (rt_mtu == 0) {
 		/* No route MTU: fall back to the interface, if there is one. */
 		if (ifp != NULL)
 			result = IN6_LINKMTU(ifp);
 		else
 			error = EHOSTUNREACH; /* XXX */
 	} else {
 		bzero(&inc, sizeof(inc));
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc6_faddr = *dst;
 
 		linkmtu = IN6_LINKMTU(ifp);
 
 		/* TCP reacts to path MTU changes itself, so skip the host cache. */
 		if (proto != IPPROTO_TCP)
 			result = tcp_hc_getmtu(&inc);
 
 		result = (result != 0) ? min(result, rt_mtu) : rt_mtu;
 		if (result == 0) {
 			result = linkmtu;
 		} else if (result < IPV6_MMTU) {
 			/*
 			 * RFC2460 section 5, last paragraph:
 			 * once an ICMPv6 too big message with
 			 * mtu < IPV6_MMTU has been recorded, send packets
 			 * of IPV6_MMTU or less with a fragment header
 			 * attached.  (The fragment header is needed
 			 * regardless of the packet size, so that
 			 * translators can identify the packets.)
 			 */
 			force_frag = 1;
 			result = IPV6_MMTU;
 		}
 	}
 
 	*mtup = result;
 	if (alwaysfragp != NULL)
 		*alwaysfragp = force_frag;
 
 	return (error);
 }
 
 /*
  * IP6 socket option processing.
  */
 int
 ip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int optdatalen, uproto;
 	void *optdata;
 	struct inpcb *inp = sotoinpcb(so);
 	int error, optval;
 	int level, op, optname;
 	int optlen;
 	struct thread *td;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 /*
  * Don't use more than a quarter of mbuf clusters.  N.B.:
  * nmbclusters is an int, but nmbclusters * MCLBYTES may overflow
  * on LP64 architectures, so cast to u_long to avoid undefined
  * behavior.  ILP32 architectures cannot have nmbclusters
  * large enough to overflow for other reasons.
  */
 #define IPV6_PKTOPTIONS_MBUF_LIMIT	((u_long)nmbclusters * MCLBYTES / 4)
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 	td = sopt->sopt_td;
 	error = 0;
 	optval = 0;
 	uproto = (int)so->so_proto->pr_protocol;
 
 	if (level != IPPROTO_IPV6) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 	} else {		/* level == IPPROTO_IPV6 */
 		switch (op) {
 		case SOPT_SET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 			{
 				struct mbuf *m;
 
 				if (optlen > IPV6_PKTOPTIONS_MBUF_LIMIT) {
 					printf("ip6_ctloutput: mbuf limit hit\n");
 					error = ENOBUFS;
 					break;
 				}
 
 				error = soopt_getm(sopt, &m); /* XXX */
 				if (error != 0)
 					break;
 				error = soopt_mcopyin(sopt, m); /* XXX */
 				if (error != 0)
 					break;
 				INP_WLOCK(inp);
 				error = ip6_pcbopts(&inp->in6p_outputopts, m,
 				    so, sopt);
 				INP_WUNLOCK(inp);
 				m_freem(m); /* XXX */
 				break;
 			}
 
 			/*
 			 * Use of some Hop-by-Hop options or some
 			 * Destination options, might require special
 			 * privilege.  That is, normal applications
 			 * (without special privilege) might be forbidden
 			 * from setting certain options in outgoing packets,
 			 * and might never see certain options in received
 			 * packets. [RFC 2292 Section 6]
 			 * KAME specific note:
 			 *  KAME prevents non-privileged users from sending or
 			 *  receiving ANY hbh/dst options in order to avoid
 			 *  overhead of parsing options in the kernel.
 			 */
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 				if (td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_SETHDROPTS);
 					if (error)
 						break;
 				}
 				/* FALLTHROUGH */
 			case IPV6_UNICAST_HOPS:
 			case IPV6_HOPLIMIT:
 
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 			case IPV6_RECVTCLASS:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_V6ONLY:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_ORIGDSTADDR:
 			case IPV6_BINDANY:
 			case IPV6_BINDMULTI:
 #ifdef	RSS
 			case IPV6_RSS_LISTEN_BUCKET:
 #endif
 			case IPV6_VLAN_PCP:
 				if (optname == IPV6_BINDANY && td != NULL) {
 					error = priv_check(td,
 					    PRIV_NETINET_BINDANY);
 					if (error)
 						break;
 				}
 
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_UNICAST_HOPS:
 					if (optval < -1 || optval >= 256)
 						error = EINVAL;
 					else {
 						/* -1 = kernel default */
 						inp->in6p_hops = optval;
 						if ((inp->inp_vflag &
 						     INP_IPV4) != 0)
 							inp->inp_ip_ttl = optval;
 					}
 					break;
 #define OPTSET(bit) \
 do { \
 	INP_WLOCK(inp); \
 	if (optval) \
 		inp->inp_flags |= (bit); \
 	else \
 		inp->inp_flags &= ~(bit); \
 	INP_WUNLOCK(inp); \
 } while (/*CONSTCOND*/ 0)
 #define OPTSET2292(bit) \
 do { \
 	INP_WLOCK(inp); \
 	inp->inp_flags |= IN6P_RFC2292; \
 	if (optval) \
 		inp->inp_flags |= (bit); \
 	else \
 		inp->inp_flags &= ~(bit); \
 	INP_WUNLOCK(inp); \
 } while (/*CONSTCOND*/ 0)
 #define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
 
 #define OPTSET2_N(bit, val) do {					\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 } while (0)
 #define OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	OPTSET2_N(bit, val);						\
 	INP_WUNLOCK(inp);						\
 } while (0)
 #define OPTBIT2(bit) (inp->inp_flags2 & (bit) ? 1 : 0)
 #define OPTSET2292_EXCLUSIVE(bit)					\
 do {									\
 	INP_WLOCK(inp);							\
 	if (OPTBIT(IN6P_RFC2292)) {					\
 		error = EINVAL;						\
 	} else {							\
 		if (optval)						\
 			inp->inp_flags |= (bit);			\
 		else							\
 			inp->inp_flags &= ~(bit);			\
 	}								\
 	INP_WUNLOCK(inp);						\
 } while (/*CONSTCOND*/ 0)
 
 				case IPV6_RECVPKTINFO:
 					OPTSET2292_EXCLUSIVE(IN6P_PKTINFO);
 					break;
 
 				case IPV6_HOPLIMIT:
 				{
 					struct ip6_pktopts **optp;
 
 					/* cannot mix with RFC2292 */
 					if (OPTBIT(IN6P_RFC2292)) {
 						error = EINVAL;
 						break;
 					}
 					INP_WLOCK(inp);
 					if (inp->inp_flags & INP_DROPPED) {
 						INP_WUNLOCK(inp);
 						return (ECONNRESET);
 					}
 					optp = &inp->in6p_outputopts;
 					error = ip6_pcbopt(IPV6_HOPLIMIT,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(inp);
 					break;
 				}
 
 				case IPV6_RECVHOPLIMIT:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVHOPOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDR:
 					OPTSET2292_EXCLUSIVE(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					/*
 					 * We ignore this option for TCP
 					 * sockets.
 					 * (RFC3542 leaves this case
 					 * unspecified.)
 					 */
 					if (uproto != IPPROTO_TCP)
 						OPTSET(IN6P_MTU);
 					break;
 
 				case IPV6_RECVFLOWID:
 					OPTSET2(INP_RECVFLOWID, optval);
 					break;
 
 #ifdef	RSS
 				case IPV6_RECVRSSBUCKETID:
 					OPTSET2(INP_RECVRSSBUCKETID, optval);
 					break;
 #endif
 
 				case IPV6_V6ONLY:
 					INP_WLOCK(inp);
 					if (inp->inp_lport ||
 					    !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
 						/*
 						 * The socket is already bound.
 						 */
 						INP_WUNLOCK(inp);
 						error = EINVAL;
 						break;
 					}
 					if (optval) {
 						inp->inp_flags |= IN6P_IPV6_V6ONLY;
 						inp->inp_vflag &= ~INP_IPV4;
 					} else {
 						inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
 						inp->inp_vflag |= INP_IPV4;
 					}
 					INP_WUNLOCK(inp);
 					break;
 				case IPV6_RECVTCLASS:
 					/* cannot mix with RFC2292 XXX */
 					OPTSET2292_EXCLUSIVE(IN6P_TCLASS);
 					break;
 				case IPV6_AUTOFLOWLABEL:
 					OPTSET(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					OPTSET2(INP_ORIGDSTADDR, optval);
 					break;
 				case IPV6_BINDANY:
 					OPTSET(INP_BINDANY);
 					break;
 
 				case IPV6_BINDMULTI:
 					OPTSET2(INP_BINDMULTI, optval);
 					break;
 #ifdef	RSS
 				case IPV6_RSS_LISTEN_BUCKET:
 					if ((optval >= 0) &&
 					    (optval < rss_getnumbuckets())) {
 						INP_WLOCK(inp);
 						inp->inp_rss_listen_bucket = optval;
 						OPTSET2_N(INP_RSS_BUCKET_SET, 1);
 						INP_WUNLOCK(inp);
 					} else {
 						error = EINVAL;
 					}
 					break;
 #endif
 				case IPV6_VLAN_PCP:
 					if ((optval >= -1) && (optval <=
 					    (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
 						if (optval == -1) {
 							INP_WLOCK(inp);
 							inp->inp_flags2 &=
 							    ~(INP_2PCP_SET |
 							    INP_2PCP_MASK);
 							INP_WUNLOCK(inp);
 						} else {
 							INP_WLOCK(inp);
 							inp->inp_flags2 |=
 							    INP_2PCP_SET;
 							inp->inp_flags2 &=
 							    ~INP_2PCP_MASK;
 							inp->inp_flags2 |=
 							    optval <<
 							    INP_2PCP_SHIFT;
 							INP_WUNLOCK(inp);
 						}
 					} else
 						error = EINVAL;
 					break;
 				}
 				break;
 
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				if (optlen != sizeof(optval)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				{
 					struct ip6_pktopts **optp;
 					INP_WLOCK(inp);
 					if (inp->inp_flags & INP_DROPPED) {
 						INP_WUNLOCK(inp);
 						return (ECONNRESET);
 					}
 					optp = &inp->in6p_outputopts;
 					error = ip6_pcbopt(optname,
 					    (u_char *)&optval, sizeof(optval),
 					    optp, (td != NULL) ? td->td_ucred :
 					    NULL, uproto);
 					INP_WUNLOCK(inp);
 					break;
 				}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292DSTOPTS:
 			case IPV6_2292RTHDR:
 				/* RFC 2292 */
 				if (optlen != sizeof(int)) {
 					error = EINVAL;
 					break;
 				}
 				error = sooptcopyin(sopt, &optval,
 					sizeof optval, sizeof optval);
 				if (error)
 					break;
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					OPTSET2292(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					OPTSET2292(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					/*
 					 * Check super-user privilege.
 					 * See comments for IPV6_RECVHOPOPTS.
 					 */
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292DSTOPTS:
 					if (td != NULL) {
 						error = priv_check(td,
 						    PRIV_NETINET_SETHDROPTS);
 						if (error)
 							return (error);
 					}
 					OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
 					break;
 				case IPV6_2292RTHDR:
 					OPTSET2292(IN6P_RTHDR);
 					break;
 				}
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			{
 				/* new advanced API (RFC3542) */
 				u_char *optbuf;
 				u_char optbuf_storage[MCLBYTES];
 				int optlen;
 				struct ip6_pktopts **optp;
 
 				/* cannot mix with RFC2292 */
 				if (OPTBIT(IN6P_RFC2292)) {
 					error = EINVAL;
 					break;
 				}
 
 				/*
 				 * We only ensure valsize is not too large
 				 * here.  Further validation will be done
 				 * later.
 				 */
 				error = sooptcopyin(sopt, optbuf_storage,
 				    sizeof(optbuf_storage), 0);
 				if (error)
 					break;
 				optlen = sopt->sopt_valsize;
 				optbuf = optbuf_storage;
 				INP_WLOCK(inp);
 				if (inp->inp_flags & INP_DROPPED) {
 					INP_WUNLOCK(inp);
 					return (ECONNRESET);
 				}
 				optp = &inp->in6p_outputopts;
 				error = ip6_pcbopt(optname, optbuf, optlen,
 				    optp, (td != NULL) ? td->td_ucred : NULL,
 				    uproto);
 				INP_WUNLOCK(inp);
 				break;
 			}
 #undef OPTSET
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_JOIN_GROUP:
 			case IPV6_LEAVE_GROUP:
 			case IPV6_MSFILTER:
 			case MCAST_BLOCK_SOURCE:
 			case MCAST_UNBLOCK_SOURCE:
 			case MCAST_JOIN_GROUP:
 			case MCAST_LEAVE_GROUP:
 			case MCAST_JOIN_SOURCE_GROUP:
 			case MCAST_LEAVE_SOURCE_GROUP:
 				error = ip6_setmoptions(inp, sopt);
 				break;
 
 			case IPV6_PORTRANGE:
 				error = sooptcopyin(sopt, &optval,
 				    sizeof optval, sizeof optval);
 				if (error)
 					break;
 
 				INP_WLOCK(inp);
 				switch (optval) {
 				case IPV6_PORTRANGE_DEFAULT:
 					inp->inp_flags &= ~(INP_LOWPORT);
 					inp->inp_flags &= ~(INP_HIGHPORT);
 					break;
 
 				case IPV6_PORTRANGE_HIGH:
 					inp->inp_flags &= ~(INP_LOWPORT);
 					inp->inp_flags |= INP_HIGHPORT;
 					break;
 
 				case IPV6_PORTRANGE_LOW:
 					inp->inp_flags &= ~(INP_HIGHPORT);
 					inp->inp_flags |= INP_LOWPORT;
 					break;
 
 				default:
 					error = EINVAL;
 					break;
 				}
 				INP_WUNLOCK(inp);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, inp, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 
 		case SOPT_GET:
 			switch (optname) {
 			case IPV6_2292PKTOPTIONS:
 #ifdef IPV6_PKTOPTIONS
 			case IPV6_PKTOPTIONS:
 #endif
 				/*
 				 * RFC3542 (effectively) deprecated the
 				 * semantics of the 2292-style pktoptions.
 				 * Since it was not reliable in nature (i.e.,
 				 * applications had to expect the lack of some
 				 * information after all), it would make sense
 				 * to simplify this part by always returning
 				 * empty data.
 				 */
 				sopt->sopt_valsize = 0;
 				break;
 
 			case IPV6_RECVHOPOPTS:
 			case IPV6_RECVDSTOPTS:
 			case IPV6_RECVRTHDRDSTOPTS:
 			case IPV6_UNICAST_HOPS:
 			case IPV6_RECVPKTINFO:
 			case IPV6_RECVHOPLIMIT:
 			case IPV6_RECVRTHDR:
 			case IPV6_RECVPATHMTU:
 
 			case IPV6_V6ONLY:
 			case IPV6_PORTRANGE:
 			case IPV6_RECVTCLASS:
 			case IPV6_AUTOFLOWLABEL:
 			case IPV6_BINDANY:
 			case IPV6_FLOWID:
 			case IPV6_FLOWTYPE:
 			case IPV6_RECVFLOWID:
 #ifdef	RSS
 			case IPV6_RSSBUCKETID:
 			case IPV6_RECVRSSBUCKETID:
 #endif
 			case IPV6_BINDMULTI:
 			case IPV6_VLAN_PCP:
 				switch (optname) {
 				case IPV6_RECVHOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 
 				case IPV6_RECVDSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS);
 					break;
 
 				case IPV6_RECVRTHDRDSTOPTS:
 					optval = OPTBIT(IN6P_RTHDRDSTOPTS);
 					break;
 
 				case IPV6_UNICAST_HOPS:
 					optval = inp->in6p_hops;
 					break;
 
 				case IPV6_RECVPKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 
 				case IPV6_RECVHOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 
 				case IPV6_RECVRTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 
 				case IPV6_RECVPATHMTU:
 					optval = OPTBIT(IN6P_MTU);
 					break;
 
 				case IPV6_V6ONLY:
 					optval = OPTBIT(IN6P_IPV6_V6ONLY);
 					break;
 
 				case IPV6_PORTRANGE:
 				    {
 					int flags;
 					flags = inp->inp_flags;
 					if (flags & INP_HIGHPORT)
 						optval = IPV6_PORTRANGE_HIGH;
 					else if (flags & INP_LOWPORT)
 						optval = IPV6_PORTRANGE_LOW;
 					else
 						optval = 0;
 					break;
 				    }
 				case IPV6_RECVTCLASS:
 					optval = OPTBIT(IN6P_TCLASS);
 					break;
 
 				case IPV6_AUTOFLOWLABEL:
 					optval = OPTBIT(IN6P_AUTOFLOWLABEL);
 					break;
 
 				case IPV6_ORIGDSTADDR:
 					optval = OPTBIT2(INP_ORIGDSTADDR);
 					break;
 
 				case IPV6_BINDANY:
 					optval = OPTBIT(INP_BINDANY);
 					break;
 
 				case IPV6_FLOWID:
 					optval = inp->inp_flowid;
 					break;
 
 				case IPV6_FLOWTYPE:
 					optval = inp->inp_flowtype;
 					break;
 
 				case IPV6_RECVFLOWID:
 					optval = OPTBIT2(INP_RECVFLOWID);
 					break;
 #ifdef	RSS
 				case IPV6_RSSBUCKETID:
 					retval =
 					    rss_hash2bucket(inp->inp_flowid,
 					    inp->inp_flowtype,
 					    &rss_bucket);
 					if (retval == 0)
 						optval = rss_bucket;
 					else
 						error = EINVAL;
 					break;
 
 				case IPV6_RECVRSSBUCKETID:
 					optval = OPTBIT2(INP_RECVRSSBUCKETID);
 					break;
 #endif
 
 				case IPV6_BINDMULTI:
 					optval = OPTBIT2(INP_BINDMULTI);
 					break;
 
 				case IPV6_VLAN_PCP:
 					if (OPTBIT2(INP_2PCP_SET)) {
 						optval = (inp->inp_flags2 &
 							    INP_2PCP_MASK) >>
 							    INP_2PCP_SHIFT;
 					} else {
 						optval = -1;
 					}
 					break;
 				}
 
 				if (error)
 					break;
 				error = sooptcopyout(sopt, &optval,
 					sizeof optval);
 				break;
 
 			case IPV6_PATHMTU:
 			{
 				u_long pmtu = 0;
 				struct ip6_mtuinfo mtuinfo;
 				struct in6_addr addr;
 
 				if (!(so->so_state & SS_ISCONNECTED))
 					return (ENOTCONN);
 				/*
 				 * XXX: we dot not consider the case of source
 				 * routing, or optional information to specify
 				 * the outgoing interface.
 				 * Copy faddr out of inp to avoid holding lock
 				 * on inp during route lookup.
 				 */
 				INP_RLOCK(inp);
 				bcopy(&inp->in6p_faddr, &addr, sizeof(addr));
 				INP_RUNLOCK(inp);
 				error = ip6_getpmtu_ctl(so->so_fibnum,
 				    &addr, &pmtu);
 				if (error)
 					break;
 				if (pmtu > IPV6_MAXPACKET)
 					pmtu = IPV6_MAXPACKET;
 
 				bzero(&mtuinfo, sizeof(mtuinfo));
 				mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
 				optdata = (void *)&mtuinfo;
 				optdatalen = sizeof(mtuinfo);
 				error = sooptcopyout(sopt, optdata,
 				    optdatalen);
 				break;
 			}
 
 			case IPV6_2292PKTINFO:
 			case IPV6_2292HOPLIMIT:
 			case IPV6_2292HOPOPTS:
 			case IPV6_2292RTHDR:
 			case IPV6_2292DSTOPTS:
 				switch (optname) {
 				case IPV6_2292PKTINFO:
 					optval = OPTBIT(IN6P_PKTINFO);
 					break;
 				case IPV6_2292HOPLIMIT:
 					optval = OPTBIT(IN6P_HOPLIMIT);
 					break;
 				case IPV6_2292HOPOPTS:
 					optval = OPTBIT(IN6P_HOPOPTS);
 					break;
 				case IPV6_2292RTHDR:
 					optval = OPTBIT(IN6P_RTHDR);
 					break;
 				case IPV6_2292DSTOPTS:
 					optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
 					break;
 				}
 				error = sooptcopyout(sopt, &optval,
 				    sizeof optval);
 				break;
 			case IPV6_PKTINFO:
 			case IPV6_HOPOPTS:
 			case IPV6_RTHDR:
 			case IPV6_DSTOPTS:
 			case IPV6_RTHDRDSTOPTS:
 			case IPV6_NEXTHOP:
 			case IPV6_TCLASS:
 			case IPV6_DONTFRAG:
 			case IPV6_USE_MIN_MTU:
 			case IPV6_PREFER_TEMPADDR:
 				error = ip6_getpcbopt(inp, optname, sopt);
 				break;
 
 			case IPV6_MULTICAST_IF:
 			case IPV6_MULTICAST_HOPS:
 			case IPV6_MULTICAST_LOOP:
 			case IPV6_MSFILTER:
 				error = ip6_getmoptions(inp, sopt);
 				break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			case IPV6_IPSEC_POLICY:
 				if (IPSEC_ENABLED(ipv6)) {
 					error = IPSEC_PCBCTL(ipv6, inp, sopt);
 					break;
 				}
 				/* FALLTHROUGH */
 #endif /* IPSEC */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Socket option handler for raw IPv6 sockets.
  *
  * Only IPV6_CHECKSUM at level IPPROTO_IPV6 is handled here:
  *  - SOPT_SET validates the requested checksum offset and stores it in
  *    inp->in6p_cksum, except on ICMPv6 sockets where only the "no change"
  *    value (the offset of icmp6_cksum) is accepted.
  *  - SOPT_GET copies out the offset currently in effect.
  *
  * Returns 0 on success, EINVAL for a wrong level, a bad value/length or an
  * unknown direction, ENOPROTOOPT for any other option name, or the error
  * reported by sooptcopyin()/sooptcopyout().
  */
 int
 ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int error = 0, optval, optlen;
 	const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
 	struct inpcb *inp = sotoinpcb(so);
 	int level, op, optname;
 
 	level = sopt->sopt_level;
 	op = sopt->sopt_dir;
 	optname = sopt->sopt_name;
 	optlen = sopt->sopt_valsize;
 
 	if (level != IPPROTO_IPV6) {
 		return (EINVAL);
 	}
 
 	switch (optname) {
 	case IPV6_CHECKSUM:
 		/*
 		 * For ICMPv6 sockets, no modification allowed for checksum
 		 * offset, permit "no change" values to help existing apps.
 		 *
 		 * RFC3542 says: "An attempt to set IPV6_CHECKSUM
 		 * for an ICMPv6 socket will fail."
 		 * The current behavior does not meet RFC3542.
 		 */
 		switch (op) {
 		case SOPT_SET:
 			if (optlen != sizeof(int)) {
 				error = EINVAL;
 				break;
 			}
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 			/*
 			 * The API assumes non-negative even offset values
 			 * or -1 as a special value.  -1 has to be exempted
 			 * from the parity test explicitly: in C, -1 % 2
 			 * evaluates to -1, so a bare "(optval % 2) != 0"
 			 * would reject the special value as well.
 			 */
 			if (optval < -1 ||
 			    (optval != -1 && (optval % 2) != 0)) {
 				error = EINVAL;
 			} else if (inp->inp_ip_p == IPPROTO_ICMPV6) {
 				if (optval != icmp6off)
 					error = EINVAL;
 			} else
 				inp->in6p_cksum = optval;
 			break;
 
 		case SOPT_GET:
 			if (inp->inp_ip_p == IPPROTO_ICMPV6)
 				optval = icmp6off;
 			else
 				optval = inp->in6p_cksum;
 
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 
 	default:
 		error = ENOPROTOOPT;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Set up IP6 options in pcb for insertion in output packets or
  * specifying behavior of outgoing packets.
  */
 static int
 ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m,
     struct socket *so, struct sockopt *sopt)
 {
 	struct ip6_pktopts *opt = *pktopt;
 	int error = 0;
 	struct thread *td = sopt->sopt_td;
 	struct epoch_tracker et;
 
 	/* turn off any old options. */
 	if (opt) {
 #ifdef DIAGNOSTIC
 		if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
 		    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
 		    opt->ip6po_rhinfo.ip6po_rhi_rthdr)
 			printf("ip6_pcbopts: all specified options are cleared.\n");
 #endif
 		ip6_clearpktopts(opt, -1);
 	} else {
 		opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT);
 		if (opt == NULL)
 			return (ENOMEM);
 	}
 	*pktopt = NULL;
 
 	if (!m || m->m_len == 0) {
 		/*
 		 * Only turning off any previous options, regardless of
 		 * whether the opt is just created or given.
 		 */
 		free(opt, M_IP6OPT);
 		return (0);
 	}
 
 	/*  set options specified by user. */
 	NET_EPOCH_ENTER(et);
 	if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ?
 	    td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) {
 		ip6_clearpktopts(opt, -1); /* XXX: discard all options */
 		free(opt, M_IP6OPT);
 		NET_EPOCH_EXIT(et);
 		return (error);
 	}
 	NET_EPOCH_EXIT(et);
 	*pktopt = opt;
 	return (0);
 }
 
 /*
  * Reset an ip6_pktopts structure to its defaults.  Zeroing alone is not
  * enough: several fields use a non-zero value to mean "not set".
  */
 void
 ip6_initpktopts(struct ip6_pktopts *opt)
 {
 
 	bzero(opt, sizeof(*opt));
 
 	/* Policy fields default to their named constants. */
 	opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
 	opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
 
 	/* -1 selects the default traffic class and the default hop limit. */
 	opt->ip6po_tclass = -1;
 	opt->ip6po_hlim = -1;
 }
 
 /*
  * Set a single sticky option "optname" (value in buf/len) on the option set
  * referenced by *pktopt, allocating and initializing that set on first use.
  * Returns ENOBUFS if the allocation fails, otherwise whatever
  * ip6_setpktopt() returns.
  */
 static int
 ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
     struct ucred *cred, int uproto)
 {
 	struct epoch_tracker et;
 	struct ip6_pktopts *opt;
 	int ret;
 
 	opt = *pktopt;
 	if (opt == NULL) {
 		/* First sticky option on this pcb: create the container. */
 		opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT);
 		if (opt == NULL)
 			return (ENOBUFS);
 		ip6_initpktopts(opt);
 		*pktopt = opt;
 	}
 
 	/* ip6_setpktopt() asserts that it runs inside the network epoch. */
 	NET_EPOCH_ENTER(et);
 	ret = ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto);
 	NET_EPOCH_EXIT(et);
 
 	return (ret);
 }
 
 /*
  * Helpers for ip6_getpcbopt() that snapshot a variable-length option.
  *
  * GET_PKTOPT_VAR(field, lenexpr) expects these locals in the expanding
  * function: inp (read-locked on entry), sopt, pktopt, optdata, optdatalen
  * and malloc_optdata.  When pktopt->field is set it:
  *   1. drops the inp read lock, allocates sopt->sopt_valsize bytes with
  *      M_WAITOK, and takes the read lock again (presumably because a
  *      M_WAITOK allocation may sleep -- confirm against malloc(9));
  *   2. RETURNS ECONNRESET FROM THE ENCLOSING FUNCTION, after unlocking and
  *      freeing the buffer, if the pcb was marked INP_DROPPED meanwhile;
  *   3. re-reads inp->in6p_outputopts and, if the field is still set, copies
  *      min(lenexpr, sopt->sopt_valsize) bytes into optdata; otherwise frees
  *      the buffer and resets optdata/malloc_optdata.
  * On fall-through the inp read lock is held again.
  *
  * GET_PKTOPT_EXT_HDR sizes the copy from the extension header's own length
  * byte ((ip6e_len + 1) * 8); GET_PKTOPT_SOCKADDR sizes it from sa_len.
  */
 #define GET_PKTOPT_VAR(field, lenexpr) do {				\
 	if (pktopt && pktopt->field) {					\
 		INP_RUNLOCK(inp);					\
 		optdata = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK);	\
 		malloc_optdata = true;					\
 		INP_RLOCK(inp);						\
 		if (inp->inp_flags & INP_DROPPED) {			\
 			INP_RUNLOCK(inp);				\
 			free(optdata, M_TEMP);				\
 			return (ECONNRESET);				\
 		}							\
 		pktopt = inp->in6p_outputopts;				\
 		if (pktopt && pktopt->field) {				\
 			optdatalen = min(lenexpr, sopt->sopt_valsize);	\
 			bcopy(pktopt->field, optdata, optdatalen);	\
 		} else {						\
 			free(optdata, M_TEMP);				\
 			optdata = NULL;					\
 			malloc_optdata = false;				\
 		}							\
 	}								\
 } while(0)
 
 #define GET_PKTOPT_EXT_HDR(field) GET_PKTOPT_VAR(field,			\
 	(((struct ip6_ext *)pktopt->field)->ip6e_len + 1) << 3)
 
 #define GET_PKTOPT_SOCKADDR(field) GET_PKTOPT_VAR(field,		\
 	pktopt->field->sa_len)
 
 /*
  * Copy the current value of sticky option "optname" out to the caller.
  *
  * The value is read from inp->in6p_outputopts under the inp read lock.
  * Fixed-size options are staged in locals, falling back to a default when
  * no option set exists or the option is unset.  Variable-length options
  * (extension headers, next hop) are staged in a temporary buffer by the
  * GET_PKTOPT_* macros; when such an option is unset, optdata stays NULL and
  * optdatalen stays 0.
  *
  * Returns the result of sooptcopyout(), ENOPROTOOPT for an option this
  * function does not handle (a panic under DIAGNOSTIC), or ECONNRESET, which
  * the GET_PKTOPT_* macros return directly when the pcb has been dropped.
  */
 static int
 ip6_getpcbopt(struct inpcb *inp, int optname, struct sockopt *sopt)
 {
 	void *optdata = NULL;
 	bool malloc_optdata = false;
 	int optdatalen = 0;
 	int error = 0;
 	struct in6_pktinfo null_pktinfo;
 	int deftclass = 0, on;
 	int defminmtu = IP6PO_MINMTU_MCASTONLY;
 	int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
 	struct ip6_pktopts *pktopt;
 
 	INP_RLOCK(inp);
 	pktopt = inp->in6p_outputopts;
 
 	switch (optname) {
 	case IPV6_PKTINFO:
 		/* Report a copy with the embedded scope cleared, or zeros. */
 		optdata = (void *)&null_pktinfo;
 		if (pktopt && pktopt->ip6po_pktinfo) {
 			bcopy(pktopt->ip6po_pktinfo, &null_pktinfo,
 			    sizeof(null_pktinfo));
 			in6_clearscope(&null_pktinfo.ipi6_addr);
 		} else {
 			/* XXX: we don't have to do this every time... */
 			bzero(&null_pktinfo, sizeof(null_pktinfo));
 		}
 		optdatalen = sizeof(struct in6_pktinfo);
 		break;
 	case IPV6_TCLASS:
 		/* A negative stored value means "unset"; report 0 then. */
 		if (pktopt && pktopt->ip6po_tclass >= 0)
 			deftclass = pktopt->ip6po_tclass;
 		optdata = (void *)&deftclass;
 		optdatalen = sizeof(int);
 		break;
 	/*
 	 * The GET_PKTOPT_* cases below may temporarily drop the inp lock and
 	 * may return ECONNRESET straight out of this function.
 	 */
 	case IPV6_HOPOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_hbh);
 		break;
 	case IPV6_RTHDR:
 		GET_PKTOPT_EXT_HDR(ip6po_rthdr);
 		break;
 	case IPV6_RTHDRDSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest1);
 		break;
 	case IPV6_DSTOPTS:
 		GET_PKTOPT_EXT_HDR(ip6po_dest2);
 		break;
 	case IPV6_NEXTHOP:
 		GET_PKTOPT_SOCKADDR(ip6po_nexthop);
 		break;
 	case IPV6_USE_MIN_MTU:
 		if (pktopt)
 			defminmtu = pktopt->ip6po_minmtu;
 		optdata = (void *)&defminmtu;
 		optdatalen = sizeof(int);
 		break;
 	case IPV6_DONTFRAG:
 		if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
 			on = 1;
 		else
 			on = 0;
 		optdata = (void *)&on;
 		optdatalen = sizeof(on);
 		break;
 	case IPV6_PREFER_TEMPADDR:
 		if (pktopt)
 			defpreftemp = pktopt->ip6po_prefer_tempaddr;
 		optdata = (void *)&defpreftemp;
 		optdatalen = sizeof(int);
 		break;
 	default:		/* should not happen */
 #ifdef DIAGNOSTIC
 		panic("ip6_getpcbopt: unexpected option\n");
 #endif
 		INP_RUNLOCK(inp);
 		return (ENOPROTOOPT);
 	}
 	INP_RUNLOCK(inp);
 
 	/* The copy-out happens after the inp lock has been released. */
 	error = sooptcopyout(sopt, optdata, optdatalen);
 	if (malloc_optdata)
 		free(optdata, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Reset one option in "pktopt" -- or every option when optname is -1 -- to
  * its unset state, releasing the storage and cached next-hop references
  * that belong to it.  A NULL pktopt is accepted and ignored.
  */
 void
 ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 {
 	int clear_all;
 
 	if (pktopt == NULL)
 		return;
 	clear_all = (optname == -1);
 
 	/* Scalar options: just restore the "default" marker. */
 	if (clear_all || optname == IPV6_HOPLIMIT)
 		pktopt->ip6po_hlim = -1;
 	if (clear_all || optname == IPV6_TCLASS)
 		pktopt->ip6po_tclass = -1;
 
 	if (clear_all || optname == IPV6_PKTINFO) {
 		if (pktopt->ip6po_pktinfo != NULL)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
 
 	/* Next hop: drop the cached nexthop reference and the sockaddr. */
 	if (clear_all || optname == IPV6_NEXTHOP) {
 		if (pktopt->ip6po_nextroute.ro_nh != NULL) {
 			NH_FREE(pktopt->ip6po_nextroute.ro_nh);
 			pktopt->ip6po_nextroute.ro_nh = NULL;
 		}
 		if (pktopt->ip6po_nexthop != NULL)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
 	}
 
 	/* Extension headers. */
 	if (clear_all || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh != NULL)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
 	}
 	if (clear_all || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1 != NULL)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
 	}
 	if (clear_all || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr != NULL)
 			free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
 		pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
 		/* The routing header carries its own cached nexthop. */
 		if (pktopt->ip6po_route.ro_nh != NULL) {
 			NH_FREE(pktopt->ip6po_route.ro_nh);
 			pktopt->ip6po_route.ro_nh = NULL;
 		}
 	}
 	if (clear_all || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2 != NULL)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
 	}
 }
 
 #define PKTOPT_EXTHDRCPY(type) \
 do {\
 	if (src->type) {\
 		int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
 		dst->type = malloc(hlen, M_IP6OPT, canwait);\
 		if (dst->type == NULL)\
 			goto bad;\
 		bcopy(src->type, dst->type, hlen);\
 	}\
 } while (/*CONSTCOND*/ 0)
 
 static int
 copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 {
 	if (dst == NULL || src == NULL)  {
 		printf("ip6_clearpktopts: invalid argument\n");
 		return (EINVAL);
 	}
 
 	dst->ip6po_hlim = src->ip6po_hlim;
 	dst->ip6po_tclass = src->ip6po_tclass;
 	dst->ip6po_flags = src->ip6po_flags;
 	dst->ip6po_minmtu = src->ip6po_minmtu;
 	dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
 	if (src->ip6po_pktinfo) {
 		dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_pktinfo == NULL)
 			goto bad;
 		*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
 	}
 	if (src->ip6po_nexthop) {
 		dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
 		    M_IP6OPT, canwait);
 		if (dst->ip6po_nexthop == NULL)
 			goto bad;
 		bcopy(src->ip6po_nexthop, dst->ip6po_nexthop,
 		    src->ip6po_nexthop->sa_len);
 	}
 	PKTOPT_EXTHDRCPY(ip6po_hbh);
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
 	return (0);
 
   bad:
 	ip6_clearpktopts(dst, -1);
 	return (ENOBUFS);
 }
 #undef PKTOPT_EXTHDRCPY
 
 /*
  * Allocate and return a deep copy of "src", using "canwait" as the malloc
  * wait flag.  Returns NULL if the container cannot be allocated or if
  * copypktopts() fails.
  */
 struct ip6_pktopts *
 ip6_copypktopts(struct ip6_pktopts *src, int canwait)
 {
 	struct ip6_pktopts *copy;
 
 	copy = malloc(sizeof(*copy), M_IP6OPT, canwait);
 	if (copy != NULL) {
 		ip6_initpktopts(copy);
 		if (copypktopts(copy, src, canwait) != 0) {
 			/* Only the container remains to be released. */
 			free(copy, M_IP6OPT);
 			copy = NULL;
 		}
 	}
 
 	return (copy);
 }
 
 /*
  * Release an option set together with everything it owns.  NULL is accepted
  * and ignored.
  */
 void
 ip6_freepcbopts(struct ip6_pktopts *pktopt)
 {
 	if (pktopt != NULL) {
 		ip6_clearpktopts(pktopt, -1);
 		free(pktopt, M_IP6OPT);
 	}
 }
 
 /*
  * Set IPv6 outgoing packet options based on advanced API.
  */
 int
 ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
     struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto)
 {
 	struct cmsghdr *cm = NULL;
 
 	if (control == NULL || opt == NULL)
 		return (EINVAL);
 
 	/*
 	 * ip6_setpktopt can call ifnet_byindex(), so it's imperative that we
 	 * are in the network epoch here.
 	 */
 	NET_EPOCH_ASSERT();
 
 	ip6_initpktopts(opt);
 	if (stickyopt) {
 		int error;
 
 		/*
 		 * If stickyopt is provided, make a local copy of the options
 		 * for this particular packet, then override them by ancillary
 		 * objects.
 		 * XXX: copypktopts() does not copy the cached route to a next
 		 * hop (if any).  This is not very good in terms of efficiency,
 		 * but we can allow this since this option should be rarely
 		 * used.
 		 */
 		if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * XXX: Currently, we assume all the optional information is stored
 	 * in a single mbuf.
 	 */
 	if (control->m_next)
 		return (EINVAL);
 
 	for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
 	    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 		int error;
 
 		if (control->m_len < CMSG_LEN(0))
 			return (EINVAL);
 
 		cm = mtod(control, struct cmsghdr *);
 		if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len)
 			return (EINVAL);
 		if (cm->cmsg_level != IPPROTO_IPV6)
 			continue;
 
 		error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
 		    cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Set a particular packet option, as a sticky option or an ancillary data
  * item.  "len" can be 0 only when it's a sticky option.
  * We have 4 cases of combination of "sticky" and "cmsg":
  * "sticky=0, cmsg=0": impossible
  * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
  * "sticky=1, cmsg=0": RFC3542 socket option
  * "sticky=1, cmsg=1": RFC2292 socket option
  *
  * "buf"/"len" hold the option value, "opt" is the option set to update,
  * "cred" (may be NULL) is checked for PRIV_NETINET_SETHDROPTS where a
  * header option requires privilege, and "uproto" is the upper-layer
  * protocol of the socket.  Must be called inside the network epoch.
  *
  * Returns 0 on success; EINVAL for a malformed value or the impossible
  * sticky/cmsg combination; ENOPROTOOPT for an option that is unknown or
  * not valid in the given context; ENOBUFS when a copy of the option cannot
  * be allocated; ENXIO, ENETDOWN or EADDRNOTAVAIL for IPV6_PKTINFO interface
  * and address problems; EAFNOSUPPORT for a next hop of an unsupported
  * family; or the error from priv_check_cred()/sa6_embedscope().
  */
 static int
 ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
     struct ucred *cred, int sticky, int cmsg, int uproto)
 {
 	int minmtupolicy, preftemp;
 	int error;
 
 	NET_EPOCH_ASSERT();
 
 	if (!sticky && !cmsg) {
 #ifdef DIAGNOSTIC
 		printf("ip6_setpktopt: impossible case\n");
 #endif
 		return (EINVAL);
 	}
 
 	/*
 	 * IPV6_2292xxx is for backward compatibility to RFC2292, and should
 	 * not be specified in the context of RFC3542.  Conversely,
 	 * RFC3542 types should not be specified in the context of RFC2292.
 	 */
 	if (!cmsg) {
 		switch (optname) {
 		case IPV6_2292PKTINFO:
 		case IPV6_2292HOPLIMIT:
 		case IPV6_2292NEXTHOP:
 		case IPV6_2292HOPOPTS:
 		case IPV6_2292DSTOPTS:
 		case IPV6_2292RTHDR:
 		case IPV6_2292PKTOPTIONS:
 			return (ENOPROTOOPT);
 		}
 	}
 	if (sticky && cmsg) {
 		switch (optname) {
 		case IPV6_PKTINFO:
 		case IPV6_HOPLIMIT:
 		case IPV6_NEXTHOP:
 		case IPV6_HOPOPTS:
 		case IPV6_DSTOPTS:
 		case IPV6_RTHDRDSTOPTS:
 		case IPV6_RTHDR:
 		case IPV6_USE_MIN_MTU:
 		case IPV6_DONTFRAG:
 		case IPV6_TCLASS:
 		case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */
 			return (ENOPROTOOPT);
 		}
 	}
 
 	switch (optname) {
 	case IPV6_2292PKTINFO:
 	case IPV6_PKTINFO:
 	{
 		struct ifnet *ifp = NULL;
 		struct in6_pktinfo *pktinfo;
 
 		if (len != sizeof(struct in6_pktinfo))
 			return (EINVAL);
 
 		pktinfo = (struct in6_pktinfo *)buf;
 
 		/*
 		 * An application can clear any sticky IPV6_PKTINFO option by
 		 * doing a "regular" setsockopt with ipi6_addr being
 		 * in6addr_any and ipi6_ifindex being zero.
 		 * [RFC 3542, Section 6]
 		 */
 		if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
 		    pktinfo->ipi6_ifindex == 0 &&
 		    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			ip6_clearpktopts(opt, optname);
 			break;
 		}
 
 		if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
 		    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr))
 			return (EINVAL);
 		/* validate the interface index if specified. */
 		if (pktinfo->ipi6_ifindex) {
 			ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 			if (ifp == NULL)
 				return (ENXIO);
 		}
 		if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL ||
 		    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0))
 			return (ENETDOWN);
 
 		if (ifp != NULL &&
 		    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 			struct in6_ifaddr *ia;
 
 			in6_setscope(&pktinfo->ipi6_addr, ifp, NULL);
 			ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr);
 			if (ia == NULL)
 				return (EADDRNOTAVAIL);
 			ifa_free(&ia->ia_ifa);
 		}
 		/*
 		 * We store the address anyway, and let in6_selectsrc()
 		 * validate the specified address.  This is because ipi6_addr
 		 * may not have enough information about its scope zone, and
 		 * we may need additional information (such as outgoing
 		 * interface or the scope zone of a destination address) to
 		 * disambiguate the scope.
 		 * XXX: the delay of the validation may confuse the
 		 * application when it is used as a sticky option.
 		 */
 		if (opt->ip6po_pktinfo == NULL) {
 			opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
 			    M_IP6OPT, M_NOWAIT);
 			if (opt->ip6po_pktinfo == NULL)
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
 		break;
 	}
 
 	case IPV6_2292HOPLIMIT:
 	case IPV6_HOPLIMIT:
 	{
 		int *hlimp;
 
 		/*
 		 * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
 		 * to simplify the ordering among hoplimit options.
 		 */
 		if (optname == IPV6_HOPLIMIT && sticky)
 			return (ENOPROTOOPT);
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		hlimp = (int *)buf;
 		if (*hlimp < -1 || *hlimp > 255)
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
 		break;
 	}
 
 	case IPV6_TCLASS:
 	{
 		int tclass;
 
 		if (len != sizeof(int))
 			return (EINVAL);
 		tclass = *(int *)buf;
 		if (tclass < -1 || tclass > 255)
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
 		break;
 	}
 
 	case IPV6_2292NEXTHOP:
 	case IPV6_NEXTHOP:
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {	/* just remove the option */
 			ip6_clearpktopts(opt, IPV6_NEXTHOP);
 			break;
 		}
 
 		/* check if cmsg_len is large enough for sa_len */
 		if (len < sizeof(struct sockaddr) || len < *buf)
 			return (EINVAL);
 
 		switch (((struct sockaddr *)buf)->sa_family) {
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
 
 			if (sa6->sin6_len != sizeof(struct sockaddr_in6))
 				return (EINVAL);
 
 			if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
 			    IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
 				return (EINVAL);
 			}
 			/* Reuses the function-level "error"; no shadow copy. */
 			if ((error = sa6_embedscope(sa6, V_ip6_use_defzone))
 			    != 0) {
 				return (error);
 			}
 			break;
 		}
 		case AF_LINK:	/* should eventually be supported */
 		default:
 			return (EAFNOSUPPORT);
 		}
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_NEXTHOP);
 		opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
 		break;
 
 	case IPV6_2292HOPOPTS:
 	case IPV6_HOPOPTS:
 	{
 		struct ip6_hbh *hbh;
 		int hbhlen;
 
 		/*
 		 * XXX: We don't allow a non-privileged user to set ANY HbH
 		 * options, since per-option restriction has too much
 		 * overhead.
 		 */
 		if (cred != NULL) {
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_HOPOPTS);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_hbh))
 			return (EINVAL);
 		hbh = (struct ip6_hbh *)buf;
 		hbhlen = (hbh->ip6h_len + 1) << 3;
 		if (len != hbhlen)
 			return (EINVAL);
 
 		/* turn off the previous option, then set the new option. */
 		ip6_clearpktopts(opt, IPV6_HOPOPTS);
 		opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
 
 		break;
 	}
 
 	case IPV6_2292DSTOPTS:
 	case IPV6_DSTOPTS:
 	case IPV6_RTHDRDSTOPTS:
 	{
 		struct ip6_dest *dest, **newdest = NULL;
 		int destlen;
 
 		if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */
 			error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS);
 			if (error)
 				return (error);
 		}
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, optname);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_dest))
 			return (EINVAL);
 		dest = (struct ip6_dest *)buf;
 		destlen = (dest->ip6d_len + 1) << 3;
 		if (len != destlen)
 			return (EINVAL);
 
 		/*
 		 * Determine the position that the destination options header
 		 * should be inserted; before or after the routing header.
 		 */
 		switch (optname) {
 		case IPV6_2292DSTOPTS:
 			/*
 			 * The old advanced API is ambiguous on this point.
 			 * Our approach is to determine the position based
 			 * according to the existence of a routing header.
 			 * Note, however, that this depends on the order of the
 			 * extension headers in the ancillary data; the 1st
 			 * part of the destination options header must appear
 			 * before the routing header in the ancillary data,
 			 * too.
 			 * RFC3542 solved the ambiguity by introducing
 			 * separate ancillary data or option types.
 			 */
 			if (opt->ip6po_rthdr == NULL)
 				newdest = &opt->ip6po_dest1;
 			else
 				newdest = &opt->ip6po_dest2;
 			break;
 		case IPV6_RTHDRDSTOPTS:
 			newdest = &opt->ip6po_dest1;
 			break;
 		case IPV6_DSTOPTS:
 			newdest = &opt->ip6po_dest2;
 			break;
 		}
 
 		/*
 		 * Turn off the previous option, then set the new option.
 		 * The chosen slot is released directly instead of going
 		 * through ip6_clearpktopts(opt, optname): that routine does
 		 * not recognize IPV6_2292DSTOPTS, so a repeated RFC2292 item
 		 * would otherwise overwrite, and leak, the earlier copy.
 		 * For IPV6_DSTOPTS and IPV6_RTHDRDSTOPTS this frees exactly
 		 * the slot ip6_clearpktopts() would have freed.
 		 */
 		if (*newdest != NULL)
 			free(*newdest, M_IP6OPT);
 		*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
 
 		break;
 	}
 
 	case IPV6_2292RTHDR:
 	case IPV6_RTHDR:
 	{
 		struct ip6_rthdr *rth;
 		int rthlen;
 
 		if (len == 0) {
 			ip6_clearpktopts(opt, IPV6_RTHDR);
 			break;	/* just remove the option */
 		}
 
 		/* message length validation */
 		if (len < sizeof(struct ip6_rthdr))
 			return (EINVAL);
 		rth = (struct ip6_rthdr *)buf;
 		rthlen = (rth->ip6r_len + 1) << 3;
 		if (len != rthlen)
 			return (EINVAL);
 
 		switch (rth->ip6r_type) {
 		case IPV6_RTHDR_TYPE_0:
 			if (rth->ip6r_len == 0)	/* must contain one addr */
 				return (EINVAL);
 			if (rth->ip6r_len % 2) /* length must be even */
 				return (EINVAL);
 			if (rth->ip6r_len / 2 != rth->ip6r_segleft)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);	/* not supported */
 		}
 
 		/* turn off the previous option */
 		ip6_clearpktopts(opt, IPV6_RTHDR);
 		opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
 
 		break;
 	}
 
 	case IPV6_USE_MIN_MTU:
 		if (len != sizeof(int))
 			return (EINVAL);
 		minmtupolicy = *(int *)buf;
 		if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
 		    minmtupolicy != IP6PO_MINMTU_DISABLE &&
 		    minmtupolicy != IP6PO_MINMTU_ALL) {
 			return (EINVAL);
 		}
 		opt->ip6po_minmtu = minmtupolicy;
 		break;
 
 	case IPV6_DONTFRAG:
 		if (len != sizeof(int))
 			return (EINVAL);
 
 		if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
 			/*
 			 * we ignore this option for TCP sockets.
 			 * (RFC3542 leaves this case unspecified.)
 			 */
 			opt->ip6po_flags &= ~IP6PO_DONTFRAG;
 		} else
 			opt->ip6po_flags |= IP6PO_DONTFRAG;
 		break;
 
 	case IPV6_PREFER_TEMPADDR:
 		if (len != sizeof(int))
 			return (EINVAL);
 		preftemp = *(int *)buf;
 		if (preftemp != IP6PO_TEMPADDR_SYSTEM &&
 		    preftemp != IP6PO_TEMPADDR_NOTPREFER &&
 		    preftemp != IP6PO_TEMPADDR_PREFER) {
 			return (EINVAL);
 		}
 		opt->ip6po_prefer_tempaddr = preftemp;
 		break;
 
 	default:
 		return (ENOPROTOOPT);
 	} /* end of switch */
 
 	return (0);
 }
 
 /*
  * Loop a copy of an outgoing IPv6 multicast packet back to the input queue
  * of the given interface.  Called from ip6_output().  The copy is handed to
  * if_simloop() with the caller's interface pointer, which need not be the
  * loopback interface itself.  If no copy can be obtained the packet is
  * simply not looped back.
  */
 void
 ip6_mloopback(struct ifnet *ifp, struct mbuf *m)
 {
 	struct ip6_hdr *hdr;
 	struct mbuf *dup;
 
 	dup = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 	if (dup == NULL)
 		return;
 
 	/*
 	 * The IPv6 header is about to be modified, so it must sit in
 	 * writable storage at the front of the copy (the data may live in a
 	 * shared mbuf cluster).  Pull it up if that is not already the case.
 	 */
 	if (!(M_WRITABLE(dup) && dup->m_len >= sizeof(struct ip6_hdr))) {
 		dup = m_pullup(dup, sizeof(struct ip6_hdr));
 		if (dup == NULL)
 			return;
 	}
 
 	/*
 	 * Strip embedded scope identifiers from both addresses;
 	 * in6_clearscope will touch the addresses only when necessary.
 	 */
 	hdr = mtod(dup, struct ip6_hdr *);
 	in6_clearscope(&hdr->ip6_src);
 	in6_clearscope(&hdr->ip6_dst);
 
 	/* A checksum that was still pending is marked as already valid. */
 	if ((dup->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) != 0) {
 		dup->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 |
 		    CSUM_PSEUDO_HDR;
 		dup->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if_simloop(ifp, dup, AF_INET6, 0);
 }
 
 /*
  * Separate the IPv6 header from the payload.
  *
  * If the first mbuf holds more than the bare IPv6 header, a new packet
  * header mbuf is allocated, the header is copied into it and the original
  * mbuf, trimmed to start at the payload, is chained behind it.  The mbuf
  * that now leads the chain is recorded in exthdrs->ip6e_ip6.
  *
  * Returns 0 on success.  On allocation failure the chain "m" is freed and
  * ENOBUFS is returned.
  */
 static int
 ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
 {
 	struct ip6_hdr *ip6;
 	struct mbuf *mh;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 
 	/* Nothing follows the header in this mbuf: nothing to split. */
 	if (m->m_len <= sizeof(*ip6)) {
 		exthdrs->ip6e_ip6 = m;
 		return 0;
 	}
 
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		m_freem(m);
 		return ENOBUFS;
 	}
 
 	/* The new mbuf takes over the packet header and the IPv6 header. */
 	m_move_pkthdr(mh, m);
 	M_ALIGN(mh, sizeof(*ip6));
 	mh->m_len = sizeof(*ip6);
 	bcopy((caddr_t)ip6, mtod(mh, caddr_t), sizeof(*ip6));
 
 	/* The old mbuf keeps only what followed the header. */
 	m->m_data += sizeof(*ip6);
 	m->m_len -= sizeof(*ip6);
 	mh->m_next = m;
 
 	exthdrs->ip6e_ip6 = mh;
 	return 0;
 }
 
 /*
  * Compute IPv6 extension header length.
  */
 int
 ip6_optlen(struct inpcb *inp)
 {
 	int len;
 
 	if (!inp->in6p_outputopts)
 		return 0;
 
 	len = 0;
 #define elen(x) \
     (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
 
 	len += elen(inp->in6p_outputopts->ip6po_hbh);
 	if (inp->in6p_outputopts->ip6po_rthdr)
 		/* dest1 is valid with rthdr only */
 		len += elen(inp->in6p_outputopts->ip6po_dest1);
 	len += elen(inp->in6p_outputopts->ip6po_rthdr);
 	len += elen(inp->in6p_outputopts->ip6po_dest2);
 	return len;
 #undef elen
 }
diff --git a/sys/netinet6/mld6.c b/sys/netinet6/mld6.c
index f20348f653ef..63f4e33ac64d 100644
--- a/sys/netinet6/mld6.c
+++ b/sys/netinet6/mld6.c
@@ -1,3335 +1,3336 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009 Bruce Simpson.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote
  *    products derived from this software without specific prior written
  *    permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: mld6.c,v 1.27 2001/04/04 05:17:30 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1988 Stephen Deering.
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Stephen Deering of Stanford University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/kernel.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/ktr.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet/icmp6.h>
 #include <netinet6/mld6.h>
 #include <netinet6/mld6_var.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifndef KTR_MLD
 #define KTR_MLD KTR_INET6
 #endif
 
 static void	mli_delete_locked(struct ifnet *);
 static void	mld_dispatch_packet(struct mbuf *);
 static void	mld_dispatch_queue(struct mbufq *, int);
 static void	mld_final_leave(struct in6_multi *, struct mld_ifsoftc *);
 static void	mld_fasttimo_vnet(struct in6_multi_head *inmh);
 static int	mld_handle_state_change(struct in6_multi *,
 		    struct mld_ifsoftc *);
 static int	mld_initial_join(struct in6_multi *, struct mld_ifsoftc *,
 		    const int);
 #ifdef KTR
 static char *	mld_rec_type_to_str(const int);
 #endif
 static void	mld_set_version(struct mld_ifsoftc *, const int);
 static void	mld_slowtimo_vnet(void);
 static int	mld_v1_input_query(struct ifnet *, const struct ip6_hdr *,
 		    /*const*/ struct mld_hdr *);
 static int	mld_v1_input_report(struct ifnet *, const struct ip6_hdr *,
 		    /*const*/ struct mld_hdr *);
 static void	mld_v1_process_group_timer(struct in6_multi_head *,
 		    struct in6_multi *);
 static void	mld_v1_process_querier_timers(struct mld_ifsoftc *);
 static int	mld_v1_transmit_report(struct in6_multi *, const int);
 static void	mld_v1_update_group(struct in6_multi *, const int);
 static void	mld_v2_cancel_link_timers(struct mld_ifsoftc *);
 static void	mld_v2_dispatch_general_query(struct mld_ifsoftc *);
 static struct mbuf *
 		mld_v2_encap_report(struct ifnet *, struct mbuf *);
 static int	mld_v2_enqueue_filter_change(struct mbufq *,
 		    struct in6_multi *);
 static int	mld_v2_enqueue_group_record(struct mbufq *,
 		    struct in6_multi *, const int, const int, const int,
 		    const int);
 static int	mld_v2_input_query(struct ifnet *, const struct ip6_hdr *,
 		    struct mbuf *, struct mldv2_query *, const int, const int);
 static int	mld_v2_merge_state_changes(struct in6_multi *,
 		    struct mbufq *);
 static void	mld_v2_process_group_timers(struct in6_multi_head *,
 		    struct mbufq *, struct mbufq *,
 		    struct in6_multi *, const int);
 static int	mld_v2_process_group_query(struct in6_multi *,
 		    struct mld_ifsoftc *mli, int, struct mbuf *,
 		    struct mldv2_query *, const int);
 static int	sysctl_mld_gsr(SYSCTL_HANDLER_ARGS);
 static int	sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS);
 
 /*
  * Normative references: RFC 2710, RFC 3590, RFC 3810.
  *
  * Locking:
  *  * The MLD subsystem lock ends up being system-wide for the moment,
  *    but could be per-VIMAGE later on.
  *  * The permitted lock order is: IN6_MULTI_LOCK, MLD_LOCK, IF_ADDR_LOCK.
  *    Any may be taken independently; if any are held at the same
  *    time, the above lock order must be followed.
  *  * IN6_MULTI_LOCK covers in6_multi.
  *  * MLD_LOCK covers per-link state and any global variables in this file.
  *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
  *    per-link state iterators.
  *
  *  XXX LOR PREVENTION
  *  A special case for IPv6 is the in6_setscope() routine. ip6_output()
  *  will not accept an ifp; it wants an embedded scope ID, unlike
  *  ip_output(), which happily takes the ifp given to it. The embedded
  *  scope ID is only used by MLD to select the outgoing interface.
  *
  *  During interface attach and detach, MLD will take MLD_LOCK *after*
  *  the IF_AFDATA_LOCK.
  *  As in6_setscope() takes IF_AFDATA_LOCK then SCOPE_LOCK, we can't call
  *  it with MLD_LOCK held without triggering an LOR. A netisr with indirect
  *  dispatch could work around this, but we'd rather not do that, as it
  *  can introduce other races.
  *
  *  As such, we exploit the fact that the scope ID is just the interface
  *  index, and embed it in the IPv6 destination address accordingly.
  *  This is potentially NOT VALID for MLDv1 reports, as they
  *  are always sent to the multicast group itself; as MLDv2
  *  reports are always sent to ff02::16, this is not an issue
  *  when MLDv2 is in use.
  *
  *  This does not however eliminate the LOR when ip6_output() itself
  *  calls in6_setscope() internally whilst MLD_LOCK is held. This will
  *  trigger a LOR warning in WITNESS when the ifnet is detached.
  *
  *  The right answer is probably to make IF_AFDATA_LOCK an rwlock, given
  *  how it's used across the network stack. Here we're simply exploiting
  *  the fact that MLD runs at a similar layer in the stack to scope6.c.
  *
  * VIMAGE:
  *  * Each in6_multi corresponds to an ifp, and each ifp corresponds
  *    to a vnet in ifp->if_vnet.
  */
 static struct mtx		 mld_mtx;
 static MALLOC_DEFINE(M_MLD, "mld", "mld state");
 
 /*
  * Embed the low 16 bits of zoneid into the second 16-bit word of *pin6 when
  * the address is link-local scoped or an interface-local multicast address;
  * other addresses are left untouched.
  *
  * Wrapped in do { } while (0) so that the macro behaves as a single statement
  * and cannot capture a following "else".
  */
 #define	MLD_EMBEDSCOPE(pin6, zoneid) do {				\
 	if (IN6_IS_SCOPE_LINKLOCAL(pin6) ||				\
 	    IN6_IS_ADDR_MC_INTFACELOCAL(pin6))				\
 		(pin6)->s6_addr16[1] = htons((zoneid) & 0xFFFF);	\
 } while (0)
 
 /*
  * VIMAGE-wide globals.
  */
 VNET_DEFINE_STATIC(struct timeval, mld_gsrdelay) = {10, 0};
 VNET_DEFINE_STATIC(LIST_HEAD(, mld_ifsoftc), mli_head);
 VNET_DEFINE_STATIC(int, interface_timers_running6);
 VNET_DEFINE_STATIC(int, state_change_timers_running6);
 VNET_DEFINE_STATIC(int, current_state_timers_running6);
 
 #define	V_mld_gsrdelay			VNET(mld_gsrdelay)
 #define	V_mli_head			VNET(mli_head)
 #define	V_interface_timers_running6	VNET(interface_timers_running6)
 #define	V_state_change_timers_running6	VNET(state_change_timers_running6)
 #define	V_current_state_timers_running6	VNET(current_state_timers_running6)
 
 SYSCTL_DECL(_net_inet6);	/* Note: Not in any common header. */
 
 SYSCTL_NODE(_net_inet6, OID_AUTO, mld, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPv6 Multicast Listener Discovery");
 
 /*
  * Virtualized sysctls.
  */
 SYSCTL_PROC(_net_inet6_mld, OID_AUTO, gsrdelay,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(mld_gsrdelay.tv_sec), 0, sysctl_mld_gsr, "I",
     "Rate limit for MLDv2 Group-and-Source queries in seconds");
 
 /*
  * Non-virtualized sysctls.
  */
 static SYSCTL_NODE(_net_inet6_mld, OID_AUTO, ifinfo,
     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mld_ifinfo,
     "Per-interface MLDv2 state");
 
 static int	mld_v1enable = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, v1enable, CTLFLAG_RWTUN,
     &mld_v1enable, 0, "Enable fallback to MLDv1");
 
 static int	mld_v2enable = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, v2enable, CTLFLAG_RWTUN,
     &mld_v2enable, 0, "Enable MLDv2");
 
 static int	mld_use_allow = 1;
 SYSCTL_INT(_net_inet6_mld, OID_AUTO, use_allow, CTLFLAG_RWTUN,
     &mld_use_allow, 0, "Use ALLOW/BLOCK for RFC 4604 SSM joins/leaves");
 
 /*
  * Packed Router Alert option structure declaration.
  *
  * Layout: the hop-by-hop header, one padding option, then the Router Alert
  * option itself.  Declared __packed so no compiler padding is inserted
  * between the members.
  */
 struct mld_raopt {
 	struct ip6_hbh		hbh;
 	struct ip6_opt		pad;
 	struct ip6_opt_router	ra;
 } __packed;
 
 /*
  * Router Alert hop-by-hop option header.
  *
  * The alert value IP6OPT_RTALERT_MLD is stored most-significant byte first.
  * NOTE(review): the zeroed hbh fields are presumably filled in when the
  * header is inserted on output -- confirm against the transmit path.
  */
 static struct mld_raopt mld_ra = {
 	.hbh = { 0, 0 },
 	.pad = { .ip6o_type = IP6OPT_PADN, 0 },
 	.ra = {
 	    .ip6or_type = IP6OPT_ROUTER_ALERT,
 	    /* NOTE(review): "- 2" presumably excludes the type/len octets. */
 	    .ip6or_len = IP6OPT_RTALERT_LEN - 2,
 	    .ip6or_value[0] = ((IP6OPT_RTALERT_MLD >> 8) & 0xFF),
 	    .ip6or_value[1] = (IP6OPT_RTALERT_MLD & 0xFF)
 	}
 };
 /* Packet options used for MLD output; set up outside this excerpt. */
 static struct ip6_pktopts mld_po;
 
 /*
  * Stash the transmit context for an MLD packet in its packet header:
  * the interface's vnet in PH_loc.ptr (VIMAGE kernels only), the interface
  * itself as rcvif, and the interface index in flowid.
  */
 static __inline void
 mld_save_context(struct mbuf *m, struct ifnet *ifp)
 {
 
 #ifdef VIMAGE
 	m->m_pkthdr.PH_loc.ptr = ifp->if_vnet;
 #endif /* VIMAGE */
 	m->m_pkthdr.rcvif = ifp;
 	/* flowid is borrowed to carry the ifindex; mld_restore_context() reads it. */
 	m->m_pkthdr.flowid = ifp->if_index;
 }
 
 static __inline void
 mld_scrub_context(struct mbuf *m)
 {
 
 	m->m_pkthdr.PH_loc.ptr = NULL;
 	m->m_pkthdr.flowid = 0;
 }
 
 /*
  * Restore context from a queued output chain.
  * Return saved ifindex, i.e. the value held in the packet header's flowid.
  *
  * VIMAGE: The assertion is there to make sure that we
  * actually called CURVNET_SET() with what's in the mbuf chain.
  * It is only compiled in when both VIMAGE and INVARIANTS are defined.
  */
 static __inline uint32_t
 mld_restore_context(struct mbuf *m)
 {
 
 #if defined(VIMAGE) && defined(INVARIANTS)
 	KASSERT(curvnet == m->m_pkthdr.PH_loc.ptr,
 	    ("%s: called when curvnet was not restored: curvnet %p m ptr %p",
 	    __func__, curvnet, m->m_pkthdr.PH_loc.ptr));
 #endif
 	return (m->m_pkthdr.flowid);
 }
 
 /*
  * Retrieve or set threshold between group-source queries in seconds.
  *
  * VIMAGE: Assume curvnet set by caller.
  * SMPng: NOTE: Serialized by MLD lock.
  */
 static int
 sysctl_mld_gsr(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error)
 		return (error);
 
 	MLD_LOCK();
 
 	i = V_mld_gsrdelay.tv_sec;
 
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		goto out_locked;
 
 	if (i < -1 || i >= 60) {
 		error = EINVAL;
 		goto out_locked;
 	}
 
 	CTR2(KTR_MLD, "change mld_gsrdelay from %d to %d",
 	     V_mld_gsrdelay.tv_sec, i);
 	V_mld_gsrdelay.tv_sec = i;
 
 out_locked:
 	MLD_UNLOCK();
 	return (error);
 }
 
 /*
  * Expose struct mld_ifsoftc to userland, keyed by ifindex.
  * For use by ifmcstat(8).
  *
  * VIMAGE: Assume curvnet set by caller. The node handler itself
  * is not directly virtualized.
  */
 static int
 sysctl_mld_ifinfo(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker	 et;
 	int			*name;
 	int			 error;
 	u_int			 namelen;
 	struct ifnet		*ifp;
 	struct mld_ifsoftc	*mli;
 
 	name = (int *)arg1;
 	namelen = arg2;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (namelen != 1)
 		return (EINVAL);
 
 	error = sysctl_wire_old_buffer(req, sizeof(struct mld_ifinfo));
 	if (error)
 		return (error);
 
 	IN6_MULTI_LOCK();
 	IN6_MULTI_LIST_LOCK();
 	MLD_LOCK();
 	NET_EPOCH_ENTER(et);
 
 	error = ENOENT;
 	ifp = ifnet_byindex(name[0]);
 	if (ifp == NULL)
 		goto out_locked;
 
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		if (ifp == mli->mli_ifp) {
 			struct mld_ifinfo info;
 
 			info.mli_version = mli->mli_version;
 			info.mli_v1_timer = mli->mli_v1_timer;
 			info.mli_v2_timer = mli->mli_v2_timer;
 			info.mli_flags = mli->mli_flags;
 			info.mli_rv = mli->mli_rv;
 			info.mli_qi = mli->mli_qi;
 			info.mli_qri = mli->mli_qri;
 			info.mli_uri = mli->mli_uri;
 			error = SYSCTL_OUT(req, &info, sizeof(info));
 			break;
 		}
 	}
 
 out_locked:
 	NET_EPOCH_EXIT(et);
 	MLD_UNLOCK();
 	IN6_MULTI_LIST_UNLOCK();
 	IN6_MULTI_UNLOCK();
 	return (error);
 }
 
 /*
  * Dispatch an entire queue of pending packet chains.
  *
  * mq:    queue to drain; each dequeued chain is handed to
  *        mld_dispatch_packet().
  * limit: maximum number of chains to send in this call.  The counter is
  *        decremented after each packet and the loop stops when it reaches
  *        exactly zero, so a limit of zero (or less) never stops the loop
  *        early and the queue is drained completely.
  *
  * VIMAGE: Assumes the vnet pointer has been set.
  */
 static void
 mld_dispatch_queue(struct mbufq *mq, int limit)
 {
 	struct mbuf *m;
 
 	while ((m = mbufq_dequeue(mq)) != NULL) {
 		/* Trace the packet first, then the queue it came from. */
 		CTR3(KTR_MLD, "%s: dispatch %p from %p", __func__, m, mq);
 		mld_dispatch_packet(m);
 		if (--limit == 0)
 			break;
 	}
 }
 
 /*
  * Filter outgoing MLD report state by group.
  *
  * Reports are ALWAYS suppressed for ALL-HOSTS (ff02::1)
  * and node-local addresses. However, kernel and socket consumers
  * always embed the KAME scope ID in the address provided, so strip it
  * when performing comparison.
  * Note: This is not the same as the *multicast* scope.
  *
  * Return zero if the given group is one for which MLD reports
  * should be suppressed, or non-zero if reports should be issued.
  */
 static __inline int
 mld_is_addr_reported(const struct in6_addr *addr)
 {
 
 	KASSERT(IN6_IS_ADDR_MULTICAST(addr), ("%s: not multicast", __func__));
 
 	if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_NODELOCAL)
 		return (0);
 
 	if (IPV6_ADDR_MC_SCOPE(addr) == IPV6_ADDR_SCOPE_LINKLOCAL) {
 		struct in6_addr tmp = *addr;
 		in6_clearscope(&tmp);
 		if (IN6_ARE_ADDR_EQUAL(&tmp, &in6addr_linklocal_allnodes))
 			return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Create the per-link MLD state when PF_INET6 is attached to an interface.
  *
  * A zeroed softc is allocated (sleeping if necessary), initialised to MLDv2
  * with the default robustness/interval values, and inserted at the head of
  * the softc list under MLD_LOCK.  Interfaces without IFF_MULTICAST are
  * marked MLIF_SILENT; MLIF_USEALLOW follows the mld_use_allow tunable as
  * sampled at attach time.
  *
  * Returns the new softc.  Assumes that the current VNET is set by the caller.
  */
 struct mld_ifsoftc *
 mld_domifattach(struct ifnet *ifp)
 {
 	struct mld_ifsoftc *sc;
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp, if_name(ifp));
 
 	sc = malloc(sizeof(*sc), M_MLD, M_WAITOK | M_ZERO);
 	mbufq_init(&sc->mli_gq, MLD_MAX_RESPONSE_PACKETS);
 
 	sc->mli_ifp = ifp;
 	sc->mli_version = MLD_VERSION_2;
 	sc->mli_rv = MLD_RV_INIT;
 	sc->mli_qi = MLD_QI_INIT;
 	sc->mli_qri = MLD_QRI_INIT;
 	sc->mli_uri = MLD_URI_INIT;
 
 	sc->mli_flags = 0;
 	if (mld_use_allow)
 		sc->mli_flags |= MLIF_USEALLOW;
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		sc->mli_flags |= MLIF_SILENT;
 
 	MLD_LOCK();
 	LIST_INSERT_HEAD(&V_mli_head, sc, mli_link);
 	MLD_UNLOCK();
 
 	return (sc);
 }
 
 /*
  * Hook for ifdetach.
  *
  * NOTE: Some finalization tasks need to run before the protocol domain
  * is detached, but also before the link layer does its cleanup.
  * Run before link-layer cleanup; cleanup groups, but do not free MLD state.
  *
  * ifp:  interface being detached.
  * inmh: list head passed through to in6m_disconnect_locked() and
  *       in6m_rele_locked().
  *       NOTE(review): presumably collects memberships for the caller to
  *       release once the locks are dropped -- confirm against callers.
  *
  * SMPng: Caller must hold IN6_MULTI_LOCK().
  * Must take IF_ADDR_LOCK() to cover if_multiaddrs iterator.
  * XXX This routine is also bitten by unlocked ifma_protospec access.
  */
 void
 mld_ifdetach(struct ifnet *ifp, struct in6_multi_head *inmh)
 {
 	struct epoch_tracker     et;
 	struct mld_ifsoftc	*mli;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)", __func__, ifp,
 	    if_name(ifp));
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK();
 
 	/* mli is dereferenced below without a NULL check. */
 	mli = MLD_IFINFO(ifp);
 	IF_ADDR_WLOCK(ifp);
 	/*
 	 * Extract list of in6_multi associated with the detaching ifp
 	 * which the PF_INET6 layer is about to release.
 	 */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		/* Entries without an associated in6_multi are skipped. */
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		in6m_disconnect_locked(inmh, inm);
 
 		/* Recorded-source and leave state are only cleaned up in MLDv2 mode. */
 		if (mli->mli_version == MLD_VERSION_2) {
 			in6m_clear_recorded(inm);
 
 			/*
 			 * We need to release the final reference held
 			 * for issuing the INCLUDE {}.
 			 */
 			if (inm->in6m_state == MLD_LEAVING_MEMBER) {
 				inm->in6m_state = MLD_NOT_MEMBER;
 				in6m_rele_locked(inmh, inm);
 			}
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	IF_ADDR_WUNLOCK(ifp);
 	MLD_UNLOCK();
 }
 
 /*
  * Hook for domifdetach.
  * Runs after link-layer cleanup; free MLD state.
  *
  * Thin wrapper: takes MLD_LOCK around mli_delete_locked(), which removes and
  * frees the softc belonging to ifp if one is on the list.
  *
  * SMPng: Normally called with IF_AFDATA_LOCK held.
  */
 void
 mld_domifdetach(struct ifnet *ifp)
 {
 
 	CTR3(KTR_MLD, "%s: called for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK();
 	mli_delete_locked(ifp);
 	MLD_UNLOCK();
 }
 
 /*
  * Remove and free the MLD softc that belongs to ifp, if any.
  *
  * The first list entry whose mli_ifp matches is handled: its queue of
  * deferred General Query responses is drained, the entry is unlinked from
  * the softc list and its memory is released.  Nothing happens when no entry
  * matches.  The caller must hold MLD_LOCK.
  */
 static void
 mli_delete_locked(struct ifnet *ifp)
 {
 	struct mld_ifsoftc *cur, *next;
 
 	CTR3(KTR_MLD, "%s: freeing mld_ifsoftc for ifp %p(%s)",
 	    __func__, ifp, if_name(ifp));
 
 	MLD_LOCK_ASSERT();
 
 	LIST_FOREACH_SAFE(cur, &V_mli_head, mli_link, next) {
 		if (cur->mli_ifp != ifp)
 			continue;
 
 		/* Free deferred General Query responses before unlinking. */
 		mbufq_drain(&cur->mli_gq);
 		LIST_REMOVE(cur, mli_link);
 		free(cur, M_MLD);
 		break;
 	}
 }
 
 /*
  * Process a received MLDv1 general or address-specific query.
  * Assumes that the query header has been pulled up to sizeof(mld_hdr).
  *
  * ifp: interface the query arrived on.
  * ip6: IPv6 header of the query; source and destination are validated.
  * mld: the MLD header; mld_addr is modified temporarily (see NOTE).
  *
  * Returns 0 when the query was processed or deliberately ignored (MLDv1
  * disabled, or the source is not link-local scoped), and EINVAL when a
  * general query was not addressed to the link-local all-nodes group.
  *
  * NOTE: Can't be fully const correct as we temporarily embed scope ID in
  * mld_addr. This is OK as we own the mbuf chain.
  */
 static int
 mld_v1_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
     /*const*/ struct mld_hdr *mld)
 {
 	struct ifmultiaddr	*ifma;
 	struct mld_ifsoftc	*mli;
 	struct in6_multi	*inm;
 	int			 is_general_query;
 	uint16_t		 timer;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	NET_EPOCH_ASSERT();
 
 	is_general_query = 0;
 
 	if (!mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 query %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * RFC3810 Section 6.2: MLD queries must originate from
 	 * a router's link-local address.
 	 */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * Do address field validation upfront before we accept
 	 * the query.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
 		/*
 		 * MLDv1 General Query.
 		 * If this was not sent to the all-nodes group, ignore it.
 		 */
 		struct in6_addr		 dst;
 
 		/* Compare on a scope-cleared copy; the header itself is const. */
 		dst = ip6->ip6_dst;
 		in6_clearscope(&dst);
 		if (!IN6_ARE_ADDR_EQUAL(&dst, &in6addr_linklocal_allnodes))
 			return (EINVAL);
 		is_general_query = 1;
 	} else {
 		/*
 		 * Embed scope ID of receiving interface in MLD query for
 		 * lookup whilst we don't hold other locks.
 		 */
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 	}
 
 	IN6_MULTI_LIST_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * Switch to MLDv1 host compatibility mode.
 	 */
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
 	mld_set_version(mli, MLD_VERSION_1);
 
 	/* Scale the advertised maximum delay to timer ticks; never use zero. */
 	timer = (ntohs(mld->mld_maxdelay) * MLD_FASTHZ) / MLD_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	if (is_general_query) {
 		/*
 		 * For each reporting group joined on this
 		 * interface, kick the report timer.
 		 */
 		CTR2(KTR_MLD, "process v1 general query on ifp %p(%s)",
 			 ifp, if_name(ifp));
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			inm = in6m_ifmultiaddr_get_inm(ifma);
 			if (inm == NULL)
 				continue;
 			mld_v1_update_group(inm, timer);
 		}
 	} else {
 		/*
 		 * MLDv1 Group-Specific Query.
 		 * If this is a group-specific MLDv1 query, we need only
 		 * look up the single group to process it.
 		 */
 		inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 		if (inm != NULL) {
 			CTR3(KTR_MLD, "process v1 query %s on ifp %p(%s)",
 			    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 			    ifp, if_name(ifp));
 			mld_v1_update_group(inm, timer);
 		}
 		/* XXX Clear embedded scope ID as userland won't expect it. */
 		in6_clearscope(&mld->mld_addr);
 	}
 
 	MLD_UNLOCK();
 	IN6_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Update the report timer on a group in response to an MLDv1 query.
  *
  * Outcome by current membership state:
  *  - NOT_MEMBER, SILENT_MEMBER, LEAVING_MEMBER: left alone.
  *  - SLEEPING_MEMBER: becomes AWAKENING_MEMBER; no timer is started.
  *  - REPORTING_MEMBER with a running timer that is not above the new
  *    threshold: left alone.
  *  - Every other listed state (including REPORTING_MEMBER otherwise):
  *    becomes REPORTING_MEMBER with a freshly randomised timer.
  *
  * We may be updating the group for the first time since we switched
  * to MLDv2; the group timer is overloaded for group and group-source
  * query responses.
  *
  * Unlike MLDv2, the delay per group should be jittered
  * to avoid bursts of MLDv1 reports.
  */
 static void
 mld_v1_update_group(struct in6_multi *inm, const int timer)
 {
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: %s/%s timer=%d", __func__,
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    if_name(inm->in6m_ifp), timer);
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/* Filter out every state that does not end in ->REPORTING. */
 	switch (inm->in6m_state) {
 	case MLD_SLEEPING_MEMBER:
 		CTR1(KTR_MLD, "%s: ->AWAKENING", __func__);
 		inm->in6m_state = MLD_AWAKENING_MEMBER;
 		return;
 	case MLD_REPORTING_MEMBER:
 		if (inm->in6m_timer != 0 &&
 		    inm->in6m_timer <= timer) {
 			CTR1(KTR_MLD, "%s: REPORTING and timer running, "
 			    "skipping.", __func__);
 			return;
 		}
 		break;
 	case MLD_SG_QUERY_PENDING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		break;
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_LEAVING_MEMBER:
 	default:
 		return;
 	}
 
 	CTR1(KTR_MLD, "%s: ->REPORTING", __func__);
 	inm->in6m_state = MLD_REPORTING_MEMBER;
 	inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 	V_current_state_timers_running6 = 1;
 }
 
 /*
  * Process a received MLDv2 general, group-specific or
  * group-and-source-specific query.
  *
  * Assumes that mld points to a struct mldv2_query which is stored in
  * contiguous memory.
  *
  * ifp:      interface the query arrived on.
  * ip6:      IPv6 header of the query; only the source address is examined.
  * m:        mbuf chain holding the packet, passed on for source-list parsing.
  * mld:      the query.  For non-general queries mld_addr temporarily carries
  *           an embedded scope ID for the group lookup; it is cleared again
  *           on every exit path taken after it was set.
  * off:      offset of the query within m.
  * icmp6len: length of the ICMPv6 message, checked against the source count.
  *
  * Return 0 if the query was processed or deliberately ignored, EMSGSIZE if
  * the source count exceeds MLD_MAX_GS_SOURCES or does not fit in icmp6len,
  * and EINVAL for a general query that carries a source list.
  */
 static int
 mld_v2_input_query(struct ifnet *ifp, const struct ip6_hdr *ip6,
     struct mbuf *m, struct mldv2_query *mld, const int off, const int icmp6len)
 {
 	struct mld_ifsoftc	*mli;
 	struct in6_multi	*inm;
 	uint32_t		 maxdelay, nsrc, qqi;
 	int			 is_general_query;
 	uint16_t		 timer;
 	uint8_t			 qrv;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	NET_EPOCH_ASSERT();
 
 	if (!mld_v2enable) {
 		CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	/*
 	 * RFC3810 Section 6.2: MLD queries must originate from
 	 * a router's link-local address.
 	 */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&ip6->ip6_src)) {
 		CTR3(KTR_MLD, "ignore v2 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	is_general_query = 0;
 
 	CTR2(KTR_MLD, "input v2 query on ifp %p(%s)", ifp, if_name(ifp));
 
 	/* Values of 32768 and above use the mantissa/exponent encoding. */
 	maxdelay = ntohs(mld->mld_maxdelay);	/* in 1/10ths of a second */
 	if (maxdelay >= 32768) {
 		maxdelay = (MLD_MRC_MANT(maxdelay) | 0x1000) <<
 			   (MLD_MRC_EXP(maxdelay) + 3);
 	}
 	timer = (maxdelay * MLD_FASTHZ) / MLD_TIMER_SCALE;
 	if (timer == 0)
 		timer = 1;
 
 	qrv = MLD_QRV(mld->mld_misc);
 	if (qrv < 2) {
 		CTR3(KTR_MLD, "%s: clamping qrv %d to %d", __func__,
 		    qrv, MLD_RV_INIT);
 		qrv = MLD_RV_INIT;
 	}
 
 	/* Values of 128 and above are decoded from mantissa and exponent. */
 	qqi = mld->mld_qqi;
 	if (qqi >= 128) {
 		qqi = MLD_QQIC_MANT(mld->mld_qqi) <<
 		     (MLD_QQIC_EXP(mld->mld_qqi) + 3);
 	}
 
 	nsrc = ntohs(mld->mld_numsrc);
 	if (nsrc > MLD_MAX_GS_SOURCES)
 		return (EMSGSIZE);
 	if (icmp6len < sizeof(struct mldv2_query) +
 	    (nsrc * sizeof(struct in6_addr)))
 		return (EMSGSIZE);
 
 	/*
 	 * Do further input validation upfront to avoid resetting timers
 	 * should we need to discard this query.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr)) {
 		/*
 		 * A general query with a source list has undefined
 		 * behaviour; discard it.
 		 */
 		if (nsrc > 0)
 			return (EINVAL);
 		is_general_query = 1;
 	} else {
 		/*
 		 * Embed scope ID of receiving interface in MLD query for
 		 * lookup whilst we don't hold other locks (due to KAME
 		 * locking lameness). We own this mbuf chain just now.
 		 * It is cleared again at out_locked.
 		 */
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 	}
 
 	IN6_MULTI_LIST_LOCK();
 	MLD_LOCK();
 
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
 
 	/*
 	 * Discard the v2 query if we're in Compatibility Mode.
 	 * The RFC is pretty clear that hosts need to stay in MLDv1 mode
 	 * until the Old Version Querier Present timer expires.
 	 */
 	if (mli->mli_version != MLD_VERSION_2)
 		goto out_locked;
 
 	mld_set_version(mli, MLD_VERSION_2);
 	mli->mli_rv = qrv;
 	mli->mli_qi = qqi;
 	mli->mli_qri = maxdelay;
 
 	CTR4(KTR_MLD, "%s: qrv %d qi %d maxdelay %d", __func__, qrv, qqi,
 	    maxdelay);
 
 	if (is_general_query) {
 		/*
 		 * MLDv2 General Query.
 		 *
 		 * Schedule a current-state report on this ifp for
 		 * all groups, possibly containing source lists.
 		 *
 		 * If there is a pending General Query response
 		 * scheduled earlier than the selected delay, do
 		 * not schedule any other reports.
 		 * Otherwise, reset the interface timer.
 		 */
 		CTR2(KTR_MLD, "process v2 general query on ifp %p(%s)",
 		    ifp, if_name(ifp));
 		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer) {
 			mli->mli_v2_timer = MLD_RANDOM_DELAY(timer);
 			V_interface_timers_running6 = 1;
 		}
 	} else {
 		/*
 		 * MLDv2 Group-specific or Group-and-source-specific Query.
 		 *
 		 * Group-source-specific queries are throttled on
 		 * a per-group basis to defeat denial-of-service attempts.
 		 * Queries for groups we are not a member of on this
 		 * link are simply ignored.
 		 */
 		inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 		if (inm == NULL)
 			goto out_locked;
 		if (nsrc > 0) {
 			if (!ratecheck(&inm->in6m_lastgsrtv,
 			    &V_mld_gsrdelay)) {
 				CTR1(KTR_MLD, "%s: GS query throttled.",
 				    __func__);
 				goto out_locked;
 			}
 		}
 		CTR2(KTR_MLD, "process v2 group query on ifp %p(%s)",
 		     ifp, if_name(ifp));
 		/*
 		 * If there is a pending General Query response
 		 * scheduled sooner than the selected delay, no
 		 * further report need be scheduled.
 		 * Otherwise, prepare to respond to the
 		 * group-specific or group-and-source query.
 		 */
 		if (mli->mli_v2_timer == 0 || mli->mli_v2_timer >= timer)
 			mld_v2_process_group_query(inm, mli, timer, m, mld, off);
 	}
 
 out_locked:
 	/*
 	 * XXX Clear embedded scope ID as userland won't expect it.
 	 * Done here rather than on the fall-through path only, so that the
 	 * early exits above (compatibility mode, unknown group, throttled
 	 * query) do not leave the scope ID behind in the mbuf.
 	 */
 	if (!is_general_query)
 		in6_clearscope(&mld->mld_addr);
 
 	MLD_UNLOCK();
 	IN6_MULTI_LIST_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Process a received MLDv2 group-specific or group-and-source-specific
  * query.
  * Return <0 if any error occurred. Currently this is ignored.
  */
 static int
 mld_v2_process_group_query(struct in6_multi *inm, struct mld_ifsoftc *mli,
     int timer, struct mbuf *m0, struct mldv2_query *mld, const int off)
 {
 	int			 retval;
 	uint16_t		 nsrc;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	retval = 0;
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		return (retval);
 		break;
 	case MLD_REPORTING_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		break;
 	}
 
 	nsrc = ntohs(mld->mld_numsrc);
 
 	/* Length should be checked by calling function. */
 	KASSERT((m0->m_flags & M_PKTHDR) == 0 ||
 	    m0->m_pkthdr.len >= off + sizeof(struct mldv2_query) +
 	    nsrc * sizeof(struct in6_addr),
 	    ("mldv2 packet is too short: (%d bytes < %zd bytes, m=%p)",
 	    m0->m_pkthdr.len, off + sizeof(struct mldv2_query) +
 	    nsrc * sizeof(struct in6_addr), m0));
 
 	/*
 	 * Deal with group-specific queries upfront.
 	 * If any group query is already pending, purge any recorded
 	 * source-list state if it exists, and schedule a query response
 	 * for this group-specific query.
 	 */
 	if (nsrc == 0) {
 		if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
 		    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER) {
 			in6m_clear_recorded(inm);
 			timer = min(inm->in6m_timer, timer);
 		}
 		inm->in6m_state = MLD_G_QUERY_PENDING_MEMBER;
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		return (retval);
 	}
 
 	/*
 	 * Deal with the case where a group-and-source-specific query has
 	 * been received but a group-specific query is already pending.
 	 */
 	if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER) {
 		timer = min(inm->in6m_timer, timer);
 		inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 		V_current_state_timers_running6 = 1;
 		return (retval);
 	}
 
 	/*
 	 * Finally, deal with the case where a group-and-source-specific
 	 * query has been received, where a response to a previous g-s-r
 	 * query exists, or none exists.
 	 * In this case, we need to parse the source-list which the Querier
 	 * has provided us with and check if we have any source list filter
 	 * entries at T1 for these sources. If we do not, there is no need
 	 * schedule a report and the query may be dropped.
 	 * If we do, we must record them and schedule a current-state
 	 * report for those sources.
 	 */
 	if (inm->in6m_nsrc > 0) {
 		struct in6_addr		 srcaddr;
 		int			 i, nrecorded;
 		int			 soff;
 
 		soff = off + sizeof(struct mldv2_query);
 		nrecorded = 0;
 		for (i = 0; i < nsrc; i++) {
 			m_copydata(m0, soff, sizeof(struct in6_addr),
 			    (caddr_t)&srcaddr);
 			retval = in6m_record_source(inm, &srcaddr);
 			if (retval < 0)
 				break;
 			nrecorded += retval;
 			soff += sizeof(struct in6_addr);
 		}
 		if (nrecorded > 0) {
 			CTR1(KTR_MLD,
 			    "%s: schedule response to SG query", __func__);
 			inm->in6m_state = MLD_SG_QUERY_PENDING_MEMBER;
 			inm->in6m_timer = MLD_RANDOM_DELAY(timer);
 			V_current_state_timers_running6 = 1;
 		}
 	}
 
 	return (retval);
 }
 
 /*
  * Process a received MLDv1 host membership report.
  * Assumes mld points to mld_hdr in pulled up mbuf chain.
  *
  * NOTE: Can't be fully const correct as we temporarily embed scope ID in
  * mld_addr. This is OK as we own the mbuf chain.
  */
 static int
 mld_v1_input_report(struct ifnet *ifp, const struct ip6_hdr *ip6,
     /*const*/ struct mld_hdr *mld)
 {
 	struct in6_addr		 src, dst;
 	struct in6_ifaddr	*ia;
 	struct in6_multi	*inm;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	NET_EPOCH_ASSERT();
 
 	if (!mld_v1enable) {
 		CTR3(KTR_MLD, "ignore v1 report %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 		    ifp, if_name(ifp));
 		return (0);
 	}
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		return (0);
 
 	/*
 	 * MLDv1 reports must originate from a host's link-local address,
 	 * or the unspecified address (when booting).
 	 */
 	src = ip6->ip6_src;
 	in6_clearscope(&src);
 	if (!IN6_IS_SCOPE_LINKLOCAL(&src) && !IN6_IS_ADDR_UNSPECIFIED(&src)) {
 		CTR3(KTR_MLD, "ignore v1 query src %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_src),
 		    ifp, if_name(ifp));
 		return (EINVAL);
 	}
 
 	/*
 	 * RFC2710 Section 4: MLDv1 reports must pertain to a multicast
 	 * group, and must be directed to the group itself.
 	 */
 	dst = ip6->ip6_dst;
 	in6_clearscope(&dst);
 	if (!IN6_IS_ADDR_MULTICAST(&mld->mld_addr) ||
 	    !IN6_ARE_ADDR_EQUAL(&mld->mld_addr, &dst)) {
 		CTR3(KTR_MLD, "ignore v1 query dst %s on ifp %p(%s)",
 		    ip6_sprintf(ip6tbuf, &ip6->ip6_dst),
 		    ifp, if_name(ifp));
 		return (EINVAL);
 	}
 
 	/*
 	 * Make sure we don't hear our own membership report, as fast
 	 * leave requires knowing that we are the only member of a
 	 * group. Assume we used the link-local address if available,
 	 * otherwise look for ::.
 	 *
 	 * XXX Note that scope ID comparison is needed for the address
 	 * returned by in6ifa_ifpforlinklocal(), but SHOULD NOT be
 	 * performed for the on-wire address.
 	 */
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	if ((ia && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, IA6_IN6(ia))) ||
 	    (ia == NULL && IN6_IS_ADDR_UNSPECIFIED(&src))) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (0);
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 
 	CTR3(KTR_MLD, "process v1 report %s on ifp %p(%s)",
 	    ip6_sprintf(ip6tbuf, &mld->mld_addr), ifp, if_name(ifp));
 
 	/*
 	 * Embed scope ID of receiving interface in MLD query for lookup
 	 * whilst we don't hold other locks (due to KAME locking lameness).
 	 */
 	if (!IN6_IS_ADDR_UNSPECIFIED(&mld->mld_addr))
 		in6_setscope(&mld->mld_addr, ifp, NULL);
 
 	IN6_MULTI_LIST_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * MLDv1 report suppression.
 	 * If we are a member of this group, and our membership should be
 	 * reported, and our group timer is pending or about to be reset,
 	 * stop our group timer by transitioning to the 'lazy' state.
 	 */
 	inm = in6m_lookup_locked(ifp, &mld->mld_addr);
 	if (inm != NULL) {
 		struct mld_ifsoftc *mli;
 
 		mli = inm->in6m_mli;
 		KASSERT(mli != NULL,
 		    ("%s: no mli for ifp %p", __func__, ifp));
 
 		/*
 		 * If we are in MLDv2 host mode, do not allow the
 		 * other host's MLDv1 report to suppress our reports.
 		 */
 		if (mli->mli_version == MLD_VERSION_2)
 			goto out_locked;
 
 		inm->in6m_timer = 0;
 
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 			break;
 		case MLD_REPORTING_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			CTR3(KTR_MLD,
 			    "report suppressed for %s on ifp %p(%s)",
 			    ip6_sprintf(ip6tbuf, &mld->mld_addr),
 			    ifp, if_name(ifp));
 		case MLD_LAZY_MEMBER:
 			inm->in6m_state = MLD_LAZY_MEMBER;
 			break;
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 		case MLD_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 out_locked:
 	MLD_UNLOCK();
 	IN6_MULTI_LIST_UNLOCK();
 
 	/* XXX Clear embedded scope ID as userland won't expect it. */
 	in6_clearscope(&mld->mld_addr);
 
 	return (0);
 }
 
 /*
  * MLD input path.
  *
  * Assume query messages which fit in a single ICMPv6 message header
  * have been pulled up.
  * Assume that userland will want to see the message, even if it
  * otherwise fails kernel input validation; do not free it.
  * Pullup may however free the mbuf chain m if it fails.
  *
  * mp points to the caller's mbuf chain pointer; it is rewritten with the
  * (possibly replaced) chain once both pullups have succeeded.  off is the
  * offset of the MLD header from the start of the chain's data, and
  * icmp6len is compared against the MLD header sizes to tell MLDv1 and
  * MLDv2 queries apart.
  *
  * Return IPPROTO_DONE if we freed m. Otherwise, return 0.
  */
 int
 mld_input(struct mbuf **mp, int off, int icmp6len)
 {
 	struct ifnet	*ifp;
 	struct ip6_hdr	*ip6;
 	struct mbuf	*m;
 	struct mld_hdr	*mld;
 	int		 mldlen;
 
 	m = *mp;
 	CTR3(KTR_MLD, "%s: called w/mbuf (%p,%d)", __func__, m, off);
 
 	ifp = m->m_pkthdr.rcvif;
 
 	/* Pullup to appropriate size. */
 	if (m->m_len < off + sizeof(*mld)) {
 		m = m_pullup(m, off + sizeof(*mld));
 		if (m == NULL) {
 			ICMP6STAT_INC(icp6s_badlen);
 			return (IPPROTO_DONE);
 		}
 	}
 	mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
 	/*
 	 * Only a query at least as long as an MLDv2 query needs the larger
 	 * header made contiguous; everything else needs just the base header.
 	 */
 	if (mld->mld_type == MLD_LISTENER_QUERY &&
 	    icmp6len >= sizeof(struct mldv2_query)) {
 		mldlen = sizeof(struct mldv2_query);
 	} else {
 		mldlen = sizeof(struct mld_hdr);
 	}
 	if (m->m_len < off + mldlen) {
 		m = m_pullup(m, off + mldlen);
 		if (m == NULL) {
 			ICMP6STAT_INC(icp6s_badlen);
 			return (IPPROTO_DONE);
 		}
 	}
 	/* m may have been reassigned by m_pullup(); re-derive the pointers. */
 	*mp = m;
 	ip6 = mtod(m, struct ip6_hdr *);
 	mld = (struct mld_hdr *)(mtod(m, uint8_t *) + off);
 
 	/*
 	 * Userland needs to see all of this traffic for implementing
 	 * the endpoint discovery portion of multicast routing.
 	 */
 	switch (mld->mld_type) {
 	case MLD_LISTENER_QUERY:
 		icmp6_ifstat_inc(ifp, ifs6_in_mldquery);
 		/*
 		 * A query whose length lies strictly between the two header
 		 * sizes is counted but handled by neither branch.
 		 */
 		if (icmp6len == sizeof(struct mld_hdr)) {
 			if (mld_v1_input_query(ifp, ip6, mld) != 0)
 				return (0);
 		} else if (icmp6len >= sizeof(struct mldv2_query)) {
 			if (mld_v2_input_query(ifp, ip6, m,
 			    (struct mldv2_query *)mld, off, icmp6len) != 0)
 				return (0);
 		}
 		break;
 	case MLD_LISTENER_REPORT:
 		icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
 		if (mld_v1_input_report(ifp, ip6, mld) != 0)
 			return (0);
 		break;
 	case MLDV2_LISTENER_REPORT:
 		/* Counted only; no further processing is done here. */
 		icmp6_ifstat_inc(ifp, ifs6_in_mldreport);
 		break;
 	case MLD_LISTENER_DONE:
 		/* Counted only; no further processing is done here. */
 		icmp6_ifstat_inc(ifp, ifs6_in_mlddone);
 		break;
 	default:
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Global fast timeout handler.
  *
  * Runs mld_fasttimo_vnet() once for every vnet from within a network
  * epoch section, hands the list the per-vnet handlers were given to
  * in6m_release_list_deferred(), and finally re-arms mldfast_callout
  * with a period of hz / MLD_FASTHZ.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 static struct callout mldfast_callout;
 static void
 mld_fasttimo(void *arg __unused)
 {
 	struct in6_multi_head deferred;
 	struct epoch_tracker tracker;
 	VNET_ITERATOR_DECL(vnet_it);
 
 	SLIST_INIT(&deferred);
 
 	NET_EPOCH_ENTER(tracker);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_it) {
 		CURVNET_SET(vnet_it);
 		mld_fasttimo_vnet(&deferred);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(tracker);
 
 	/* Deal with whatever the per-vnet handlers left on the list. */
 	in6m_release_list_deferred(&deferred);
 
 	callout_reset(&mldfast_callout, hz / MLD_FASTHZ, mld_fasttimo, NULL);
 }
 
 /*
  * Fast timeout handler (per-vnet).
  *
  * Returns immediately when none of the three "timers running" flags is
  * set.  Otherwise it first counts down the per-link MLDv2 general query
  * timers, then walks every group on every link and runs the group timer
  * routine that matches the link's MLD version, and finally transmits
  * whatever that produced.
  *
  * inmh is the list head handed to the group timer routines.  In MLDv1
  * mode every entry found on it is removed here and an MLDv1 report is
  * transmitted for it; in MLDv2 mode the list is left for the caller.
  *
  * VIMAGE: Assume caller has set up our curvnet.
  */
 static void
 mld_fasttimo_vnet(struct in6_multi_head *inmh)
 {
 	struct mbufq		 scq;	/* State-change packets */
 	struct mbufq		 qrq;	/* Query response packets */
 	struct ifnet		*ifp;
 	struct mld_ifsoftc	*mli;
 	struct ifmultiaddr	*ifma;
 	struct in6_multi	*inm;
 	int			 uri_fasthz;
 
 	uri_fasthz = 0;
 
 	/*
 	 * Quick check to see if any work needs to be done, in order to
 	 * minimize the overhead of fasttimo processing.
 	 * SMPng: XXX Unlocked reads.
 	 */
 	if (!V_current_state_timers_running6 &&
 	    !V_interface_timers_running6 &&
 	    !V_state_change_timers_running6)
 		return;
 
 	IN6_MULTI_LIST_LOCK();
 	MLD_LOCK();
 
 	/*
 	 * MLDv2 General Query response timer processing.
 	 */
 	if (V_interface_timers_running6) {
 		CTR1(KTR_MLD, "%s: interface timers running", __func__);
 
 		/*
 		 * Clear the flag up front; it is set again below for any
 		 * link whose timer is still counting down.
 		 */
 		V_interface_timers_running6 = 0;
 		LIST_FOREACH(mli, &V_mli_head, mli_link) {
 			if (mli->mli_v2_timer == 0) {
 				/* Do nothing. */
 			} else if (--mli->mli_v2_timer == 0) {
 				mld_v2_dispatch_general_query(mli);
 			} else {
 				V_interface_timers_running6 = 1;
 			}
 		}
 	}
 
 	if (!V_current_state_timers_running6 &&
 	    !V_state_change_timers_running6)
 		goto out_locked;
 
 	/*
 	 * Both flags are cleared here; the group timer routines called
 	 * below set them again when a timer is still pending.
 	 */
 	V_current_state_timers_running6 = 0;
 	V_state_change_timers_running6 = 0;
 
 	CTR1(KTR_MLD, "%s: state change timers running", __func__);
 
 	/*
 	 * MLD host report and state-change timer processing.
 	 * Note: Processing a v2 group timer may remove a node.
 	 */
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		ifp = mli->mli_ifp;
 
 		/*
 		 * The two packet queues are only initialised, and only
 		 * used further down, when the link is in MLDv2 mode.
 		 */
 		if (mli->mli_version == MLD_VERSION_2) {
 			uri_fasthz = MLD_RANDOM_DELAY(mli->mli_uri *
 			    MLD_FASTHZ);
 			mbufq_init(&qrq, MLD_MAX_G_GS_PACKETS);
 			mbufq_init(&scq, MLD_MAX_STATE_CHANGE_PACKETS);
 		}
 
 		IF_ADDR_WLOCK(ifp);
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			inm = in6m_ifmultiaddr_get_inm(ifma);
 			if (inm == NULL)
 				continue;
 			switch (mli->mli_version) {
 			case MLD_VERSION_1:
 				mld_v1_process_group_timer(inmh, inm);
 				break;
 			case MLD_VERSION_2:
 				mld_v2_process_group_timers(inmh, &qrq,
 				    &scq, inm, uri_fasthz);
 				break;
 			}
 		}
 		IF_ADDR_WUNLOCK(ifp);
 
 		switch (mli->mli_version) {
 		case MLD_VERSION_1:
 			/*
 			 * Transmit reports for this lifecycle.  This
 			 * is done while not holding IF_ADDR_LOCK
 			 * since this can call
 			 * in6ifa_ifpforlinklocal() which locks
 			 * IF_ADDR_LOCK internally as well as
 			 * ip6_output() to transmit a packet.
 			 */
 			while ((inm = SLIST_FIRST(inmh)) != NULL) {
 				SLIST_REMOVE_HEAD(inmh, in6m_defer);
 				(void)mld_v1_transmit_report(inm,
 				    MLD_LISTENER_REPORT);
 			}
 			break;
 		case MLD_VERSION_2:
 			/* Send the query responses, then the state changes. */
 			mld_dispatch_queue(&qrq, 0);
 			mld_dispatch_queue(&scq, 0);
 			break;
 		}
 	}
 
 out_locked:
 	MLD_UNLOCK();
 	IN6_MULTI_LIST_UNLOCK();
 }
 
 /*
  * Advance the MLDv1 report timer of a single group by one fast tick.
  *
  * A timer that is still non-zero after being decremented sets
  * V_current_state_timers_running6 and nothing else happens.  Otherwise,
  * if the timer has just reached zero and the group is in the REPORTING
  * state, the group moves to the IDLE state and is pushed onto inmh.
  * Every other state is left untouched.
  */
 static void
 mld_v1_process_group_timer(struct in6_multi_head *inmh, struct in6_multi *inm)
 {
 	int expired;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	expired = 0;
 	if (inm->in6m_timer != 0) {
 		if (--inm->in6m_timer != 0) {
 			/* Still counting down; keep the timer flag set. */
 			V_current_state_timers_running6 = 1;
 			return;
 		}
 		expired = 1;
 	}
 
 	/* Only a REPORTING group reacts to an expired report timer. */
 	if (expired && inm->in6m_state == MLD_REPORTING_MEMBER) {
 		inm->in6m_state = MLD_IDLE_MEMBER;
 		SLIST_INSERT_HEAD(inmh, inm, in6m_defer);
 	}
 }
 
 /*
  * Update a group's timers for MLDv2.
  * Will update the global pending timer flags.
  * Note: Unlocked read from mli.
  *
  * inmh is passed on to in6m_disconnect_locked() and in6m_rele_locked()
  * when a leaving group has no retransmissions left.  qrq receives the
  * current-state record enqueued when a pending query response timer
  * expires; scq is handed to mld_v2_merge_state_changes() when the
  * state-change retransmission timer expires.  uri_fasthz is the value
  * the state-change timer is reloaded with while retransmissions remain.
  */
 static void
 mld_v2_process_group_timers(struct in6_multi_head *inmh,
     struct mbufq *qrq, struct mbufq *scq,
     struct in6_multi *inm, const int uri_fasthz)
 {
 	int query_response_timer_expired;
 	int state_change_retransmit_timer_expired;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	query_response_timer_expired = 0;
 	state_change_retransmit_timer_expired = 0;
 
 	/*
 	 * During a transition from compatibility mode back to MLDv2,
 	 * a group record in REPORTING state may still have its group
 	 * timer active. This is a no-op in this function; it is easier
 	 * to deal with it here than to complicate the slow-timeout path.
 	 */
 	if (inm->in6m_timer == 0) {
 		query_response_timer_expired = 0;
 	} else if (--inm->in6m_timer == 0) {
 		query_response_timer_expired = 1;
 	} else {
 		V_current_state_timers_running6 = 1;
 	}
 
 	/* Same three-way countdown for the state-change timer. */
 	if (inm->in6m_sctimer == 0) {
 		state_change_retransmit_timer_expired = 0;
 	} else if (--inm->in6m_sctimer == 0) {
 		state_change_retransmit_timer_expired = 1;
 	} else {
 		V_state_change_timers_running6 = 1;
 	}
 
 	/* We are in fasttimo, so be quick about it. */
 	if (!state_change_retransmit_timer_expired &&
 	    !query_response_timer_expired)
 		return;
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_LAZY_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 	case MLD_IDLE_MEMBER:
 		break;
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		/*
 		 * Respond to a previously pending Group-Specific
 		 * or Group-and-Source-Specific query by enqueueing
 		 * the appropriate Current-State report for
 		 * immediate transmission.
 		 */
 		if (query_response_timer_expired) {
 			int retval __unused;
 
 			retval = mld_v2_enqueue_group_record(qrq, inm, 0, 1,
 			    (inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER),
 			    0);
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			in6m_clear_recorded(inm);
 		}
 		/* FALLTHROUGH */
 	case MLD_REPORTING_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		if (state_change_retransmit_timer_expired) {
 			/*
 			 * State-change retransmission timer fired.
 			 * If there are any further pending retransmissions,
 			 * set the global pending state-change flag, and
 			 * reset the timer.
 			 */
 			if (--inm->in6m_scrv > 0) {
 				inm->in6m_sctimer = uri_fasthz;
 				V_state_change_timers_running6 = 1;
 			}
 			/*
 			 * Retransmit the previously computed state-change
 			 * report. If there are no further pending
 			 * retransmissions, the mbuf queue will be consumed.
 			 * Update T0 state to T1 as we have now sent
 			 * a state-change.
 			 */
 			(void)mld_v2_merge_state_changes(inm, scq);
 
 			in6m_commit(inm);
 			CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 			    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 			    if_name(inm->in6m_ifp));
 
 			/*
 			 * If we are leaving the group for good, make sure
 			 * we release MLD's reference to it.
 			 * This release must be deferred using a SLIST,
 			 * as we are called from a loop which traverses
 			 * the in_ifmultiaddr TAILQ.
 			 */
 			if (inm->in6m_state == MLD_LEAVING_MEMBER &&
 			    inm->in6m_scrv == 0) {
 				inm->in6m_state = MLD_NOT_MEMBER;
 				in6m_disconnect_locked(inmh, inm);
 				in6m_rele_locked(inmh, inm);
 			}
 		}
 		break;
 	}
 }
 
 /*
  * Switch the given interface to a different MLD version, as per
  * Section 9.12.
  *
  * A request for MLDv1 reloads the link's v1 timer.  Whenever that timer
  * is positive and the link is not yet in MLDv1 mode, the link is moved
  * to MLDv1 and its pending MLDv2 timers are cancelled.
  */
 static void
 mld_set_version(struct mld_ifsoftc *mli, const int version)
 {
 	int ovqp_ticks;
 
 	MLD_LOCK_ASSERT();
 
 	CTR4(KTR_MLD, "%s: switching to v%d on ifp %p(%s)", __func__,
 	    version, mli->mli_ifp, if_name(mli->mli_ifp));
 
 	/*
 	 * The "Older Version Querier Present" timer of Section 9.12 is
 	 * (robustness * query interval) + query response interval,
 	 * scaled by MLD_SLOWHZ.
 	 */
 	if (version == MLD_VERSION_1) {
 		ovqp_ticks = (mli->mli_rv * mli->mli_qi) + mli->mli_qri;
 		ovqp_ticks *= MLD_SLOWHZ;
 		mli->mli_v1_timer = ovqp_ticks;
 	}
 
 	/* Drop to MLDv1 only while the timer is armed. */
 	if (mli->mli_version != MLD_VERSION_1 && mli->mli_v1_timer > 0) {
 		mli->mli_version = MLD_VERSION_1;
 		mld_v2_cancel_link_timers(mli);
 	}
 }
 
 /*
  * Cancel pending MLDv2 timers for the given link and all groups
  * joined on it; state-change, general-query, and group-query timers.
  *
  * Returns without touching anything when none of the global timer
  * flags is set.  Otherwise the link's general query timer is cleared
  * and every group on the link that is in a LEAVING, query-pending or
  * REPORTING state has both of its timers cleared, is placed in the
  * REPORTING state, and has its queued state-change records freed.
  */
 static void
 mld_v2_cancel_link_timers(struct mld_ifsoftc *mli)
 {
 	struct epoch_tracker	 et;
 	struct in6_multi_head	 inmh;
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in6_multi	*inm;
 
 	CTR3(KTR_MLD, "%s: cancel v2 timers on ifp %p(%s)", __func__,
 	    mli->mli_ifp, if_name(mli->mli_ifp));
 
 	/* Local list collecting releases until the address lock is dropped. */
 	SLIST_INIT(&inmh);
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/*
 	 * Fast-track this potentially expensive operation
 	 * by checking all the global 'timer pending' flags.
 	 */
 	if (!V_interface_timers_running6 &&
 	    !V_state_change_timers_running6 &&
 	    !V_current_state_timers_running6)
 		return;
 
 	mli->mli_v2_timer = 0;
 
 	ifp = mli->mli_ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_LAZY_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			break;
 		case MLD_LEAVING_MEMBER:
 			/*
 			 * If we are leaving the group and switching
 			 * version, we need to release the final
 			 * reference held for issuing the INCLUDE {}.
 			 */
 			if (inm->in6m_refcount == 1)
 				in6m_disconnect_locked(&inmh, inm);
 			in6m_rele_locked(&inmh, inm);
 			/* FALLTHROUGH */
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 			in6m_clear_recorded(inm);
 			/* FALLTHROUGH */
 		case MLD_REPORTING_MEMBER:
 			inm->in6m_sctimer = 0;
 			inm->in6m_timer = 0;
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			/*
 			 * Free any pending MLDv2 state-change records.
 			 */
 			mbufq_drain(&inm->in6m_scq);
 			break;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	IF_ADDR_WUNLOCK(ifp);
 	/* Process the collected list now that the address lock is dropped. */
 	in6m_release_list_deferred(&inmh);
 }
 
 /*
  * Global slow timeout handler.
  *
  * Calls mld_slowtimo_vnet() with each vnet in turn set as the current
  * one, then re-arms mldslow_callout with a period of hz / MLD_SLOWHZ.
  * VIMAGE: Timeout handlers are expected to service all vimages.
  */
 static struct callout mldslow_callout;
 static void
 mld_slowtimo(void *arg __unused)
 {
 	VNET_ITERATOR_DECL(vnet_it);
 
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_it) {
 		CURVNET_SET(vnet_it);
 		mld_slowtimo_vnet();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 
 	/* Schedule the next run. */
 	callout_reset(&mldslow_callout, hz / MLD_SLOWHZ, mld_slowtimo, NULL);
 }
 
 /*
  * Per-vnet slowtimo handler.
  */
 static void
 mld_slowtimo_vnet(void)
 {
 	struct mld_ifsoftc *mli;
 
 	MLD_LOCK();
 
 	LIST_FOREACH(mli, &V_mli_head, mli_link) {
 		mld_v1_process_querier_timers(mli);
 	}
 
 	MLD_UNLOCK();
 }
 
 /*
  * Run one slow tick of the Older Version Querier Present timer for a
  * link; see Section 9.12 of RFC 3810.
  *
  * Links already in MLDv2 mode are left alone.  For any other link the
  * v1 timer is decremented, and the link reverts to MLDv2 when the timer
  * reaches zero.
  */
 static void
 mld_v1_process_querier_timers(struct mld_ifsoftc *mli)
 {
 
 	MLD_LOCK_ASSERT();
 
 	/* Nothing to count down while the link speaks MLDv2. */
 	if (mli->mli_version == MLD_VERSION_2)
 		return;
 
 	/* Consume one tick; keep waiting unless that was the last one. */
 	if (--mli->mli_v1_timer != 0)
 		return;
 
 	/* MLDv1 Querier Present timer expired; revert to MLDv2. */
 	CTR5(KTR_MLD,
 	    "%s: transition from v%d -> v%d on %p(%s)",
 	    __func__, mli->mli_version, MLD_VERSION_2,
 	    mli->mli_ifp, if_name(mli->mli_ifp));
 	mli->mli_version = MLD_VERSION_2;
 }
 
 /*
  * Transmit an MLDv1 report immediately.
  */
 static int
 mld_v1_transmit_report(struct in6_multi *in6m, const int type)
 {
 	struct ifnet		*ifp;
 	struct in6_ifaddr	*ia;
 	struct ip6_hdr		*ip6;
 	struct mbuf		*mh, *md;
 	struct mld_hdr		*mld;
 
 	NET_EPOCH_ASSERT();
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	ifp = in6m->in6m_ifp;
 	/* in process of being freed */
 	if (ifp == NULL)
 		return (0);
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	/* ia may be NULL if link-local address is tentative. */
 
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (ENOMEM);
 	}
 	md = m_get(M_NOWAIT, MT_DATA);
 	if (md == NULL) {
 		m_free(mh);
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		return (ENOMEM);
 	}
 	mh->m_next = md;
 
 	/*
 	 * FUTURE: Consider increasing alignment by ETHER_HDR_LEN, so
 	 * that ether_output() does not need to allocate another mbuf
 	 * for the header in the most common case.
 	 */
 	M_ALIGN(mh, sizeof(struct ip6_hdr));
 	mh->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
 	mh->m_len = sizeof(struct ip6_hdr);
 
 	ip6 = mtod(mh, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
 	ip6->ip6_dst = in6m->in6m_addr;
 
 	md->m_len = sizeof(struct mld_hdr);
 	mld = mtod(md, struct mld_hdr *);
 	mld->mld_type = type;
 	mld->mld_code = 0;
 	mld->mld_cksum = 0;
 	mld->mld_maxdelay = 0;
 	mld->mld_reserved = 0;
 	mld->mld_addr = in6m->in6m_addr;
 	in6_clearscope(&mld->mld_addr);
 	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), sizeof(struct mld_hdr));
 
 	mld_save_context(mh, ifp);
 	mh->m_flags |= M_MLDV1;
 
 	mld_dispatch_packet(mh);
 
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	return (0);
 }
 
 /*
  * Process a state change from the upper layer for the given IPv6 group.
  *
  * Each socket holds a reference on the in_multi in its own ip_moptions.
  * The socket layer will have made the necessary updates to the group
  * state; it is now up to MLD to issue a state change report if there
  * has been any change between T0 (when the last state-change was issued)
  * and T1 (now).
  *
  * We use the MLDv2 state machine at group level. The MLD module
  * however makes the decision as to which MLD protocol version to speak.
  * A state change *from* INCLUDE {} always means an initial join.
  * A state change *to* INCLUDE {} always means a final leave.
  *
  * If delay is non-zero, and the state change is an initial multicast
  * join, the state change report will be delayed by 'delay' ticks
  * in units of MLD_FASTHZ if MLDv1 is active on the link; otherwise
  * the initial MLDv2 state change report will be delayed by whichever
  * is sooner, a pending state-change timer or delay itself.
  *
  * Returns 0 when the group is already disconnected, when its interface
  * has gone away, or after a final leave; otherwise the result of
  * mld_initial_join() or mld_handle_state_change().
  *
  * VIMAGE: curvnet should have been set by caller, as this routine
  * is called from the socket option handlers.
  */
 int
 mld_change_state(struct in6_multi *inm, const int delay)
 {
 	struct mld_ifsoftc *mli;
 	struct ifnet *ifp;
 	int error;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	error = 0;
 
 	/* A group that has already been disconnected needs no work. */
 	if (inm->in6m_ifp == NULL) {
 		CTR1(KTR_MLD, "%s: inm is disconnected", __func__);
 		return (0);
 	}
 
 	/*
 	 * The upper layer may have asked for a state change on an
 	 * interface which has since gone away; try to detect that.
 	 */
 	KASSERT(inm->in6m_ifma != NULL, ("%s: no ifma", __func__));
 	ifp = inm->in6m_ifma->ifma_ifp;
 	if (ifp == NULL)
 		return (0);
 
 	/* netinet6 and net must agree on which ifp this is. */
 	KASSERT(inm->in6m_ifp == ifp, ("%s: bad ifp", __func__));
 
 	MLD_LOCK();
 	mli = MLD_IFINFO(ifp);
 	KASSERT(mli != NULL, ("%s: no mld_ifsoftc for ifp %p", __func__, ifp));
 
 	/*
 	 * A filter mode transition to or from MCAST_UNDEFINED marks the
 	 * start or the end of an MLD life cycle for this group.  Any
 	 * other change is an intermediate state change.
 	 */
 	if (inm->in6m_st[0].iss_fmode == inm->in6m_st[1].iss_fmode) {
 		CTR1(KTR_MLD, "%s: filter set change", __func__);
 	} else {
 		CTR3(KTR_MLD, "%s: inm transition %d -> %d", __func__,
 		    inm->in6m_st[0].iss_fmode, inm->in6m_st[1].iss_fmode);
 		if (inm->in6m_st[0].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_MLD, "%s: initial join", __func__);
 			error = mld_initial_join(inm, mli, delay);
 			goto out_locked;
 		}
 		if (inm->in6m_st[1].iss_fmode == MCAST_UNDEFINED) {
 			CTR1(KTR_MLD, "%s: final leave", __func__);
 			mld_final_leave(inm, mli);
 			goto out_locked;
 		}
 	}
 
 	error = mld_handle_state_change(inm, mli);
 
 out_locked:
 	MLD_UNLOCK();
 	return (error);
 }
 
 /*
  * Perform the initial join for an MLD group.
  *
  * When joining a group:
  *  If the group should have its MLD traffic suppressed, do nothing.
  *  MLDv1 starts sending MLDv1 host membership reports.
  *  MLDv2 will schedule an MLDv2 state-change report containing the
  *  initial state of the membership.
  *
  * If the delay argument is non-zero, then we must delay sending the
  * initial state change for delay ticks (in units of MLD_FASTHZ).
  *
  * Returns 0 on success.  In MLDv1 mode a failure of
  * mld_v1_transmit_report() is returned as is; in MLDv2 mode a negative
  * result from mld_v2_enqueue_group_record() is returned negated.
  */
 static int
 mld_initial_join(struct in6_multi *inm, struct mld_ifsoftc *mli,
     const int delay)
 {
 	struct epoch_tracker     et;
 	struct ifnet		*ifp;
 	struct mbufq		*mq;
 	int			 error, retval, syncstates;
 	int			 odelay;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: initial join %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	error = 0;
 	/* Cleared below only on the MLDv2 path, which commits later. */
 	syncstates = 1;
 
 	ifp = inm->in6m_ifp;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli && mli->mli_ifp == ifp, ("%s: inconsistent ifp", __func__));
 
 	/*
 	 * Groups joined on loopback or marked as 'not reported',
 	 * enter the MLD_SILENT_MEMBER state and
 	 * are never reported in any protocol exchanges.
 	 * All other groups enter the appropriate state machine
 	 * for the version in use on this link.
 	 * A link marked as MLIF_SILENT causes MLD to be completely
 	 * disabled for the link.
 	 */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (mli->mli_flags & MLIF_SILENT) ||
 	    !mld_is_addr_reported(&inm->in6m_addr)) {
 		CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		inm->in6m_state = MLD_SILENT_MEMBER;
 		inm->in6m_timer = 0;
 	} else {
 		/*
 		 * Deal with overlapping in_multi lifecycle.
 		 * If this group was LEAVING, then make sure
 		 * we drop the reference we picked up to keep the
 		 * group around for the final INCLUDE {} enqueue.
 		 */
 		if (mli->mli_version == MLD_VERSION_2 &&
 		    inm->in6m_state == MLD_LEAVING_MEMBER) {
 			inm->in6m_refcount--;
 			MPASS(inm->in6m_refcount > 0);
 		}
 		inm->in6m_state = MLD_REPORTING_MEMBER;
 
 		switch (mli->mli_version) {
 		case MLD_VERSION_1:
 			/*
 			 * If a delay was provided, only use it if
 			 * it is greater than the delay normally
 			 * used for an MLDv1 state change report,
 			 * and delay sending the initial MLDv1 report
 			 * by not transitioning to the IDLE state.
 			 */
 			odelay = MLD_RANDOM_DELAY(MLD_V1_MAX_RI * MLD_FASTHZ);
 			if (delay) {
 				inm->in6m_timer = max(delay, odelay);
 				V_current_state_timers_running6 = 1;
 			} else {
 				inm->in6m_state = MLD_IDLE_MEMBER;
 				NET_EPOCH_ENTER(et);
 				error = mld_v1_transmit_report(inm,
 				     MLD_LISTENER_REPORT);
 				NET_EPOCH_EXIT(et);
 				/* Arm the group timer only if the send worked. */
 				if (error == 0) {
 					inm->in6m_timer = odelay;
 					V_current_state_timers_running6 = 1;
 				}
 			}
 			break;
 
 		case MLD_VERSION_2:
 			/*
 			 * Defer update of T0 to T1, until the first copy
 			 * of the state change has been transmitted.
 			 */
 			syncstates = 0;
 
 			/*
 			 * Immediately enqueue a State-Change Report for
 			 * this interface, freeing any previous reports.
 			 * Don't kick the timers if there is nothing to do,
 			 * or if an error occurred.
 			 */
 			mq = &inm->in6m_scq;
 			mbufq_drain(mq);
 			retval = mld_v2_enqueue_group_record(mq, inm, 1,
 			    0, 0, (mli->mli_flags & MLIF_USEALLOW));
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			if (retval <= 0) {
 				/* Zero stays zero; a negative code is negated. */
 				error = retval * -1;
 				break;
 			}
 
 			/*
 			 * Schedule transmission of pending state-change
 			 * report up to RV times for this link. The timer
 			 * will fire at the next mld_fasttimo (~200ms),
 			 * giving us an opportunity to merge the reports.
 			 *
 			 * If a delay was provided to this function, only
 			 * use this delay if sooner than the existing one.
 			 */
 			KASSERT(mli->mli_rv > 1,
 			   ("%s: invalid robustness %d", __func__,
 			    mli->mli_rv));
 			inm->in6m_scrv = mli->mli_rv;
 			if (delay) {
 				if (inm->in6m_sctimer > 1) {
 					inm->in6m_sctimer =
 					    min(inm->in6m_sctimer, delay);
 				} else
 					inm->in6m_sctimer = delay;
 			} else
 				inm->in6m_sctimer = 1;
 			V_state_change_timers_running6 = 1;
 
 			error = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Only update the T0 state if state change is atomic,
 	 * i.e. we don't need to wait for a timer to fire before we
 	 * can consider the state change to have been communicated.
 	 */
 	if (syncstates) {
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 	}
 
 	return (error);
 }
 
 /*
  * Issue an intermediate state change during the life-cycle of a group.
  *
  * On a loopback or silent link, for a group that is not reported, or
  * when the link is not in MLDv2 mode, the pending state is committed
  * and nothing is sent.  Otherwise any queued state-change records are
  * replaced by a freshly enqueued one and, if something was enqueued,
  * the group's state-change timer is started.
  *
  * Returns 0 on success, or the negated result of
  * mld_v2_enqueue_group_record() when that result is not positive.
  */
 static int
 mld_handle_state_change(struct in6_multi *inm, struct mld_ifsoftc *mli)
 {
 	struct ifnet		*ifp;
 	int			 nqueued;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	CTR4(KTR_MLD, "%s: state change for %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	ifp = inm->in6m_ifp;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli && mli->mli_ifp == ifp,
 	    ("%s: inconsistent ifp", __func__));
 
 	/* Cases in which no report is generated at all. */
 	if ((ifp->if_flags & IFF_LOOPBACK) ||
 	    (mli->mli_flags & MLIF_SILENT) ||
 	    !mld_is_addr_reported(&inm->in6m_addr) ||
 	    (mli->mli_version != MLD_VERSION_2)) {
 		if (!mld_is_addr_reported(&inm->in6m_addr)) {
 			CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		}
 		CTR1(KTR_MLD, "%s: nothing to do", __func__);
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		return (0);
 	}
 
 	/* Replace whatever was queued with a record for the new state. */
 	mbufq_drain(&inm->in6m_scq);
 	nqueued = mld_v2_enqueue_group_record(&inm->in6m_scq, inm, 1, 0, 0,
 	    (mli->mli_flags & MLIF_USEALLOW));
 	CTR2(KTR_MLD, "%s: enqueue record = %d", __func__, nqueued);
 
 	if (nqueued > 0) {
 		/*
 		 * Record(s) were enqueued, so start the state-change
 		 * report timer for this group.
 		 */
 		inm->in6m_scrv = mli->mli_rv;
 		inm->in6m_sctimer = 1;
 		V_state_change_timers_running6 = 1;
 		return (0);
 	}
 
 	return (-nqueued);
 }
 
 /*
  * Perform the final leave for a multicast address.
  *
  * When leaving a group:
  *  MLDv1 sends a DONE message, if and only if we are the reporter.
  *  MLDv2 enqueues a state-change report containing a transition
  *  to INCLUDE {} for immediate transmission.
  *
  * Unless an MLDv2 state-change report was scheduled, the pending state
  * is committed before returning and the T1 filter mode is set to
  * MCAST_UNDEFINED.
  */
 static void
 mld_final_leave(struct in6_multi *inm, struct mld_ifsoftc *mli)
 {
 	struct epoch_tracker     et;
 	int syncstates;
 #ifdef KTR
 	char ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	/* Cleared only when an MLDv2 report is left pending below. */
 	syncstates = 1;
 
 	CTR4(KTR_MLD, "%s: final leave %s on ifp %p(%s)",
 	    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    inm->in6m_ifp, if_name(inm->in6m_ifp));
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	switch (inm->in6m_state) {
 	case MLD_NOT_MEMBER:
 	case MLD_SILENT_MEMBER:
 	case MLD_LEAVING_MEMBER:
 		/* Already leaving or left; do nothing. */
 		CTR1(KTR_MLD,
 "%s: not kicking state machine for silent group", __func__);
 		break;
 	case MLD_REPORTING_MEMBER:
 	case MLD_IDLE_MEMBER:
 	case MLD_G_QUERY_PENDING_MEMBER:
 	case MLD_SG_QUERY_PENDING_MEMBER:
 		if (mli->mli_version == MLD_VERSION_1) {
 #ifdef INVARIANTS
 			/* The query-pending states are set only for MLDv2. */
 			if (inm->in6m_state == MLD_G_QUERY_PENDING_MEMBER ||
 			    inm->in6m_state == MLD_SG_QUERY_PENDING_MEMBER)
 			panic("%s: MLDv2 state reached, not MLDv2 mode",
 			     __func__);
 #endif
 			/* The result of the transmit is not checked. */
 			NET_EPOCH_ENTER(et);
 			mld_v1_transmit_report(inm, MLD_LISTENER_DONE);
 			NET_EPOCH_EXIT(et);
 			inm->in6m_state = MLD_NOT_MEMBER;
 			V_current_state_timers_running6 = 1;
 		} else if (mli->mli_version == MLD_VERSION_2) {
 			/*
 			 * Stop group timer and all pending reports.
 			 * Immediately enqueue a state-change report
 			 * TO_IN {} to be sent on the next fast timeout,
 			 * giving us an opportunity to merge reports.
 			 */
 			mbufq_drain(&inm->in6m_scq);
 			inm->in6m_timer = 0;
 			inm->in6m_scrv = mli->mli_rv;
 			CTR4(KTR_MLD, "%s: Leaving %s/%s with %d "
 			    "pending retransmissions.", __func__,
 			    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 			    if_name(inm->in6m_ifp), inm->in6m_scrv);
 			if (inm->in6m_scrv == 0) {
 				/* No retransmissions configured; leave now. */
 				inm->in6m_state = MLD_NOT_MEMBER;
 				inm->in6m_sctimer = 0;
 			} else {
 				int retval __diagused;
 
 				/*
 				 * Take a reference on the group before
 				 * the report is enqueued and the group
 				 * enters the LEAVING state.
 				 */
 				in6m_acquire_locked(inm);
 
 				retval = mld_v2_enqueue_group_record(
 				    &inm->in6m_scq, inm, 1, 0, 0,
 				    (mli->mli_flags & MLIF_USEALLOW));
 				KASSERT(retval != 0,
 				    ("%s: enqueue record = %d", __func__,
 				     retval));
 
 				inm->in6m_state = MLD_LEAVING_MEMBER;
 				inm->in6m_sctimer = 1;
 				V_state_change_timers_running6 = 1;
 				syncstates = 0;
 			}
 			break;
 		}
 		break;
 	case MLD_LAZY_MEMBER:
 	case MLD_SLEEPING_MEMBER:
 	case MLD_AWAKENING_MEMBER:
 		/* Our reports are suppressed; do nothing. */
 		break;
 	}
 
 	if (syncstates) {
 		in6m_commit(inm);
 		CTR3(KTR_MLD, "%s: T1 -> T0 for %s/%s", __func__,
 		    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		inm->in6m_st[1].iss_fmode = MCAST_UNDEFINED;
 		CTR3(KTR_MLD, "%s: T1 now MCAST_UNDEFINED for %p/%s",
 		    __func__, &inm->in6m_addr, if_name(inm->in6m_ifp));
 	}
 }
 
 /*
  * Enqueue an MLDv2 group record to the given output queue.
  *
  * If is_state_change is zero, a current-state record is appended.
  * If is_state_change is non-zero, a state-change report is appended.
  *
  * If is_group_query is non-zero, an mbuf packet chain is allocated.
  * If is_group_query is zero, and if there is a packet with free space
  * at the tail of the queue, it will be appended to providing there
  * is enough free space.
  * Otherwise a new mbuf packet chain is allocated.
  *
  * If is_source_query is non-zero, each source is checked to see if
  * it was recorded for a Group-Source query, and will be omitted if
  * it is not both in-mode and recorded.
  *
  * If use_block_allow is non-zero, state change reports for initial join
  * and final leave, on an inclusive mode group with a source list, will be
  * rewritten to use the ALLOW_NEW and BLOCK_OLD record types, respectively.
  *
  * The function will attempt to allocate leading space in the packet
  * for the IPv6+ICMP headers to be prepended without fragmenting the chain.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 mld_v2_enqueue_group_record(struct mbufq *mq, struct in6_multi *inm,
     const int is_state_change, const int is_group_query,
     const int is_source_query, const int use_block_allow)
 {
 	struct mldv2_record	 mr;
 	struct mldv2_record	*pmr;
 	struct ifnet		*ifp;
 	struct ip6_msource	*ims, *nims;
 	struct mbuf		*m0, *m, *md;
 	int			 is_filter_list_change;
 	int			 minrec0len, m0srcs, msrcs, nbytes, off;
 	int			 record_has_sources;
 	int			 now;
 	int			 type;
 	uint8_t			 mode;
 #ifdef KTR
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	ifp = inm->in6m_ifp;
 	is_filter_list_change = 0;
 	m = NULL;
 	m0 = NULL;
 	m0srcs = 0;
 	msrcs = 0;
 	nbytes = 0;		/* running total returned to the caller */
 	nims = NULL;		/* resume point for continuation packets */
 	record_has_sources = 1;
 	pmr = NULL;
 	type = MLD_DO_NOTHING;
 	mode = inm->in6m_st[1].iss_fmode;
 
 	/*
 	 * If we did not transition out of ASM mode during t0->t1,
 	 * and there are no source nodes to process, we can skip
 	 * the generation of source records.
 	 */
 	if (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0 &&
 	    inm->in6m_nsrc == 0)
 		record_has_sources = 0;
 
 	if (is_state_change) {
 		/*
 		 * Queue a state change record.
 		 * If the mode did not change, and there are non-ASM
 		 * listeners or source filters present,
 		 * we potentially need to issue two records for the group.
 		 * If there are ASM listeners, and there was no filter
 		 * mode transition of any kind, do nothing.
 		 *
 		 * If we are transitioning to MCAST_UNDEFINED, we need
 		 * not send any sources. A transition to/from this state is
 		 * considered inclusive with some special treatment.
 		 *
 		 * If we are rewriting initial joins/leaves to use
 		 * ALLOW/BLOCK, and the group's membership is inclusive,
 		 * we need to send sources in all cases.
 		 */
 		if (mode != inm->in6m_st[0].iss_fmode) {
 			if (mode == MCAST_EXCLUDE) {
 				CTR1(KTR_MLD, "%s: change to EXCLUDE",
 				    __func__);
 				type = MLD_CHANGE_TO_EXCLUDE_MODE;
 			} else {
 				CTR1(KTR_MLD, "%s: change to INCLUDE",
 				    __func__);
 				if (use_block_allow) {
 					/*
 					 * XXX
 					 * Here we're interested in state
 					 * edges either direction between
 					 * MCAST_UNDEFINED and MCAST_INCLUDE.
 					 * Perhaps we should just check
 					 * the group state, rather than
 					 * the filter mode.
 					 */
 					if (mode == MCAST_UNDEFINED) {
 						type = MLD_BLOCK_OLD_SOURCES;
 					} else {
 						type = MLD_ALLOW_NEW_SOURCES;
 					}
 				} else {
 					type = MLD_CHANGE_TO_INCLUDE_MODE;
 					if (mode == MCAST_UNDEFINED)
 						record_has_sources = 0;
 				}
 			}
 		} else {
 			if (record_has_sources) {
 				is_filter_list_change = 1;
 			} else {
 				type = MLD_DO_NOTHING;
 			}
 		}
 	} else {
 		/*
 		 * Queue a current state record.
 		 */
 		if (mode == MCAST_EXCLUDE) {
 			type = MLD_MODE_IS_EXCLUDE;
 		} else if (mode == MCAST_INCLUDE) {
 			type = MLD_MODE_IS_INCLUDE;
 			KASSERT(inm->in6m_st[1].iss_asm == 0,
 			    ("%s: inm %p is INCLUDE but ASM count is %d",
 			     __func__, inm, inm->in6m_st[1].iss_asm));
 		}
 	}
 
 	/*
 	 * Generate the filter list changes using a separate function.
 	 */
 	if (is_filter_list_change)
 		return (mld_v2_enqueue_filter_change(mq, inm));
 
 	if (type == MLD_DO_NOTHING) {
 		CTR3(KTR_MLD, "%s: nothing to do for %s/%s",
 		    __func__, ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 		    if_name(inm->in6m_ifp));
 		return (0);
 	}
 
 	/*
 	 * If any sources are present, we must be able to fit at least
 	 * one in the trailing space of the tail packet's mbuf,
 	 * ideally more.
 	 */
 	minrec0len = sizeof(struct mldv2_record);
 	if (record_has_sources)
 		minrec0len += sizeof(struct in6_addr);
 
 	CTR4(KTR_MLD, "%s: queueing %s for %s/%s", __func__,
 	    mld_rec_type_to_str(type),
 	    ip6_sprintf(ip6tbuf, &inm->in6m_addr),
 	    if_name(inm->in6m_ifp));
 
 	/*
 	 * Check if we have a packet in the tail of the queue for this
 	 * group into which the first group record for this group will fit.
 	 * Otherwise allocate a new packet.
 	 * Always allocate leading space for IP6+RA+ICMPV6+REPORT.
 	 * Note: Group records for G/GSR query responses MUST be sent
 	 * in their own packet.
 	 */
 	m0 = mbufq_last(mq);
 	if (!is_group_query &&
 	    m0 != NULL &&
 	    (m0->m_pkthdr.vt_nrecs + 1 <= MLD_V2_REPORT_MAXRECS) &&
 	    (m0->m_pkthdr.len + minrec0len) <
 	     (ifp->if_mtu - MLD_MTUSPACE)) {
 		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 			    sizeof(struct mldv2_record)) /
 			    sizeof(struct in6_addr);
 		m = m0;
 		CTR1(KTR_MLD, "%s: use existing packet", __func__);
 	} else {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_MLD, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = NULL;
 		m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 		    sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
 		/* Try a cluster first for current-state (non-query) records. */
 		if (!is_state_change && !is_group_query)
 			m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (-ENOMEM);
 
 		mld_save_context(m, ifp);
 
 		CTR1(KTR_MLD, "%s: allocated first packet", __func__);
 	}
 
 	/*
 	 * Append group record.
 	 * If we have sources, we don't know how many yet.
 	 */
 	mr.mr_type = type;
 	mr.mr_datalen = 0;
 	mr.mr_numsrc = 0;
 	mr.mr_addr = inm->in6m_addr;
 	in6_clearscope(&mr.mr_addr);
 	if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
 		/* Only free packets we allocated; m0 is owned by the queue. */
 		if (m != m0)
 			m_freem(m);
 		CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
 		return (-ENOMEM);
 	}
 	nbytes += sizeof(struct mldv2_record);
 
 	/*
 	 * Append as many sources as will fit in the first packet.
 	 * If we are appending to a new packet, the chain allocation
 	 * may potentially use clusters; use m_getptr() in this case.
 	 * If we are appending to an existing packet, we need to obtain
 	 * a pointer to the group record after m_append(), in case a new
 	 * mbuf was allocated.
 	 *
 	 * Only append sources which are in-mode at t1. If we are
 	 * transitioning to MCAST_UNDEFINED state on the group, and
 	 * use_block_allow is zero, do not include source entries.
 	 * Otherwise, we need to include this source in the report.
 	 *
 	 * Only report recorded sources in our filter set when responding
 	 * to a group-source query.
 	 */
 	if (record_has_sources) {
 		if (m == m0) {
 			/*
 			 * nbytes is exactly the record header size here, so
 			 * this locates the header just appended at the tail.
 			 */
 			md = m_last(m);
 			pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
 			    md->m_len - nbytes);
 		} else {
 			md = m_getptr(m, 0, &off);
 			pmr = (struct mldv2_record *)(mtod(md, uint8_t *) +
 			    off);
 		}
 		msrcs = 0;
 		RB_FOREACH_SAFE(ims, ip6_msource_tree, &inm->in6m_srcs,
 		    nims) {
 			CTR2(KTR_MLD, "%s: visit node %s", __func__,
 			    ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 			now = im6s_get_mode(inm, ims, 1);
 			CTR2(KTR_MLD, "%s: node is %d", __func__, now);
 			if ((now != mode) ||
 			    (now == mode &&
 			     (!use_block_allow && mode == MCAST_UNDEFINED))) {
 				CTR1(KTR_MLD, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->im6s_stp == 0) {
 				CTR1(KTR_MLD, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_MLD, "%s: append node", __func__);
 			if (!m_append(m, sizeof(struct in6_addr),
 			    (void *)&ims->im6s_addr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			/*
 			 * The byte count for the sources is added once,
 			 * after the loop, exactly as the continuation
 			 * packet loop below does; counting here as well
 			 * would report each source twice.
 			 */
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		CTR2(KTR_MLD, "%s: msrcs is %d this packet", __func__,
 		    msrcs);
 		pmr->mr_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(struct in6_addr));
 	}
 
 	if (is_source_query && msrcs == 0) {
 		CTR1(KTR_MLD, "%s: no recorded sources to report", __func__);
 		if (m != m0)
 			m_freem(m);
 		return (0);
 	}
 
 	/*
 	 * We are good to go with first packet.
 	 */
 	if (m != m0) {
 		CTR1(KTR_MLD, "%s: enqueueing first packet", __func__);
 		m->m_pkthdr.vt_nrecs = 1;
 		mbufq_enqueue(mq, m);
 	} else
 		m->m_pkthdr.vt_nrecs++;
 
 	/*
 	 * No further work needed if no source list in packet(s).
 	 */
 	if (!record_has_sources)
 		return (nbytes);
 
 	/*
 	 * Whilst sources remain to be announced, we need to allocate
 	 * a new packet and fill out as many sources as will fit.
 	 * Always try for a cluster first.
 	 */
 	while (nims != NULL) {
 		if (mbufq_full(mq)) {
 			CTR1(KTR_MLD, "%s: outbound queue full", __func__);
 			return (-ENOMEM);
 		}
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 		if (m == NULL)
 			m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (-ENOMEM);
 		mld_save_context(m, ifp);
 		/* Record header will land at the start of the new chain. */
 		md = m_getptr(m, 0, &off);
 		pmr = (struct mldv2_record *)(mtod(md, uint8_t *) + off);
 		CTR1(KTR_MLD, "%s: allocated next packet", __func__);
 
 		if (!m_append(m, sizeof(struct mldv2_record), (void *)&mr)) {
 			if (m != m0)
 				m_freem(m);
 			CTR1(KTR_MLD, "%s: m_append() failed.", __func__);
 			return (-ENOMEM);
 		}
 		m->m_pkthdr.vt_nrecs = 1;
 		nbytes += sizeof(struct mldv2_record);
 
 		m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 		    sizeof(struct mldv2_record)) / sizeof(struct in6_addr);
 
 		msrcs = 0;
 		RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
 			CTR2(KTR_MLD, "%s: visit node %s",
 			    __func__, ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 			now = im6s_get_mode(inm, ims, 1);
 			if ((now != mode) ||
 			    (now == mode &&
 			     (!use_block_allow && mode == MCAST_UNDEFINED))) {
 				CTR1(KTR_MLD, "%s: skip node", __func__);
 				continue;
 			}
 			if (is_source_query && ims->im6s_stp == 0) {
 				CTR1(KTR_MLD, "%s: skip unrecorded node",
 				    __func__);
 				continue;
 			}
 			CTR1(KTR_MLD, "%s: append node", __func__);
 			if (!m_append(m, sizeof(struct in6_addr),
 			    (void *)&ims->im6s_addr)) {
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD, "%s: m_append() failed.",
 				    __func__);
 				return (-ENOMEM);
 			}
 			++msrcs;
 			if (msrcs == m0srcs)
 				break;
 		}
 		pmr->mr_numsrc = htons(msrcs);
 		nbytes += (msrcs * sizeof(struct in6_addr));
 
 		CTR1(KTR_MLD, "%s: enqueueing next packet", __func__);
 		mbufq_enqueue(mq, m);
 	}
 
 	return (nbytes);
 }
 
 /*
  * Type used to mark record pass completion.
  * We exploit the fact we can cast to this easily from the
  * current filter modes on each ip6_msource node.
  */
 typedef enum {
 	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
 	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
 	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
 	/* Mask value meaning both the ALLOW and BLOCK passes are done. */
 	REC_FULL = REC_ALLOW | REC_BLOCK
 } rectype_t;
 
 /*
  * Enqueue an MLDv2 filter list change to the given output queue.
  *
  * Source list filter state is held in an RB-tree. When the filter list
  * for a group is changed without changing its mode, we need to compute
  * the deltas between T0 and T1 for each source in the filter set,
  * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
  *
  * As we may potentially queue two record types, and the entire R-B tree
  * needs to be walked at once, we break this out into its own function
  * so we can generate a tightly packed queue of packets.
  *
  * XXX This could be written to only use one tree walk, although that makes
  * serializing into the mbuf chains a bit harder. For now we do two walks
  * which makes things easier on us, and it may or may not be harder on
  * the L2 cache.
  *
  * If successful the size of all data appended to the queue is returned,
  * otherwise an error code less than zero is returned, or zero if
  * no record(s) were appended.
  */
 static int
 mld_v2_enqueue_filter_change(struct mbufq *mq, struct in6_multi *inm)
 {
 	/* Smallest useful record: a header plus one source address. */
 	static const int MINRECLEN =
 	    sizeof(struct mldv2_record) + sizeof(struct in6_addr);
 	struct ifnet		*ifp;
 	struct mldv2_record	 mr;
 	struct mldv2_record	*pmr;
 	struct ip6_msource	*ims, *nims;
 	struct mbuf		*m, *m0, *md;
 	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
 	uint8_t			 mode, now, then;
 	rectype_t		 crt, drt, nrt;
 #ifdef KTR
 	int			 nallow, nblock;
 	char			 ip6tbuf[INET6_ADDRSTRLEN];
 #endif
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 
 	/*
 	 * Nothing to report when the group has no sources, or when it had
 	 * ASM listeners at both t0 and t1.
 	 */
 	if (inm->in6m_nsrc == 0 ||
 	    (inm->in6m_st[0].iss_asm > 0 && inm->in6m_st[1].iss_asm > 0))
 		return (0);
 
 	ifp = inm->in6m_ifp;			/* interface */
 	mode = inm->in6m_st[1].iss_fmode;	/* filter mode at t1 */
 	crt = REC_NONE;	/* current group record type */
 	drt = REC_NONE;	/* mask of completed group record types */
 	nrt = REC_NONE;	/* record type for current node */
 	m0srcs = 0;	/* # source which will fit in current mbuf chain */
 	npbytes = 0;	/* # of bytes appended this packet */
 	nbytes = 0;	/* # of bytes appended to group's state-change queue */
 	rsrcs = 0;	/* # sources encoded in current record */
 	schanged = 0;	/* # nodes encoded in overall filter change */
 #ifdef KTR
 	nallow = 0;	/* # of source entries in ALLOW_NEW */
 	nblock = 0;	/* # of source entries in BLOCK_OLD */
 #endif
 	nims = NULL;	/* next tree node pointer */
 
 	/*
 	 * For each possible filter record mode.
 	 * The first kind of source we encounter tells us which
 	 * is the first kind of record we start appending.
 	 * If a node transitioned to UNDEFINED at t1, its mode is treated
 	 * as the inverse of the group's filter mode.
 	 */
 	while (drt != REC_FULL) {
 		/*
 		 * One iteration of this inner loop emits (at most) one group
 		 * record; it repeats while the tree walk was cut short
 		 * (nims != NULL) because the current packet filled up.
 		 */
 		do {
 			m0 = mbufq_last(mq);
 			if (m0 != NULL &&
 			    (m0->m_pkthdr.vt_nrecs + 1 <=
 			     MLD_V2_REPORT_MAXRECS) &&
 			    (m0->m_pkthdr.len + MINRECLEN) <
 			     (ifp->if_mtu - MLD_MTUSPACE)) {
 				m = m0;
 				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
 					    sizeof(struct mldv2_record)) /
 					    sizeof(struct in6_addr);
 				CTR1(KTR_MLD,
 				    "%s: use previous packet", __func__);
 			} else {
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 				if (m == NULL)
 					m = m_gethdr(M_NOWAIT, MT_DATA);
 				if (m == NULL) {
 					CTR1(KTR_MLD,
 					    "%s: m_get*() failed", __func__);
 					return (-ENOMEM);
 				}
 				m->m_pkthdr.vt_nrecs = 0;
 				mld_save_context(m, ifp);
 				m0srcs = (ifp->if_mtu - MLD_MTUSPACE -
 				    sizeof(struct mldv2_record)) /
 				    sizeof(struct in6_addr);
 				npbytes = 0;
 				CTR1(KTR_MLD,
 				    "%s: allocated new packet", __func__);
 			}
 			/*
 			 * Append the MLD group record header to the
 			 * current packet's data area.
 			 * Recalculate pointer to free space for next
 			 * group record, in case m_append() allocated
 			 * a new mbuf or cluster.
 			 */
 			memset(&mr, 0, sizeof(mr));
 			mr.mr_addr = inm->in6m_addr;
 			in6_clearscope(&mr.mr_addr);
 			if (!m_append(m, sizeof(mr), (void *)&mr)) {
 				/* m0 still belongs to the queue; free only ours. */
 				if (m != m0)
 					m_freem(m);
 				CTR1(KTR_MLD,
 				    "%s: m_append() failed", __func__);
 				return (-ENOMEM);
 			}
 			npbytes += sizeof(struct mldv2_record);
 			if (m != m0) {
 				/* new packet; offset in chain */
 				md = m_getptr(m, npbytes -
 				    sizeof(struct mldv2_record), &off);
 				pmr = (struct mldv2_record *)(mtod(md,
 				    uint8_t *) + off);
 			} else {
 				/* current packet; offset from last append */
 				md = m_last(m);
 				pmr = (struct mldv2_record *)(mtod(md,
 				    uint8_t *) + md->m_len -
 				    sizeof(struct mldv2_record));
 			}
 			/*
 			 * Begin walking the tree for this record type
 			 * pass, or continue from where we left off
 			 * previously if we had to allocate a new packet.
 			 * Only report deltas in-mode at t1.
 			 * We need not report included sources as allowed
 			 * if we are in inclusive mode on the group,
 			 * however the converse is not true.
 			 */
 			rsrcs = 0;
 			if (nims == NULL) {
 				nims = RB_MIN(ip6_msource_tree,
 				    &inm->in6m_srcs);
 			}
 			RB_FOREACH_FROM(ims, ip6_msource_tree, nims) {
 				CTR2(KTR_MLD, "%s: visit node %s", __func__,
 				    ip6_sprintf(ip6tbuf, &ims->im6s_addr));
 				now = im6s_get_mode(inm, ims, 1);
 				then = im6s_get_mode(inm, ims, 0);
 				CTR3(KTR_MLD, "%s: mode: t0 %d, t1 %d",
 				    __func__, then, now);
 				if (now == then) {
 					CTR1(KTR_MLD,
 					    "%s: skip unchanged", __func__);
 					continue;
 				}
 				if (mode == MCAST_EXCLUDE &&
 				    now == MCAST_INCLUDE) {
 					CTR1(KTR_MLD,
 					    "%s: skip IN src on EX group",
 					    __func__);
 					continue;
 				}
 				/*
 				 * Map the node's t1 mode onto a record type;
 				 * UNDEFINED becomes the inverse of the group
 				 * filter mode.
 				 */
 				nrt = (rectype_t)now;
 				if (nrt == REC_NONE)
 					nrt = (rectype_t)(~mode & REC_FULL);
 				/*
 				 * The very first changed node fixes the record
 				 * type of the first pass; nodes of the other
 				 * type are left for the second pass.
 				 */
 				if (schanged++ == 0) {
 					crt = nrt;
 				} else if (crt != nrt)
 					continue;
 				if (!m_append(m, sizeof(struct in6_addr),
 				    (void *)&ims->im6s_addr)) {
 					if (m != m0)
 						m_freem(m);
 					CTR1(KTR_MLD,
 					    "%s: m_append() failed", __func__);
 					return (-ENOMEM);
 				}
 #ifdef KTR
 				nallow += !!(crt == REC_ALLOW);
 				nblock += !!(crt == REC_BLOCK);
 #endif
 				/* Packet is full; nims keeps the resume point. */
 				if (++rsrcs == m0srcs)
 					break;
 			}
 			/*
 			 * If we did not append any tree nodes on this
 			 * pass, back out of allocations.
 			 */
 			if (rsrcs == 0) {
 				npbytes -= sizeof(struct mldv2_record);
 				if (m != m0) {
 					CTR1(KTR_MLD,
 					    "%s: m_free(m)", __func__);
 					m_freem(m);
 				} else {
 					/* Trim the unused header off the tail. */
 					CTR1(KTR_MLD,
 					    "%s: m_adj(m, -mr)", __func__);
 					m_adj(m, -((int)sizeof(
 					    struct mldv2_record)));
 				}
 				/*
 				 * In a do/while, continue jumps to the loop
 				 * condition (nims != NULL) below.
 				 */
 				continue;
 			}
 			npbytes += (rsrcs * sizeof(struct in6_addr));
 			if (crt == REC_ALLOW)
 				pmr->mr_type = MLD_ALLOW_NEW_SOURCES;
 			else if (crt == REC_BLOCK)
 				pmr->mr_type = MLD_BLOCK_OLD_SOURCES;
 			pmr->mr_numsrc = htons(rsrcs);
 			/*
 			 * Count the new group record, and enqueue this
 			 * packet if it wasn't already queued.
 			 */
 			m->m_pkthdr.vt_nrecs++;
 			if (m != m0)
 				mbufq_enqueue(mq, m);
 			/*
 			 * NOTE(review): npbytes is only reset when a new
 			 * packet is allocated, so it may carry over when the
 			 * tail packet is reused -- confirm the running total
 			 * is intended.
 			 */
 			nbytes += npbytes;
 		} while (nims != NULL);
 		/* Mark this record type done and switch to the other one. */
 		drt |= crt;
 		crt = (~crt & REC_FULL);
 	}
 
 	CTR3(KTR_MLD, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
 	    nallow, nblock);
 
 	return (nbytes);
 }
 
 /*
  * Move or copy the pending state-change messages queued on the group
  * (inm->in6m_scq) onto the queue scq, merging each message into the
  * tail packet of scq when the record count and MTU limits allow.
  *
  * When the group still has retransmissions pending (in6m_scrv > 0) the
  * messages are duplicated and left on the group's queue; otherwise they
  * are dequeued.
  *
  * Returns 0 on success, or ENOMEM if a message could not be duplicated.
  */
 static int
 mld_v2_merge_state_changes(struct in6_multi *inm, struct mbufq *scq)
 {
 	struct mbufq	*gq;
 	struct mbuf	*m;		/* pending state-change */
 	struct mbuf	*m0;		/* copy of pending state-change */
 	struct mbuf	*mt;		/* last state-change in packet */
 	int		 docopy, domerge;
 	u_int		 recslen;
 
 	docopy = 0;
 	domerge = 0;
 	recslen = 0;
 
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	/*
 	 * If there are further pending retransmissions, make a writable
 	 * copy of each queued state-change message before merging.
 	 */
 	if (inm->in6m_scrv > 0)
 		docopy = 1;
 
 	gq = &inm->in6m_scq;
 #ifdef KTR
 	if (mbufq_first(gq) == NULL) {
 		CTR2(KTR_MLD, "%s: WARNING: queue for inm %p is empty",
 		    __func__, inm);
 	}
 #endif
 
 	m = mbufq_first(gq);
 	while (m != NULL) {
 		/*
 		 * Only merge the report into the current packet if
 		 * there is sufficient space to do so; an MLDv2 report
 		 * packet may only contain 65,535 group records.
 		 * Always use a simple mbuf chain concatenation to do this,
 		 * as large state changes for single groups may have
 		 * allocated clusters.
 		 */
 		domerge = 0;
 		mt = mbufq_last(scq);
 		if (mt != NULL) {
 			recslen = m_length(m, NULL);
 
 			if ((mt->m_pkthdr.vt_nrecs +
 			    m->m_pkthdr.vt_nrecs <=
 			    MLD_V2_REPORT_MAXRECS) &&
 			    (mt->m_pkthdr.len + recslen <=
 			    (inm->in6m_ifp->if_mtu - MLD_MTUSPACE)))
 				domerge = 1;
 		}
 
 		/*
 		 * NOTE(review): the fullness test is made on the group's own
 		 * queue (gq) although the message would be added to scq, and
 		 * the !docopy path frees m without dequeuing it from gq --
 		 * confirm both are intended.
 		 */
 		if (!domerge && mbufq_full(gq)) {
 			CTR2(KTR_MLD,
 			    "%s: outbound queue full, skipping whole packet %p",
 			    __func__, m);
 			mt = m->m_nextpkt;
 			if (!docopy)
 				m_freem(m);
 			m = mt;
 			continue;
 		}
 
 		if (!docopy) {
 			/* No retransmissions left: take the original. */
 			CTR2(KTR_MLD, "%s: dequeueing %p", __func__, m);
 			m0 = mbufq_dequeue(gq);
 			m = m0->m_nextpkt;
 		} else {
 			/* Keep the original queued; work on a duplicate. */
 			CTR2(KTR_MLD, "%s: copying %p", __func__, m);
 			m0 = m_dup(m, M_NOWAIT);
 			if (m0 == NULL)
 				return (ENOMEM);
 			m0->m_nextpkt = NULL;
 			m = m->m_nextpkt;
 		}
 
 		if (!domerge) {
 			CTR3(KTR_MLD, "%s: queueing %p to scq %p)",
 			    __func__, m0, scq);
 			mbufq_enqueue(scq, m0);
 		} else {
 			struct mbuf *mtl;	/* last mbuf of packet mt */
 
 			CTR3(KTR_MLD, "%s: merging %p with ifscq tail %p)",
 			    __func__, m0, mt);
 
 			/*
 			 * m0 becomes an interior mbuf of mt's chain, so it
 			 * gives up its packet-header flag and mt's header
 			 * absorbs its length and record count.  The record
 			 * count is read from m0's header after the flag is
 			 * cleared, which assumes the header storage is still
 			 * intact at that point.
 			 */
 			mtl = m_last(mt);
 			m0->m_flags &= ~M_PKTHDR;
 			mt->m_pkthdr.len += recslen;
 			mt->m_pkthdr.vt_nrecs +=
 			    m0->m_pkthdr.vt_nrecs;
 
 			mtl->m_next = m0;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Respond to a pending MLDv2 General Query.
  */
 static void
 mld_v2_dispatch_general_query(struct mld_ifsoftc *mli)
 {
 	struct ifmultiaddr	*ifma;
 	struct ifnet		*ifp;
 	struct in6_multi	*inm;
 	int			 retval __unused;
 
 	NET_EPOCH_ASSERT();
 	IN6_MULTI_LIST_LOCK_ASSERT();
 	MLD_LOCK_ASSERT();
 
 	KASSERT(mli->mli_version == MLD_VERSION_2,
 	    ("%s: called when version %d", __func__, mli->mli_version));
 
 	/*
 	 * Check that there are some packets queued. If so, send them first.
 	 * For large number of groups the reply to general query can take
 	 * many packets, we should finish sending them before starting of
 	 * queuing the new reply.
 	 */
 	if (mbufq_len(&mli->mli_gq) != 0)
 		goto send;
 
 	ifp = mli->mli_ifp;
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		inm = in6m_ifmultiaddr_get_inm(ifma);
 		if (inm == NULL)
 			continue;
 		KASSERT(ifp == inm->in6m_ifp,
 		    ("%s: inconsistent ifp", __func__));
 
 		switch (inm->in6m_state) {
 		case MLD_NOT_MEMBER:
 		case MLD_SILENT_MEMBER:
 			break;
 		case MLD_REPORTING_MEMBER:
 		case MLD_IDLE_MEMBER:
 		case MLD_LAZY_MEMBER:
 		case MLD_SLEEPING_MEMBER:
 		case MLD_AWAKENING_MEMBER:
 			inm->in6m_state = MLD_REPORTING_MEMBER;
 			retval = mld_v2_enqueue_group_record(&mli->mli_gq,
 			    inm, 0, 0, 0, 0);
 			CTR2(KTR_MLD, "%s: enqueue record = %d",
 			    __func__, retval);
 			break;
 		case MLD_G_QUERY_PENDING_MEMBER:
 		case MLD_SG_QUERY_PENDING_MEMBER:
 		case MLD_LEAVING_MEMBER:
 			break;
 		}
 	}
 
 send:
 	mld_dispatch_queue(&mli->mli_gq, MLD_MAX_RESPONSE_BURST);
 
 	/*
 	 * Slew transmission of bursts over 500ms intervals.
 	 */
 	if (mbufq_first(&mli->mli_gq) != NULL) {
 		mli->mli_v2_timer = 1 + MLD_RANDOM_DELAY(
 		    MLD_RESPONSE_BURST_INTERVAL);
 		V_interface_timers_running6 = 1;
 	}
 }
 
 /*
  * Transmit the next pending message in the output queue.
  *
  * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
  * MRT: Nothing needs to be done, as MLD traffic is always local to
  * a link and uses a link-scope multicast address.
  */
 static void
 mld_dispatch_packet(struct mbuf *m)
 {
 	struct ip6_moptions	 im6o;
 	struct ifnet		*ifp;
 	struct ifnet		*oifp;
 	struct mbuf		*m0;
 	struct mbuf		*md;
 	struct ip6_hdr		*ip6;
 	struct mld_hdr		*mld;
 	int			 error;
 	int			 off;
 	int			 type;
 	uint32_t		 ifindex;
 
 	CTR2(KTR_MLD, "%s: transmit %p", __func__, m);
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * Recover the interface index stashed in the queued chain before
 	 * touching anything else.  Interface indexes guard against
 	 * interface detach, but they are unique to each VIMAGE and so
 	 * must be retrieved from the chain itself.
 	 */
 	ifindex = mld_restore_context(m);
 
 	/*
 	 * Make sure the ifnet is still around.  In the absence of a
 	 * global ifp lock this cheap lookup narrows the race window.
 	 */
 	ifp = ifnet_byindex(ifindex);
 	if (ifp == NULL) {
 		CTR3(KTR_MLD, "%s: dropped %p as ifindex %u went away.",
 		    __func__, m, ifindex);
 		m_freem(m);
 		IP6STAT_INC(ip6s_noroute);
 		return;
 	}
 
 	im6o.im6o_multicast_hlim = 1;
 	im6o.im6o_multicast_loop = (V_ip6_mrouter != NULL);
 	im6o.im6o_multicast_ifp = ifp;
 
 	/* MLDv1 messages go out as queued; MLDv2 reports get wrapped. */
 	m0 = m;
 	if ((m->m_flags & M_MLDV1) == 0) {
 		m0 = mld_v2_encap_report(ifp, m);
 		if (m0 == NULL) {
 			CTR2(KTR_MLD, "%s: dropped %p", __func__, m);
 			IP6STAT_INC(ip6s_odropped);
 			return;
 		}
 	}
 
 	mld_scrub_context(m0);
 	m_clrprotoflags(m);
 	m0->m_pkthdr.rcvif = V_loif;
 
 	ip6 = mtod(m0, struct ip6_hdr *);
 #if 0
 	(void)in6_setscope(&ip6->ip6_dst, ifp, NULL);	/* XXX LOR */
 #else
 	/*
 	 * XXX XXX Break some KPI rules to prevent an LOR which would
 	 * occur if we called in6_setscope() at transmission.
 	 * See comments at top of file.
 	 */
 	MLD_EMBEDSCOPE(&ip6->ip6_dst, ifp->if_index);
 #endif
 
 	/*
 	 * Read the ICMPv6 type now; the chain is handed to ip6_output()
 	 * below and the type is needed afterwards for the statistics.
 	 */
 	md = m_getptr(m0, sizeof(struct ip6_hdr), &off);
 	mld = (struct mld_hdr *)(mtod(md, uint8_t *) + off);
 	type = mld->mld_type;
 
 	oifp = NULL;
 	error = ip6_output(m0, &mld_po, NULL, IPV6_UNSPECSRC, &im6o,
 	    &oifp, NULL);
 	if (error) {
 		CTR3(KTR_MLD, "%s: ip6_output(%p) = %d", __func__, m0, error);
 		return;
 	}
 
 	ICMP6STAT_INC(icp6s_outhist[type]);
 	if (oifp == NULL)
 		return;
 
 	/* Per-interface output counters. */
 	icmp6_ifstat_inc(oifp, ifs6_out_msg);
 	if (type == MLD_LISTENER_REPORT || type == MLDV2_LISTENER_REPORT)
 		icmp6_ifstat_inc(oifp, ifs6_out_mldreport);
 	else if (type == MLD_LISTENER_DONE)
 		icmp6_ifstat_inc(oifp, ifs6_out_mlddone);
 }
 
 /*
  * Encapsulate an MLDv2 report.
  *
  * KAME IPv6 requires that hop-by-hop options be passed separately,
  * and that the IPv6 header be prepended in a separate mbuf.
  *
  * Returns a pointer to the new mbuf chain head, or NULL if the
  * allocation failed.
  */
 static struct mbuf *
 mld_v2_encap_report(struct ifnet *ifp, struct mbuf *m)
 {
 	struct mbuf		*mh;
 	struct mldv2_report	*mld;
 	struct ip6_hdr		*ip6;
 	struct in6_ifaddr	*ia;
 	int			 mldreclen;
 
 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
 	KASSERT((m->m_flags & M_PKTHDR),
 	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
 
 	/*
 	 * RFC3590: OK to send as :: or tentative during DAD.
 	 */
 	NET_EPOCH_ASSERT();
 	ia = in6ifa_ifpforlinklocal(ifp, IN6_IFF_NOTREADY|IN6_IFF_ANYCAST);
 	if (ia == NULL)
 		CTR1(KTR_MLD, "%s: warning: ia is NULL", __func__);
 
 	mh = m_gethdr(M_NOWAIT, MT_DATA);
 	if (mh == NULL) {
 		if (ia != NULL)
 			ifa_free(&ia->ia_ifa);
 		m_freem(m);
 		return (NULL);
 	}
 	M_ALIGN(mh, sizeof(struct ip6_hdr) + sizeof(struct mldv2_report));
 
 	mldreclen = m_length(m, NULL);
 	CTR2(KTR_MLD, "%s: mldreclen is %d", __func__, mldreclen);
 
 	mh->m_len = sizeof(struct ip6_hdr) + sizeof(struct mldv2_report);
 	mh->m_pkthdr.len = sizeof(struct ip6_hdr) +
 	    sizeof(struct mldv2_report) + mldreclen;
 
 	ip6 = mtod(mh, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
 	ip6->ip6_dst = in6addr_linklocal_allv2routers;
 	/* scope ID will be set in netisr */
 
 	mld = (struct mldv2_report *)(ip6 + 1);
 	mld->mld_type = MLDV2_LISTENER_REPORT;
 	mld->mld_code = 0;
 	mld->mld_cksum = 0;
 	mld->mld_v2_reserved = 0;
 	mld->mld_v2_numrecs = htons(m->m_pkthdr.vt_nrecs);
 	m->m_pkthdr.vt_nrecs = 0;
 
 	mh->m_next = m;
 	mld->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), sizeof(struct mldv2_report) + mldreclen);
 	return (mh);
 }
 
 #ifdef KTR
 /*
  * Map an MLDv2 group record type to a short name for trace output.
  * Unrecognised values yield "unknown".
  */
 static char *
 mld_rec_type_to_str(const int type)
 {
 
 	switch (type) {
 	case MLD_CHANGE_TO_EXCLUDE_MODE:
 		return "TO_EX";
 	case MLD_CHANGE_TO_INCLUDE_MODE:
 		return "TO_IN";
 	case MLD_MODE_IS_EXCLUDE:
 		return "MODE_EX";
 	case MLD_MODE_IS_INCLUDE:
 		return "MODE_IN";
 	case MLD_ALLOW_NEW_SOURCES:
 		return "ALLOW_NEW";
 	case MLD_BLOCK_OLD_SOURCES:
 		return "BLOCK_OLD";
 	default:
 		break;
 	}
 	return "unknown";
 }
 #endif
 
 /*
  * One-time MLD initialisation, registered via SYSINIT below.
  * Sets up the MLD lock, the packet options (mld_po) passed to
  * ip6_output() for outgoing MLD messages, and starts the slow and fast
  * timeout callouts.
  */
 static void
 mld_init(void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: initializing", __func__);
 	MLD_LOCK_INIT();
 
 	/*
 	 * Packet options: hop limit 1, the hop-by-hop header held in
 	 * mld_ra (presumably the Router Alert option -- confirm against
 	 * its definition), no temporary source address, no fragmentation.
 	 */
 	ip6_initpktopts(&mld_po);
 	mld_po.ip6po_hlim = 1;
 	mld_po.ip6po_hbh = &mld_ra.hbh;
 	mld_po.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
 	mld_po.ip6po_flags = IP6PO_DONTFRAG;
 
 	/* Schedule the first run of each timer handler. */
 	callout_init(&mldslow_callout, 1);
 	callout_reset(&mldslow_callout, hz / MLD_SLOWHZ, mld_slowtimo, NULL);
 	callout_init(&mldfast_callout, 1);
 	callout_reset(&mldfast_callout, hz / MLD_FASTHZ, mld_fasttimo, NULL);
 }
 SYSINIT(mld_init, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_init, NULL);
 
 /*
  * Counterpart of mld_init(), registered via SYSUNINIT below: drains
  * both timer callouts and then destroys the MLD lock.
  */
 static void
 mld_uninit(void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: tearing down", __func__);
 	/* Drain the callouts first; the lock is destroyed only afterwards. */
 	callout_drain(&mldslow_callout);
 	callout_drain(&mldfast_callout);
 	MLD_LOCK_DESTROY();
 }
 SYSUNINIT(mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE, mld_uninit, NULL);
 
 /*
  * Per-vnet initialisation, registered via VNET_SYSINIT below:
  * initialises the list head V_mli_head to empty.
  */
 static void
 vnet_mld_init(const void *unused __unused)
 {
 
 	CTR1(KTR_MLD, "%s: initializing", __func__);
 
 	LIST_INIT(&V_mli_head);
 }
 VNET_SYSINIT(vnet_mld_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_init,
     NULL);
 
 /*
  * Per-vnet teardown hook, registered via VNET_SYSUNINIT below.
  * It only emits a trace record; no state is released here.
  */
 static void
 vnet_mld_uninit(const void *unused __unused)
 {
 
 	/* This can happen if we shutdown the network stack. */
 	CTR1(KTR_MLD, "%s: tearing down", __func__);
 }
 VNET_SYSUNINIT(vnet_mld_uninit, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mld_uninit,
     NULL);
 
 /*
  * Module event handler for the "mld" module.
  * Load and unload are accepted without doing any work; every other
  * event is rejected with EOPNOTSUPP.
  */
 static int
 mld_modevent(module_t mod, int type, void *unused __unused)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module description: name "mld", event handler mld_modevent, and 0 as
  * the third initialiser (presumably the handler's extra argument --
  * confirm against the moduledata_t definition).
  */
 static moduledata_t mld_mod = {
     "mld",
     mld_modevent,
     0
 };
 DECLARE_MODULE(mld, mld_mod, SI_SUB_PROTO_MC, SI_ORDER_ANY);
diff --git a/sys/netinet6/nd6.c b/sys/netinet6/nd6.c
index de35127bd17d..082266c84294 100644
--- a/sys/netinet6/nd6.c
+++ b/sys/netinet6/nd6.c
@@ -1,2720 +1,2721 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6.c,v 1.144 2001/05/24 07:44:00 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/callout.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/protosw.h>
 #include <sys/errno.h>
 #include <sys/syslog.h>
 #include <sys/rwlock.h>
 #include <sys/queue.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <net/if_llatbl.h>
 #include <netinet/if_ether.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/icmp6.h>
 #include <netinet6/send.h>
 
 #include <sys/limits.h>
 
 #include <security/mac/mac_framework.h>
 
 /* Intervals in seconds; nd6_init() multiplies the slow timer one by hz. */
 #define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
 #define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
 
 /* Cast a generic sockaddr pointer to a const sockaddr_in6 pointer. */
 #define SIN6(s) ((const struct sockaddr_in6 *)(s))
 
 MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery");
 
 /* timer values */
 VNET_DEFINE(int, nd6_prune)	= 1;	/* walk list every 1 seconds */
 VNET_DEFINE(int, nd6_delay)	= 5;	/* delay first probe time 5 second */
 VNET_DEFINE(int, nd6_umaxtries)	= 3;	/* maximum unicast query */
 VNET_DEFINE(int, nd6_mmaxtries)	= 3;	/* maximum multicast query */
 VNET_DEFINE(int, nd6_useloopback) = 1;	/* use loopback interface for
 					 * local traffic */
 VNET_DEFINE(int, nd6_gctimer)	= (60 * 60 * 24); /* 1 day: garbage
 					 * collection timer */
 
 /* preventing too many loops in ND option parsing */
 VNET_DEFINE_STATIC(int, nd6_maxndopt) = 10; /* max # of ND options allowed */
 
 VNET_DEFINE(int, nd6_maxnudhint) = 0;	/* max # of subsequent upper
 					 * layer hints */
 VNET_DEFINE_STATIC(int, nd6_maxqueuelen) = 16; /* max pkts cached in unresolved
 					 * ND entries */
 #define	V_nd6_maxndopt			VNET(nd6_maxndopt)
 #define	V_nd6_maxqueuelen		VNET(nd6_maxqueuelen)
 
 /* Debug logging defaults to on only when built with ND6_DEBUG. */
 #ifdef ND6_DEBUG
 VNET_DEFINE(int, nd6_debug) = 1;
 #else
 VNET_DEFINE(int, nd6_debug) = 0;
 #endif
 
 /*
  * Event handler tags: registered in nd6_init() and deregistered in
  * nd6_destroy(), in both cases only for the default vnet.
  */
 static eventhandler_tag lle_event_eh, iflladdr_event_eh, ifnet_link_event_eh;
 
 /* Per-vnet prefix list and the locks initialised in nd6_init(). */
 VNET_DEFINE(struct nd_prhead, nd_prefix);
 VNET_DEFINE(struct rwlock, nd6_lock);
 VNET_DEFINE(uint64_t, nd6_list_genid);
 VNET_DEFINE(struct mtx, nd6_onlink_mtx);
 
 VNET_DEFINE(int, nd6_recalc_reachtm_interval) = ND6_RECALC_REACHTM_INTERVAL;
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 /* NOTE(review): presumably installed by the SEND module - confirm. */
 int	(*send_sendso_input_hook)(struct mbuf *, struct ifnet *, int, int);
 
 /* Forward declarations of file-local helpers. */
 static bool nd6_is_new_addr_neighbor(const struct sockaddr_in6 *,
 	struct ifnet *);
 static void nd6_setmtu0(struct ifnet *, struct nd_ifinfo *);
 static void nd6_slowtimo(void *);
 static int regen_tmpaddr(struct in6_ifaddr *);
 static void nd6_free(struct llentry **, int);
 static void nd6_free_redirect(const struct llentry *);
 static void nd6_llinfo_timer(void *);
 static void nd6_llinfo_settimer_locked(struct llentry *, long);
 static int nd6_resolve_slow(struct ifnet *, int, int, struct mbuf *,
     const struct sockaddr_in6 *, u_char *, uint32_t *, struct llentry **);
 static int nd6_need_cache(struct ifnet *);
 
 /* Per-vnet callouts, armed in nd6_init(). */
 VNET_DEFINE_STATIC(struct callout, nd6_slowtimo_ch);
 #define	V_nd6_slowtimo_ch		VNET(nd6_slowtimo_ch)
 
 VNET_DEFINE_STATIC(struct callout, nd6_timer_ch);
 #define	V_nd6_timer_ch			VNET(nd6_timer_ch)
 
 SYSCTL_DECL(_net_inet6_icmp6);
 
 /*
  * lle_event handler: report an IPv6 neighbour cache change through
  * rt_missmsg_fib().
  *
  * LLENTRY_RESOLVED is reported as RTM_ADD (with RTF_UP and the resolved
  * link-layer address), LLENTRY_EXPIRED as RTM_DELETE.  Entries of other
  * address families and all other events are ignored.  The entry must be
  * write-locked by the caller.
  */
 static void
 nd6_lle_event(void *arg __unused, struct llentry *lle, int evt)
 {
 	struct rt_addrinfo info;
 	struct sockaddr_in6 sin6;
 	struct sockaddr_dl sdl;
 	struct ifnet *ifp;
 	int fib, rtflags, rtm;
 
 	LLE_WLOCK_ASSERT(lle);
 
 	if (lltable_get_af(lle->lle_tbl) != AF_INET6)
 		return;
 
 	if (evt == LLENTRY_RESOLVED) {
 		KASSERT(lle->la_flags & LLE_VALID,
 		    ("%s: %p resolved but not valid?", __func__, lle));
 		rtm = RTM_ADD;
 	} else if (evt == LLENTRY_EXPIRED) {
 		rtm = RTM_DELETE;
 	} else {
 		return;
 	}
 
 	ifp = lltable_get_ifp(lle->lle_tbl);
 
 	bzero(&sin6, sizeof(sin6));
 	bzero(&sdl, sizeof(sdl));
 	bzero(&info, sizeof(info));
 
 	/* Destination: the neighbour's address with its scope zone filled in. */
 	lltable_fill_sa_entry(lle, (struct sockaddr *)&sin6);
 	sin6.sin6_scope_id = in6_getscopezone(ifp,
 	    in6_addrscope(&sin6.sin6_addr));
 
 	/* Gateway: link-level sockaddr; the address is copied only on resolve. */
 	sdl.sdl_len = sizeof(struct sockaddr_dl);
 	sdl.sdl_family = AF_LINK;
 	sdl.sdl_alen = ifp->if_addrlen;
 	sdl.sdl_index = ifp->if_index;
 	sdl.sdl_type = ifp->if_type;
 	if (evt == LLENTRY_RESOLVED)
 		bcopy(lle->ll_addr, sdl.sdl_data, ifp->if_addrlen);
 
 	info.rti_info[RTAX_DST] = (struct sockaddr *)&sin6;
 	info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&sdl;
 	info.rti_addrs = RTA_DST | RTA_GATEWAY;
 
 	if (V_rt_add_addr_allfibs)
 		fib = RT_ALL_FIBS;
 	else
 		fib = ifp->if_fib;
 
 	rtflags = RTF_HOST | RTF_LLDATA;
 	if (rtm == RTM_ADD)
 		rtflags |= RTF_UP;
 	rt_missmsg_fib(rtm, &info, rtflags, 0, fib);
 }
 
 /*
  * iflladdr_event handler: the interface's link-layer address changed.
  * Interfaces without AF_INET6 per-interface data are left alone; for the
  * others the IPv6 lltable is told to refresh its interface addresses.
  */
 static void
 nd6_iflladdr(void *arg __unused, struct ifnet *ifp)
 {
 	if (ifp->if_afdata[AF_INET6] != NULL)
 		lltable_update_ifaddr(LLTABLE6(ifp));
 }
 
 /*
  * Initialise neighbour discovery state for the current vnet: locks,
  * the prefix list, the default router code, the two periodic callouts
  * and DAD.  The global event handlers are registered only once, when
  * running for the default vnet.
  */
 void
 nd6_init(void)
 {
 
 	mtx_init(&V_nd6_onlink_mtx, "nd6 onlink", NULL, MTX_DEF);
 	rw_init(&V_nd6_lock, "nd6 list");
 
 	LIST_INIT(&V_nd_prefix);
 	nd6_defrouter_init();
 
 	/* Start timers. */
 	callout_init(&V_nd6_slowtimo_ch, 1);
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 
 	/* nd6_timer first fires after hz ticks; curvnet is its argument. */
 	callout_init(&V_nd6_timer_ch, 1);
 	callout_reset(&V_nd6_timer_ch, hz, nd6_timer, curvnet);
 
 	nd6_dad_init();
 	/* The handler tags are file-scope statics, so register them once. */
 	if (IS_DEFAULT_VNET(curvnet)) {
 		lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event,
 		    NULL, EVENTHANDLER_PRI_ANY);
 		iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event,
 		    nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY);
 		ifnet_link_event_eh = EVENTHANDLER_REGISTER(ifnet_link_event,
 		    nd6_ifnet_link_event, NULL, EVENTHANDLER_PRI_ANY);
 	}
 }
 
 #ifdef VIMAGE
 /*
  * Undo nd6_init() for the current vnet (VIMAGE kernels only): drain the
  * callouts, deregister the global event handlers when tearing down the
  * default vnet, then destroy the locks.
  */
 void
 nd6_destroy(void)
 {
 
 	/* Timers are drained first, before the locks are destroyed below. */
 	callout_drain(&V_nd6_slowtimo_ch);
 	callout_drain(&V_nd6_timer_ch);
 	if (IS_DEFAULT_VNET(curvnet)) {
 		EVENTHANDLER_DEREGISTER(ifnet_link_event, ifnet_link_event_eh);
 		EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh);
 		EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh);
 	}
 	rw_destroy(&V_nd6_lock);
 	mtx_destroy(&V_nd6_onlink_mtx);
 }
 #endif
 
 struct nd_ifinfo *
 nd6_ifattach(struct ifnet *ifp)
 {
 	struct nd_ifinfo *nd;
 
 	nd = malloc(sizeof(*nd), M_IP6NDP, M_WAITOK | M_ZERO);
 	nd->initialized = 1;
 
 	nd->chlim = IPV6_DEFHLIM;
 	nd->basereachable = REACHABLE_TIME;
 	nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
 	nd->retrans = RETRANS_TIMER;
 
 	nd->flags = ND6_IFF_PERFORMNUD;
 
 	/* Set IPv6 disabled on all interfaces but loopback by default. */
 	if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 		nd->flags |= ND6_IFF_IFDISABLED;
 
 	/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
 	 * XXXHRS: Clear ND6_IFF_AUTO_LINKLOCAL on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_auto_linklocal configuration to
 	 * give a reasonable default behavior.
 	 */
 	if ((V_ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE &&
 	    ifp->if_type != IFT_WIREGUARD) || (ifp->if_flags & IFF_LOOPBACK))
 		nd->flags |= ND6_IFF_AUTO_LINKLOCAL;
 	/*
 	 * A loopback interface does not need to accept RTADV.
 	 * XXXHRS: Clear ND6_IFF_ACCEPT_RTADV on an IFT_BRIDGE interface by
 	 * default regardless of the V_ip6_accept_rtadv configuration to
 	 * prevent the interface from accepting RA messages arrived
 	 * on one of the member interfaces with ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (V_ip6_accept_rtadv &&
 	    !(ifp->if_flags & IFF_LOOPBACK) &&
 	    (ifp->if_type != IFT_BRIDGE)) {
 			nd->flags |= ND6_IFF_ACCEPT_RTADV;
 			/* If we globally accept rtadv, assume IPv6 on. */
 			nd->flags &= ~ND6_IFF_IFDISABLED;
 	}
 	if (V_ip6_no_radr && !(ifp->if_flags & IFF_LOOPBACK))
 		nd->flags |= ND6_IFF_NO_RADR;
 
 	/* XXX: we cannot call nd6_setmtu since ifp is not fully initialized */
 	nd6_setmtu0(ifp, nd);
 
 	return nd;
 }
 
 /*
  * Release the ND per-interface state @nd of @ifp.
  * DAD is stopped on every AF_INET6 address of the interface (walked
  * inside a network epoch section) before the structure is freed.
  */
 void
 nd6_ifdetach(struct ifnet *ifp, struct nd_ifinfo *nd)
 {
 	struct epoch_tracker et;
 	struct ifaddr *cur, *tmp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH_SAFE(cur, &ifp->if_addrhead, ifa_link, tmp) {
 		/* stop DAD processing on IPv6 addresses only */
 		if (cur->ifa_addr->sa_family == AF_INET6)
 			nd6_dad_stop(cur);
 	}
 	NET_EPOCH_EXIT(et);
 
 	free(nd, M_IP6NDP);
 }
 
 /*
  * Re-synchronise the ND level link MTU of @ifp after the physical MTU
  * has changed.  Interfaces that carry no AF_INET6 data are ignored.
  */
 void
 nd6_setmtu(struct ifnet *ifp)
 {
 	if (ifp->if_afdata[AF_INET6] != NULL)
 		nd6_setmtu0(ifp, ND_IFINFO(ifp));
 }
 
 /*
  * Copy the current interface MTU of @ifp into @ndi->maxmtu.
  *
  * A notice is logged when the MTU drops from a value that was large
  * enough for IPv6 to one below IPV6_MMTU; the test on the previous value
  * keeps the message away from first-time initialisation.  If the new
  * value exceeds V_in6_maxmtu, in6_setmaxmtu() is called to recompute it.
  *
  * XXX todo: do not maintain copy of ifp->if_mtu in ndi->maxmtu
  */
 void
 nd6_setmtu0(struct ifnet *ifp, struct nd_ifinfo *ndi)
 {
 	u_int32_t prev_mtu;
 
 	prev_mtu = ndi->maxmtu;
 	ndi->maxmtu = ifp->if_mtu;
 
 	if (prev_mtu >= IPV6_MMTU && ndi->maxmtu < IPV6_MMTU)
 		log(LOG_NOTICE, "nd6_setmtu0: "
 		    "new link MTU on %s (%lu) is too small for IPv6\n",
 		    if_name(ifp), (unsigned long)ndi->maxmtu);
 
 	/* check all interfaces just in case */
 	if (ndi->maxmtu > V_in6_maxmtu)
 		in6_setmaxmtu();
 }
 
 /*
  * Prepare @ndopts for walking the ND options stored in the @icmp6len
  * bytes starting at @opt.  With a zero length the walk is marked as
  * already finished and no search pointer is set.
  */
 void
 nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
 {
 
 	bzero(ndopts, sizeof(*ndopts));
 
 	/* One past the final byte of the option area. */
 	ndopts->nd_opts_last =
 	    (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
 
 	if (icmp6len == 0) {
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	} else {
 		ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
 	}
 }
 
 /*
  * Take one ND option.
  */
 struct nd_opt_hdr *
 nd6_option(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int olen;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return NULL;
 	if (ndopts->nd_opts_done)
 		return NULL;
 
 	nd_opt = ndopts->nd_opts_search;
 
 	/* make sure nd_opt_len is inside the buffer */
 	if ((caddr_t)&nd_opt->nd_opt_len >= (caddr_t)ndopts->nd_opts_last) {
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	olen = nd_opt->nd_opt_len << 3;
 	if (olen == 0) {
 		/*
 		 * Message validation requires that all included
 		 * options have a length that is greater than zero.
 		 */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	}
 
 	ndopts->nd_opts_search = (struct nd_opt_hdr *)((caddr_t)nd_opt + olen);
 	if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
 		/* option overruns the end of buffer, invalid */
 		bzero(ndopts, sizeof(*ndopts));
 		return NULL;
 	} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
 		/* reached the end of options chain */
 		ndopts->nd_opts_done = 1;
 		ndopts->nd_opts_search = NULL;
 	}
 	return nd_opt;
 }
 
 /*
  * Parse multiple ND options.
  * This function is much easier to use, for ND routines that do not need
  * multiple options of the same type.
  *
  * Walks the option area described by @ndopts (see nd6_option_init())
  * and records a pointer to each recognised option in nd_opt_array,
  * indexed by option type.
  *
  * Returns 0 on success, including when parsing stops early because more
  * than V_nd6_maxndopt iterations were needed.  Returns -1 when a
  * malformed option is found; *ndopts is zeroed in that case.
  */
 int
 nd6_options(union nd_opts *ndopts)
 {
 	struct nd_opt_hdr *nd_opt;
 	int i = 0;
 
 	KASSERT(ndopts != NULL, ("%s: ndopts == NULL", __func__));
 	KASSERT(ndopts->nd_opts_last != NULL, ("%s: uninitialized ndopts",
 	    __func__));
 	if (ndopts->nd_opts_search == NULL)
 		return 0;
 
 	while (1) {
 		nd_opt = nd6_option(ndopts);
 		/*
 		 * nd6_option() zeroes *ndopts when it rejects an option, so
 		 * a NULL result together with a NULL nd_opts_last means
 		 * "malformed" rather than "no more options".
 		 */
 		if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
 			/*
 			 * Message validation requires that all included
 			 * options have a length that is greater than zero.
 			 */
 			ICMP6STAT_INC(icp6s_nd_badopt);
 			bzero(ndopts, sizeof(*ndopts));
 			return -1;
 		}
 
 		if (nd_opt == NULL)
 			goto skip1;
 
 		switch (nd_opt->nd_opt_type) {
 		/* Options expected at most once: keep the first, log repeats. */
 		case ND_OPT_SOURCE_LINKADDR:
 		case ND_OPT_TARGET_LINKADDR:
 		case ND_OPT_MTU:
 		case ND_OPT_REDIRECTED_HEADER:
 		case ND_OPT_NONCE:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
 				nd6log((LOG_INFO,
 				    "duplicated ND6 option found (type=%d)\n",
 				    nd_opt->nd_opt_type));
 				/* XXX bark? */
 			} else {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type]
 					= nd_opt;
 			}
 			break;
 		/* May repeat: remember the first and the last occurrence. */
 		case ND_OPT_PREFIX_INFORMATION:
 			if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
 				ndopts->nd_opt_array[nd_opt->nd_opt_type]
 					= nd_opt;
 			}
 			ndopts->nd_opts_pi_end =
 				(struct nd_opt_prefix_info *)nd_opt;
 			break;
 		/* What about ND_OPT_ROUTE_INFO? RFC 4191 */
 		case ND_OPT_RDNSS:	/* RFC 6106 */
 		case ND_OPT_DNSSL:	/* RFC 6106 */
 			/*
 			 * Silently ignore options we know and do not care about
 			 * in the kernel.
 			 */
 			break;
 		default:
 			/*
 			 * Unknown options must be silently ignored,
 			 * to accommodate future extension to the protocol.
 			 */
 			nd6log((LOG_DEBUG,
 			    "nd6_options: unsupported option %d - "
 			    "option ignored\n", nd_opt->nd_opt_type));
 		}
 
 skip1:
 		/* Bound the number of iterations spent on a single message. */
 		i++;
 		if (i > V_nd6_maxndopt) {
 			ICMP6STAT_INC(icp6s_nd_toomanyopt);
 			nd6log((LOG_INFO, "too many loop in nd opt\n"));
 			break;
 		}
 
 		if (ndopts->nd_opts_done)
 			break;
 	}
 
 	return 0;
 }
 
 /*
  * (Re)arm or cancel the ND timer of the write-locked entry @ln.
  *
  * tick < 0 cancels the timer.  Otherwise the timer is set to fire after
  * @tick ticks; since a single callout period is limited to INT_MAX
  * ticks, the excess is parked in ln_ntick for nd6_llinfo_timer() to
  * continue with.  The armed callout holds a reference on the entry.  A
  * positive return from callout_stop()/callout_reset() means a previously
  * scheduled callout was cancelled, so the reference it held is dropped.
  *
  * Child entries (LLE_CHILD) never get a timer.
  */
 static void
 nd6_llinfo_settimer_locked(struct llentry *ln, long tick)
 {
 	long first, rest;
 	int cancelled;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	/* Do not schedule timers for child LLEs. */
 	if (ln->la_flags & LLE_CHILD)
 		return;
 
 	if (tick >= 0) {
 		if (tick > INT_MAX) {
 			first = INT_MAX;
 			rest = tick - INT_MAX;
 		} else {
 			first = tick;
 			rest = 0;
 		}
 		ln->la_expire = time_uptime + tick / hz;
 		LLE_ADDREF(ln);
 		ln->ln_ntick = rest;
 		cancelled = callout_reset(&ln->lle_timer, first,
 		    nd6_llinfo_timer, ln);
 	} else {
 		ln->la_expire = 0;
 		ln->ln_ntick = 0;
 		cancelled = callout_stop(&ln->lle_timer);
 	}
 
 	if (cancelled > 0)
 		LLE_REMREF(ln);
 }
 
 /*
  * Gets source address of the first packet in hold queue
  * and stores it in @src.
  * Returns pointer to @src (if hold queue is not empty) or NULL.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline struct in6_addr *
 nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src)
 {
 	struct ip6_hdr hdr;
 	struct mbuf *m;
 
 	if (ln->la_hold == NULL)
 		return (NULL);
 
 	/*
 	 * assume every packet in la_hold has the same IP header
 	 */
 	m = ln->la_hold;
 	if (sizeof(hdr) > m->m_len)
 		return (NULL);
 
 	m_copydata(m, 0, sizeof(hdr), (caddr_t)&hdr);
 	*src = hdr.ip6_src;
 
 	return (src);
 }
 
 /*
  * Checks if we need to switch from STALE state.
  *
  * RFC 4861 requires switching from STALE to DELAY state
  * on first packet matching entry, waiting V_nd6_delay and
  * transition to PROBE state (if upper layer confirmation was
  * not received).
  *
  * This code performs a bit differently:
  * On packet hit we don't change state (but desired state
  * can be guessed by control plane). However, after V_nd6_delay
  * seconds code will transition to PROBE state (so DELAY state
  * is kinda skipped in most situations).
  *
  * Typically, V_nd6_gctimer is bigger than V_nd6_delay, so
  * we perform the following upon entering STALE state:
  *
  * 1) Arm timer to run each V_nd6_delay seconds to make sure that
  * if packet was transmitted at the start of given interval, we
  * would be able to switch to PROBE state in V_nd6_delay seconds
  * as user expects.
  *
  * 2) Reschedule timer until original V_nd6_gctimer expires keeping
  * lle in STALE state (remaining timer value stored in lle_remtime).
  *
  * 3) Reschedule timer if packet was transmitted less that V_nd6_delay
  * seconds ago.
  *
  * Returns non-zero value if the entry is still STALE (storing
  * the next timer interval in @pdelay).
  *
  * Returns zero value if original timer expired or we need to switch to
  * PROBE (store that in @do_switch variable).
  */
 static int
 nd6_is_stale(struct llentry *lle, long *pdelay, int *do_switch)
 {
 	int nd_delay, nd_gctimer;
 	time_t lle_hittime;
 	long delay;
 
 	*do_switch = 0;
 	nd_gctimer = V_nd6_gctimer;
 	nd_delay = V_nd6_delay;
 
 	lle_hittime = llentry_get_hittime(lle);
 
 	if (lle_hittime == 0) {
 		/*
 		 * Datapath feedback has been requested upon entering
 		 * STALE state. No packets has been passed using this lle.
 		 * Ask for the timer reschedule and keep STALE state.
 		 */
 		delay = (long)(MIN(nd_gctimer, nd_delay));
 		delay *= hz;
 		if (lle->lle_remtime > delay)
 			lle->lle_remtime -= delay;
 		else {
 			delay = lle->lle_remtime;
 			lle->lle_remtime = 0;
 		}
 
 		if (delay == 0) {
 			/*
 			 * The original ng6_gctime timeout ended,
 			 * no more rescheduling.
 			 */
 			return (0);
 		}
 
 		*pdelay = delay;
 		return (1);
 	}
 
 	/*
 	 * Packet received. Verify timestamp
 	 */
 	delay = (long)(time_uptime - lle_hittime);
 	if (delay < nd_delay) {
 		/*
 		 * V_nd6_delay still not passed since the first
 		 * hit in STALE state.
 		 * Reschedule timer and return.
 		 */
 		*pdelay = (long)(nd_delay - delay) * hz;
 		return (1);
 	}
 
 	/* Request switching to probe */
 	*do_switch = 1;
 	return (0);
 }
 
 /*
  * Switch @lle state to new state optionally arming timers.
  *
  * Set noinline to be dtrace-friendly
  */
 __noinline void
 nd6_llinfo_setstate(struct llentry *lle, int newstate)
 {
 	struct ifnet *ifp;
 	int nd_gctimer, nd_delay;
 	long delay, remtime;
 
 	delay = 0;
 	remtime = 0;
 
 	switch (newstate) {
 	case ND6_LLINFO_INCOMPLETE:
 		ifp = lle->lle_tbl->llt_ifp;
 		delay = (long)ND_IFINFO(ifp)->retrans * hz / 1000;
 		break;
 	case ND6_LLINFO_REACHABLE:
 		if (!ND6_LLINFO_PERMANENT(lle)) {
 			ifp = lle->lle_tbl->llt_ifp;
 			delay = (long)ND_IFINFO(ifp)->reachable * hz;
 		}
 		break;
 	case ND6_LLINFO_STALE:
 
 		llentry_request_feedback(lle);
 		nd_delay = V_nd6_delay;
 		nd_gctimer = V_nd6_gctimer;
 
 		delay = (long)(MIN(nd_gctimer, nd_delay)) * hz;
 		remtime = (long)nd_gctimer * hz - delay;
 		break;
 	case ND6_LLINFO_DELAY:
 		lle->la_asked = 0;
 		delay = (long)V_nd6_delay * hz;
 		break;
 	}
 
 	if (delay > 0)
 		nd6_llinfo_settimer_locked(lle, delay);
 
 	lle->lle_remtime = remtime;
 	lle->ln_state = newstate;
 }
 
 /*
  * Timer-dependent part of nd state machine.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline void
 nd6_llinfo_timer(void *arg)
 {
 	struct epoch_tracker et;
 	struct llentry *ln;
 	struct in6_addr *dst, *pdst, *psrc, src;
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi;
 	int do_switch, send_ns;
 	long delay;
 
 	KASSERT(arg != NULL, ("%s: arg NULL", __func__));
 	ln = (struct llentry *)arg;
 	ifp = lltable_get_ifp(ln->lle_tbl);
 	CURVNET_SET(ifp->if_vnet);
 
 	ND6_RLOCK();
 	LLE_WLOCK(ln);
 	if (callout_pending(&ln->lle_timer)) {
 		/*
 		 * Here we are a bit odd here in the treatment of 
 		 * active/pending. If the pending bit is set, it got
 		 * rescheduled before I ran. The active
 		 * bit we ignore, since if it was stopped
 		 * in ll_tablefree() and was currently running
 		 * it would have return 0 so the code would
 		 * not have deleted it since the callout could
 		 * not be stopped so we want to go through
 		 * with the delete here now. If the callout
 		 * was restarted, the pending bit will be back on and
 		 * we just want to bail since the callout_reset would
 		 * return 1 and our reference would have been removed
 		 * by nd6_llinfo_settimer_locked above since canceled
 		 * would have been 1.
 		 */
 		LLE_WUNLOCK(ln);
 		ND6_RUNLOCK();
 		CURVNET_RESTORE();
 		return;
 	}
 	NET_EPOCH_ENTER(et);
 	ndi = ND_IFINFO(ifp);
 	send_ns = 0;
 	dst = &ln->r_l3addr.addr6;
 	pdst = dst;
 
 	if (ln->ln_ntick > 0) {
 		if (ln->ln_ntick > INT_MAX) {
 			ln->ln_ntick -= INT_MAX;
 			nd6_llinfo_settimer_locked(ln, INT_MAX);
 		} else {
 			ln->ln_ntick = 0;
 			nd6_llinfo_settimer_locked(ln, ln->ln_ntick);
 		}
 		goto done;
 	}
 
 	if (ln->la_flags & LLE_STATIC) {
 		goto done;
 	}
 
 	if (ln->la_flags & LLE_DELETED) {
 		nd6_free(&ln, 0);
 		goto done;
 	}
 
 	switch (ln->ln_state) {
 	case ND6_LLINFO_INCOMPLETE:
 		if (ln->la_asked < V_nd6_mmaxtries) {
 			ln->la_asked++;
 			send_ns = 1;
 			/* Send NS to multicast address */
 			pdst = NULL;
 		} else {
 			struct mbuf *m;
 
 			ICMP6STAT_ADD(icp6s_dropped, ln->la_numheld);
 
 			m = ln->la_hold;
 			if (m != NULL) {
 				/*
 				 * assuming every packet in la_hold has the
 				 * same IP header.  Send error after unlock.
 				 */
 				ln->la_hold = m->m_nextpkt;
 				m->m_nextpkt = NULL;
 				ln->la_numheld--;
 			}
 			nd6_free(&ln, 0);
 			if (m != NULL) {
 				struct mbuf *n = m;
 
 				/*
 				 * if there are any ummapped mbufs, we
 				 * must free them, rather than using
 				 * them for an ICMP, as they cannot be
 				 * checksummed.
 				 */
 				while ((n = n->m_next) != NULL) {
 					if (n->m_flags & M_EXTPG)
 						break;
 				}
 				if (n != NULL) {
 					m_freem(m);
 					m = NULL;
 				} else {
 					icmp6_error2(m, ICMP6_DST_UNREACH,
 					    ICMP6_DST_UNREACH_ADDR, 0, ifp);
 				}
 			}
 		}
 		break;
 	case ND6_LLINFO_REACHABLE:
 		if (!ND6_LLINFO_PERMANENT(ln))
 			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 		break;
 
 	case ND6_LLINFO_STALE:
 		if (nd6_is_stale(ln, &delay, &do_switch) != 0) {
 			/*
 			 * No packet has used this entry and GC timeout
 			 * has not been passed. Reschedule timer and
 			 * return.
 			 */
 			nd6_llinfo_settimer_locked(ln, delay);
 			break;
 		}
 
 		if (do_switch == 0) {
 			/*
 			 * GC timer has ended and entry hasn't been used.
 			 * Run Garbage collector (RFC 4861, 5.3)
 			 */
 			if (!ND6_LLINFO_PERMANENT(ln))
 				nd6_free(&ln, 1);
 			break;
 		}
 
 		/* Entry has been used AND delay timer has ended. */
 
 		/* FALLTHROUGH */
 
 	case ND6_LLINFO_DELAY:
 		if (ndi && (ndi->flags & ND6_IFF_PERFORMNUD) != 0) {
 			/* We need NUD */
 			ln->la_asked = 1;
 			nd6_llinfo_setstate(ln, ND6_LLINFO_PROBE);
 			send_ns = 1;
 		} else
 			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); /* XXX */
 		break;
 	case ND6_LLINFO_PROBE:
 		if (ln->la_asked < V_nd6_umaxtries) {
 			ln->la_asked++;
 			send_ns = 1;
 		} else {
 			nd6_free(&ln, 0);
 		}
 		break;
 	default:
 		panic("%s: paths in a dark night can be confusing: %d",
 		    __func__, ln->ln_state);
 	}
 done:
 	if (ln != NULL)
 		ND6_RUNLOCK();
 	if (send_ns != 0) {
 		nd6_llinfo_settimer_locked(ln, (long)ndi->retrans * hz / 1000);
 		psrc = nd6_llinfo_get_holdsrc(ln, &src);
 		LLE_FREE_LOCKED(ln);
 		ln = NULL;
 		nd6_ns_output(ifp, psrc, pdst, dst, NULL);
 	}
 
 	if (ln != NULL)
 		LLE_FREE_LOCKED(ln);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * ND6 timer routine to expire default route list and prefix list
  *
  * Callout handler; @arg is the vnet to operate on.  It runs the default
  * router timer, walks all IPv6 interface addresses to expire, deprecate
  * or (re)start DAD on them, then expires prefixes, and finally re-arms
  * itself to run again after V_nd6_prune * hz ticks.
  */
 void
 nd6_timer(void *arg)
 {
 	CURVNET_SET((struct vnet *) arg);
 	struct epoch_tracker et;
 	struct nd_prhead prl;
 	struct nd_prefix *pr, *npr;
 	struct ifnet *ifp;
 	struct in6_ifaddr *ia6, *nia6;
 	uint64_t genid;
 
 	/* Prefixes unlinked below are collected here and deleted unlocked. */
 	LIST_INIT(&prl);
 
 	NET_EPOCH_ENTER(et);
 	nd6_defrouter_timer();
 
 	/*
 	 * expire interface addresses.
 	 * in the past the loop was inside prefix expiry processing.
 	 * However, from a stricter spec-conformance standpoint, we should
 	 * rather separate address lifetimes and prefix lifetimes.
 	 *
 	 * XXXRW: in6_ifaddrhead locking.
 	 */
   addrloop:
 	CK_STAILQ_FOREACH_SAFE(ia6, &V_in6_ifaddrhead, ia_link, nia6) {
 		/* check address lifetime */
 		if (IFA6_IS_INVALID(ia6)) {
 			int regen = 0;
 
 			/*
 			 * If the expiring address is temporary, try
 			 * regenerating a new one.  This would be useful when
 			 * we suspended a laptop PC, then turned it on after a
 			 * period that could invalidate all temporary
 			 * addresses.  Although we may have to restart the
 			 * loop (see below), it must be after purging the
 			 * address.  Otherwise, we'd see an infinite loop of
 			 * regeneration.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 				if (regen_tmpaddr(ia6) == 0)
 					regen = 1;
 			}
 
 			in6_purgeaddr(&ia6->ia_ifa);
 
 			if (regen)
 				goto addrloop; /* XXX: see below */
 		} else if (IFA6_IS_DEPRECATED(ia6)) {
 			int oldflags = ia6->ia6_flags;
 
 			ia6->ia6_flags |= IN6_IFF_DEPRECATED;
 
 			/*
 			 * If a temporary address has just become deprecated,
 			 * regenerate a new one if possible.
 			 */
 			if (V_ip6_use_tempaddr &&
 			    (ia6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 			    (oldflags & IN6_IFF_DEPRECATED) == 0) {
 				if (regen_tmpaddr(ia6) == 0) {
 					/*
 					 * A new temporary address is
 					 * generated.
 					 * XXX: this means the address chain
 					 * has changed while we are still in
 					 * the loop.  Although the change
 					 * would not cause disaster (because
 					 * it's not a deletion, but an
 					 * addition,) we'd rather restart the
 					 * loop just for safety.  Or does this
 					 * significantly reduce performance??
 					 */
 					goto addrloop;
 				}
 			}
 		} else if ((ia6->ia6_flags & IN6_IFF_TENTATIVE) != 0) {
 			/*
 			 * Schedule DAD for a tentative address.  This happens
 			 * if the interface was down or not running
 			 * when the address was configured.
 			 */
 			int delay;
 
 			/* Random start delay below MAX_RTR_SOLICITATION_DELAY. */
 			delay = arc4random() %
 			    (MAX_RTR_SOLICITATION_DELAY * hz);
 			nd6_dad_start((struct ifaddr *)ia6, delay);
 		} else {
 			/*
 			 * Check status of the interface.  If it is down,
 			 * mark the address as tentative for future DAD.
 			 */
 			ifp = ia6->ia_ifp;
 			if ((ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0 &&
 			    ((ifp->if_flags & IFF_UP) == 0 ||
 			    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 			    (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)){
 				ia6->ia6_flags &= ~IN6_IFF_DUPLICATED;
 				ia6->ia6_flags |= IN6_IFF_TENTATIVE;
 			}
 
 			/*
 			 * A new RA might have made a deprecated address
 			 * preferred.
 			 */
 			ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 
 	ND6_WLOCK();
 restart:
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		/*
 		 * Expire prefixes. Since the pltime is only used for
 		 * autoconfigured addresses, pltime processing for prefixes is
 		 * not necessary.
 		 *
 		 * Only unlink after all derived addresses have expired. This
 		 * may not occur until two hours after the prefix has expired
 		 * per RFC 4862. If the prefix expires before its derived
 		 * addresses, mark it off-link. This will be done automatically
 		 * after unlinking if no address references remain.
 		 */
 		if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME ||
 		    time_uptime - pr->ndpr_lastupdate <= pr->ndpr_vltime)
 			continue;
 
 		if (pr->ndpr_addrcnt == 0) {
 			nd6_prefix_unlink(pr, &prl);
 			continue;
 		}
 		if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
 			/*
 			 * The ND6 lock is dropped around the offlink call;
 			 * a reference keeps pr alive meanwhile.  If the list
 			 * generation changed while unlocked, start over.
 			 */
 			genid = V_nd6_list_genid;
 			nd6_prefix_ref(pr);
 			ND6_WUNLOCK();
 			ND6_ONLINK_LOCK();
 			(void)nd6_prefix_offlink(pr);
 			ND6_ONLINK_UNLOCK();
 			ND6_WLOCK();
 			nd6_prefix_rele(pr);
 			if (genid != V_nd6_list_genid)
 				goto restart;
 		}
 	}
 	ND6_WUNLOCK();
 
 	/* Delete the prefixes that were unlinked above. */
 	while ((pr = LIST_FIRST(&prl)) != NULL) {
 		LIST_REMOVE(pr, ndpr_entry);
 		nd6_prefix_del(pr);
 	}
 
 	callout_reset(&V_nd6_timer_ch, V_nd6_prune * hz,
 	    nd6_timer, curvnet);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * ia6 - deprecated/invalidated temporary address
  */
 static int
 regen_tmpaddr(struct in6_ifaddr *ia6)
 {
 	struct ifaddr *ifa;
 	struct ifnet *ifp;
 	struct in6_ifaddr *public_ifa6 = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	ifp = ia6->ia_ifa.ifa_ifp;
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *it6;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		it6 = (struct in6_ifaddr *)ifa;
 
 		/* ignore no autoconf addresses. */
 		if ((it6->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 			continue;
 
 		/* ignore autoconf addresses with different prefixes. */
 		if (it6->ia6_ndpr == NULL || it6->ia6_ndpr != ia6->ia6_ndpr)
 			continue;
 
 		/*
 		 * Now we are looking at an autoconf address with the same
 		 * prefix as ours.  If the address is temporary and is still
 		 * preferred, do not create another one.  It would be rare, but
 		 * could happen, for example, when we resume a laptop PC after
 		 * a long period.
 		 */
 		if ((it6->ia6_flags & IN6_IFF_TEMPORARY) != 0 &&
 		    !IFA6_IS_DEPRECATED(it6)) {
 			public_ifa6 = NULL;
 			break;
 		}
 
 		/*
 		 * This is a public autoconf address that has the same prefix
 		 * as ours.  If it is preferred, keep it.  We can't break the
 		 * loop here, because there may be a still-preferred temporary
 		 * address with the prefix.
 		 */
 		if (!IFA6_IS_DEPRECATED(it6))
 			public_ifa6 = it6;
 	}
 	if (public_ifa6 != NULL)
 		ifa_ref(&public_ifa6->ia_ifa);
 
 	if (public_ifa6 != NULL) {
 		int e;
 
 		if ((e = in6_tmpifadd(public_ifa6, 0, 0)) != 0) {
 			ifa_free(&public_ifa6->ia_ifa);
 			log(LOG_NOTICE, "regen_tmpaddr: failed to create a new"
 			    " tmp addr,errno=%d\n", e);
 			return (-1);
 		}
 		ifa_free(&public_ifa6->ia_ifa);
 		return (0);
 	}
 
 	return (-1);
 }
 
 /*
  * Remove prefix and default router list entries corresponding to ifp. Neighbor
  * cache entries are freed in in6_domifdetach().
  */
 void
 nd6_purge(struct ifnet *ifp)
 {
 	struct nd_prhead prl;
 	struct nd_prefix *pr, *npr;
 
 	LIST_INIT(&prl);
 
 	/* Purge default router list entries toward ifp. */
 	nd6_defrouter_purge(ifp);
 
 	ND6_WLOCK();
 	/*
 	 * Remove prefixes on ifp. We should have already removed addresses on
 	 * this interface, so no addresses should be referencing these prefixes.
 	 */
 	LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, npr) {
 		if (pr->ndpr_ifp == ifp)
 			nd6_prefix_unlink(pr, &prl);
 	}
 	ND6_WUNLOCK();
 
 	/* Delete the unlinked prefix objects. */
 	while ((pr = LIST_FIRST(&prl)) != NULL) {
 		LIST_REMOVE(pr, ndpr_entry);
 		nd6_prefix_del(pr);
 	}
 
 	/* cancel default outgoing interface setting */
 	if (V_nd6_defifindex == ifp->if_index)
 		nd6_setdefaultiface(0);
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/* Refresh default router list. */
 		defrouter_select_fib(ifp->if_fib);
 	}
 }
 
 /* 
  * the caller acquires and releases the lock on the lltbls
  * Returns the llentry locked
  */
 struct llentry *
 nd6_lookup(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
 {
 	struct sockaddr_in6 sin6;
 	struct llentry *ln;
 
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_addr = *addr6;
 
 	IF_AFDATA_LOCK_ASSERT(ifp);
 
 	ln = lla_lookup(LLTABLE6(ifp), flags, (struct sockaddr *)&sin6);
 
 	return (ln);
 }
 
 static struct llentry *
 nd6_alloc(const struct in6_addr *addr6, int flags, struct ifnet *ifp)
 {
 	struct sockaddr_in6 sin6;
 	struct llentry *ln;
 
 	bzero(&sin6, sizeof(sin6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_addr = *addr6;
 
 	ln = lltable_alloc_entry(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6);
 	if (ln != NULL)
 		ln->ln_state = ND6_LLINFO_NOSTATE;
 
 	return (ln);
 }
 
 /*
  * Test whether a given IPv6 address can be a neighbor.
  */
 static bool
 nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 
 	/*
 	 * A link-local address is always a neighbor.
 	 * XXX: a link does not necessarily specify a single interface.
 	 */
 	if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
 		struct sockaddr_in6 sin6_copy;
 		u_int32_t zone;
 
 		/*
 		 * We need sin6_copy since sa6_recoverscope() may modify the
 		 * content (XXX).
 		 */
 		sin6_copy = *addr;
 		if (sa6_recoverscope(&sin6_copy))
 			return (0); /* XXX: should be impossible */
 		if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
 			return (0);
 		if (sin6_copy.sin6_scope_id == zone)
 			return (1);
 		else
 			return (0);
 	}
 	/* Checking global unicast */
 
 	/* If an address is directly reachable, it is a neigbor */
 	struct nhop_object *nh;
 	nh = fib6_lookup(ifp->if_fib, &addr->sin6_addr, 0, NHR_NONE, 0);
 	if (nh != NULL && nh->nh_aifp == ifp && (nh->nh_flags & NHF_GATEWAY) == 0)
 		return (true);
 
 	/*
 	 * Check prefixes with desired on-link state, as some may be not
 	 * installed in the routing table.
 	 */
 	bool matched = false;
 	struct nd_prefix *pr;
 	ND6_RLOCK();
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_ifp != ifp)
 			continue;
 		if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0)
 			continue;
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&pr->ndpr_prefix.sin6_addr,
 		    &addr->sin6_addr, &pr->ndpr_mask)) {
 			matched = true;
 			break;
 		}
 	}
 	ND6_RUNLOCK();
 	if (matched)
 		return (true);
 
 	/*
 	 * If the address is assigned on the node of the other side of
 	 * a p2p interface, the address should be a neighbor.
 	 */
 	if (ifp->if_flags & IFF_POINTOPOINT) {
 		struct ifaddr *ifa;
 
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sin6_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr)) {
 				return (true);
 			}
 		}
 	}
 
 	/*
 	 * If the default router list is empty, all addresses are regarded
 	 * as on-link, and thus, as a neighbor.
 	 */
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV &&
 	    nd6_defrouter_list_empty() &&
 	    V_nd6_defifindex == ifp->if_index) {
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Detect if a given IPv6 address identifies a neighbor on a given link.
  * XXX: should take care of the destination of a p2p link?
  *
  * Must be called inside the network epoch with the AFDATA lock not held
  * (both asserted).  Returns 1 when addr is a neighbor, 0 otherwise.
  */
 int
 nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
 {
 	struct llentry *cached;
 
 	NET_EPOCH_ASSERT();
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 
 	if (nd6_is_new_addr_neighbor(addr, ifp))
 		return (1);
 
 	/*
 	 * Even if the address matches none of our addresses, it might be
 	 * in the neighbor cache.
 	 */
 	cached = nd6_lookup(&addr->sin6_addr, LLE_SF(AF_INET6, 0), ifp);
 	if (cached == NULL)
 		return (0);
 
 	/* Only existence matters; drop the read lock taken by the lookup. */
 	LLE_RUNLOCK(cached);
 	return (1);
 }
 
 /*
  * Unlink and free every child entry hanging off lle.
  *
  * Must be called inside the network epoch with lle write-locked (both
  * asserted).  Each child is write-locked before it is unlinked and handed
  * to llentry_free().
  */
 static __noinline void
 nd6_free_children(struct llentry *lle)
 {
 	struct llentry *kid;
 
 	NET_EPOCH_ASSERT();
 	LLE_WLOCK_ASSERT(lle);
 
 	for (;;) {
 		kid = CK_SLIST_FIRST(&lle->lle_children);
 		if (kid == NULL)
 			break;
 		LLE_WLOCK(kid);
 		lltable_unlink_child_entry(kid);
 		llentry_free(kid);
 	}
 }
 
 /*
  * Tries to update @lle address/prepend data with new @lladdr, then does
  * the same for each of its child entries using the child's own family.
  *
  * Returns true on success, false if the link header for the parent cannot
  * be calculated (in which case nothing is modified).  A child whose header
  * cannot be calculated is left untouched.
  * In any case, @lle is returned wlocked.
  */
 static __noinline bool
 nd6_try_set_entry_addr_locked(struct ifnet *ifp, struct llentry *lle, char *lladdr)
 {
 	u_char hdr[LLE_MAX_LINKHDR];
 	struct llentry *kid;
 	size_t hdrlen;
 	int lladdr_off;
 
 	hdrlen = sizeof(hdr);
 	if (lltable_calc_llheader(ifp, AF_INET6, lladdr, hdr, &hdrlen,
 	    &lladdr_off) != 0)
 		return (false);
 
 	/* Update the parent entry first. */
 	lltable_set_entry_addr(ifp, lle, hdr, hdrlen, lladdr_off);
 
 	CK_SLIST_FOREACH(kid, &lle->lle_children, lle_child_next) {
 		LLE_WLOCK(kid);
 		/* Reset the in/out size before every calculation. */
 		hdrlen = sizeof(hdr);
 		if (lltable_calc_llheader(ifp, kid->r_family, lladdr, hdr,
 		    &hdrlen, &lladdr_off) == 0) {
 			lltable_set_entry_addr(ifp, kid, hdr, hdrlen,
 			    lladdr_off);
 			kid->ln_state = ND6_LLINFO_REACHABLE;
 		}
 		LLE_WUNLOCK(kid);
 	}
 
 	return (true);
 }
 
 /*
  * Update @lle (and its children) with the new link-layer address @lladdr.
  *
  * Must be called inside the network epoch with @lle write-locked (both
  * asserted).  Returns false when lltable_acquire_wlock() fails or when the
  * update itself fails; true otherwise.
  * NOTE(review): IF_AFDATA_WUNLOCK() is issued only after a successful
  * lltable_acquire_wlock(), so that call presumably returns with the AFDATA
  * write lock held -- confirm against its definition.
  */
 bool
 nd6_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle, char *lladdr)
 {
 	bool updated;
 
 	NET_EPOCH_ASSERT();
 	LLE_WLOCK_ASSERT(lle);
 
 	if (lltable_acquire_wlock(ifp, lle)) {
 		updated = nd6_try_set_entry_addr_locked(ifp, lle, lladdr);
 		IF_AFDATA_WUNLOCK(ifp);
 	} else
 		updated = false;
 
 	return (updated);
 }
 
 /*
  * Free an nd6 llinfo entry.
  * Since the function would cause significant changes in the kernel, DO NOT
  * make it global, unless you have a strong reason for the change, and are sure
  * that the change is safe.
  *
  * lnp - address of the caller's entry pointer; it is set to NULL here so
  *       the caller cannot keep using the entry.
  * gc  - non-zero when the entry is being freed for garbage collection.  In
  *       that case a STALE entry belonging to a default router with a
  *       non-zero expiration is kept and only gets its timer re-armed.
  *
  * Locking: the entry must be write-locked and the ND6 read lock must be
  * held on entry (both asserted).  The ND6 read lock is released here.
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline void
 nd6_free(struct llentry **lnp, int gc)
 {
 	struct ifnet *ifp;
 	struct llentry *ln;
 	struct nd_defrouter *dr;
 
 	/* Take over the entry and clear the caller's pointer. */
 	ln = *lnp;
 	*lnp = NULL;
 
 	LLE_WLOCK_ASSERT(ln);
 	ND6_RLOCK_ASSERT();
 
 	KASSERT((ln->la_flags & LLE_CHILD) == 0, ("child lle"));
 
 	/*
 	 * Find the default router entry (if any) for this neighbor while the
 	 * ND6 lock is still held.  dr is released with defrouter_rele() on
 	 * every exit path below.
 	 */
 	ifp = lltable_get_ifp(ln->lle_tbl);
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) != 0)
 		dr = defrouter_lookup_locked(&ln->r_l3addr.addr6, ifp);
 	else
 		dr = NULL;
 	ND6_RUNLOCK();
 
 	/* Notify listeners unless the entry is already marked deleted. */
 	if ((ln->la_flags & LLE_DELETED) == 0)
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_EXPIRED);
 
 	/*
 	 * we used to have pfctlinput(PRC_HOSTDEAD) here.
 	 * even though it is not harmful, it was not really necessary.
 	 */
 
 	/* cancel timer */
 	nd6_llinfo_settimer_locked(ln, -1);
 
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		if (dr != NULL && dr->expire &&
 		    ln->ln_state == ND6_LLINFO_STALE && gc) {
 			/*
 			 * If the reason for the deletion is just garbage
 			 * collection, and the neighbor is an active default
 			 * router, do not delete it.  Instead, reset the GC
 			 * timer using the router's lifetime.
 			 * Simply deleting the entry would affect default
 			 * router selection, which is not necessarily a good
 			 * thing, especially when we're using router preference
 			 * values.
 			 * XXX: the check for ln_state would be redundant,
 			 *      but we intentionally keep it just in case.
 			 */
 			if (dr->expire > time_uptime)
 				nd6_llinfo_settimer_locked(ln,
 				    (dr->expire - time_uptime) * hz);
 			else
 				nd6_llinfo_settimer_locked(ln,
 				    (long)V_nd6_gctimer * hz);
 
 			/* Entry survives: drop our ref, unlock and leave. */
 			LLE_REMREF(ln);
 			LLE_WUNLOCK(ln);
 			defrouter_rele(dr);
 			return;
 		}
 
 		if (dr) {
 			/*
 			 * Unreachability of a router might affect the default
 			 * router selection and on-link detection of advertised
 			 * prefixes.
 			 */
 
 			/*
 			 * Temporarily fake the state to choose a new default
 			 * router and to perform on-link determination of
 			 * prefixes correctly.
 			 * Below the state will be set correctly,
 			 * or the entry itself will be deleted.
 			 */
 			ln->ln_state = ND6_LLINFO_INCOMPLETE;
 		}
 
 		if (ln->ln_router || dr) {
 			/*
 			 * We need to unlock to avoid a LOR with rt6_flush() with the
 			 * rnh and for the calls to pfxlist_onlink_check() and
 			 * defrouter_select_fib() in the block further down for calls
 			 * into nd6_lookup().  We still hold a ref.
 			 */
 			LLE_WUNLOCK(ln);
 
 			/*
 			 * rt6_flush must be called whether or not the neighbor
 			 * is in the Default Router List.
 			 * See a corresponding comment in nd6_na_input().
 			 */
 			rt6_flush(&ln->r_l3addr.addr6, ifp);
 		}
 
 		if (dr) {
 			/*
 			 * Since defrouter_select_fib() does not affect the
 			 * on-link determination and MIP6 needs the check
 			 * before the default router selection, we perform
 			 * the check now.
 			 */
 			pfxlist_onlink_check();
 
 			/*
 			 * Refresh default router list.
 			 */
 			defrouter_select_fib(dr->ifp->if_fib);
 		}
 
 		/*
 		 * If this entry was added by an on-link redirect, remove the
 		 * corresponding host route.
 		 */
 		if (ln->la_flags & LLE_REDIRECT)
 			nd6_free_redirect(ln);
 
 		/*
 		 * Re-take the entry lock under exactly the same condition
 		 * that released it above.
 		 */
 		if (ln->ln_router || dr)
 			LLE_WLOCK(ln);
 	}
 
 	/*
 	 * Safe to unlock. We still hold an extra reference and will not
 	 * free(9) in llentry_free() if someone else holds one as well.
 	 * The entry lock is dropped and re-taken so that the AFDATA lock is
 	 * acquired before it.
 	 */
 	LLE_WUNLOCK(ln);
 	IF_AFDATA_LOCK(ifp);
 	LLE_WLOCK(ln);
 	/* Guard against race with other llentry_free(). */
 	if (ln->la_flags & LLE_LINKED) {
 		/* Remove callout reference */
 		LLE_REMREF(ln);
 		lltable_unlink_entry(ln->lle_tbl, ln);
 	}
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* Children go first; the parent is still write-locked here. */
 	nd6_free_children(ln);
 
 	llentry_free(ln);
 	if (dr != NULL)
 		defrouter_rele(dr);
 }
 
 /*
  * Route filter used by nd6_free_redirect(): returns 1 when the nexthop
  * carries NHF_REDIRECT, 0 otherwise.  rt and xap are not examined.
  */
 static int
 nd6_isdynrte(const struct rtentry *rt, const struct nhop_object *nh, void *xap)
 {
 
 	return ((nh->nh_flags & NHF_REDIRECT) != 0 ? 1 : 0);
 }
 
 /*
  * Remove the rtentry for the given llentry,
  * both of which were installed by a redirect.
  */
 static void
 nd6_free_redirect(const struct llentry *ln)
 {
 	int fibnum;
 	struct sockaddr_in6 sin6;
 	struct rib_cmd_info rc;
 	struct epoch_tracker et;
 
 	lltable_fill_sa_entry(ln, (struct sockaddr *)&sin6);
 
 	NET_EPOCH_ENTER(et);
 	for (fibnum = 0; fibnum < rt_numfibs; fibnum++)
 		rib_del_route_px(fibnum, (struct sockaddr *)&sin6, 128,
 		    nd6_isdynrte, NULL, 0, &rc);
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Updates status of the default router route.
  *
  * When the old nexthop of a routing change is flagged NHF_DEFAULT, the
  * matching default router entry (if one exists) is marked as no longer
  * installed.  _cbdata is unused.
  */
 static void
 check_release_defrouter(const struct rib_cmd_info *rc, void *_cbdata)
 {
 	struct nd_defrouter *dr;
 	struct nhop_object *old_nh;
 
 	old_nh = rc->rc_nh_old;
 	if (old_nh == NULL || (old_nh->nh_flags & NHF_DEFAULT) == 0)
 		return;
 
 	dr = defrouter_lookup(&old_nh->gw6_sa.sin6_addr, old_nh->nh_ifp);
 	if (dr == NULL)
 		return;
 
 	dr->installed = 0;
 	defrouter_rele(dr);
 }
 
 /*
  * Routing table change callback: feeds the notification to
  * check_release_defrouter().  With ROUTE_MPATH the notification is passed
  * through rib_decompose_notification() first; otherwise it is delivered
  * directly.  rnh and arg are unused.
  */
 void
 nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc, void *arg)
 {
 
 #ifndef ROUTE_MPATH
 	check_release_defrouter(rc, NULL);
 #else
 	rib_decompose_notification(rc, check_release_defrouter, NULL);
 #endif
 }
 
 /*
  * Handle the ND6-related ioctls for ifp.
  *
  * cmd  - ioctl command; commands not listed in the switch are ignored and
  *        0 is returned.
  * data - command argument; depending on cmd it is interpreted as a
  *        struct in6_ndireq, struct in6_nbrinfo or struct in6_ndifreq.
  * ifp  - interface the request applies to.
  *
  * Returns 0 on success, EPFNOSUPPORT when ifp has no IPv6 per-interface
  * data, EINVAL for an out-of-range link MTU or an unknown neighbor, or the
  * error reported by in6_setscope()/nd6_setdefaultiface().
  */
 int
 nd6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 {
 	struct in6_ndireq *ndi = (struct in6_ndireq *)data;
 	struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
 	struct in6_ndifreq *ndif = (struct in6_ndifreq *)data;
 	struct epoch_tracker et;
 	int error = 0;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return (EPFNOSUPPORT);
 	switch (cmd) {
 	case OSIOCGIFINFO_IN6:
 #define ND	ndi->ndi
 		/* XXX: old ndp(8) assumes a positive value for linkmtu. */
 		bzero(&ND, sizeof(ND));
 		ND.linkmtu = IN6_LINKMTU(ifp);
 		ND.maxmtu = ND_IFINFO(ifp)->maxmtu;
 		ND.basereachable = ND_IFINFO(ifp)->basereachable;
 		ND.reachable = ND_IFINFO(ifp)->reachable;
 		ND.retrans = ND_IFINFO(ifp)->retrans;
 		ND.flags = ND_IFINFO(ifp)->flags;
 		ND.recalctm = ND_IFINFO(ifp)->recalctm;
 		ND.chlim = ND_IFINFO(ifp)->chlim;
 		break;
 	case SIOCGIFINFO_IN6:
 		/* Copy the whole per-interface ND structure out. */
 		ND = *ND_IFINFO(ifp);
 		break;
 	case SIOCSIFINFO_IN6:
 		/*
 		 * used to change host variables from userland.
 		 * intended for a use on router to reflect RA configurations.
 		 */
 		/* 0 means 'unspecified' */
 		if (ND.linkmtu != 0) {
 			if (ND.linkmtu < IPV6_MMTU ||
 			    ND.linkmtu > IN6_LINKMTU(ifp)) {
 				error = EINVAL;
 				break;
 			}
 			ND_IFINFO(ifp)->linkmtu = ND.linkmtu;
 		}
 
 		if (ND.basereachable != 0) {
 			int obasereachable = ND_IFINFO(ifp)->basereachable;
 
 			ND_IFINFO(ifp)->basereachable = ND.basereachable;
 			/* Recompute the reachable time only on a change. */
 			if (ND.basereachable != obasereachable)
 				ND_IFINFO(ifp)->reachable =
 				    ND_COMPUTE_RTIME(ND.basereachable);
 		}
 		if (ND.retrans != 0)
 			ND_IFINFO(ifp)->retrans = ND.retrans;
 		if (ND.chlim != 0)
 			ND_IFINFO(ifp)->chlim = ND.chlim;
 		/* FALLTHROUGH */
 	case SIOCSIFINFO_FLAGS:
 	{
 		struct ifaddr *ifa;
 		struct in6_ifaddr *ia;
 
 		if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 		    !(ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 1->0 transition */
 
 			/*
 			 * If the interface is marked as ND6_IFF_IFDISABLED and
 			 * has an link-local address with IN6_IFF_DUPLICATED,
 			 * do not clear ND6_IFF_IFDISABLED.
 			 * See RFC 4862, Section 5.4.5.
 			 */
 			NET_EPOCH_ENTER(et);
 			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 				if (ifa->ifa_addr->sa_family != AF_INET6)
 					continue;
 				ia = (struct in6_ifaddr *)ifa;
 				if ((ia->ia6_flags & IN6_IFF_DUPLICATED) &&
 				    IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
 					break;
 			}
 			NET_EPOCH_EXIT(et);
 
 			/* ifa is non-NULL only if the scan broke out early. */
 			if (ifa != NULL) {
 				/* LLA is duplicated. */
 				ND.flags |= ND6_IFF_IFDISABLED;
 				log(LOG_ERR, "Cannot enable an interface"
 				    " with a link-local address marked"
 				    " duplicate.\n");
 			} else {
 				ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
 				if (ifp->if_flags & IFF_UP)
 					in6_if_up(ifp);
 			}
 		} else if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
 			    (ND.flags & ND6_IFF_IFDISABLED)) {
 			/* ifdisabled 0->1 transition */
 			/* Mark all IPv6 address as tentative. */
 
 			ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
 			if (V_ip6_dad_count > 0 &&
 			    (ND_IFINFO(ifp)->flags & ND6_IFF_NO_DAD) == 0) {
 				NET_EPOCH_ENTER(et);
 				CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead,
 				    ifa_link) {
 					if (ifa->ifa_addr->sa_family !=
 					    AF_INET6)
 						continue;
 					ia = (struct in6_ifaddr *)ifa;
 					ia->ia6_flags |= IN6_IFF_TENTATIVE;
 				}
 				NET_EPOCH_EXIT(et);
 			}
 		}
 
 		if (ND.flags & ND6_IFF_AUTO_LINKLOCAL) {
 			if (!(ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL)) {
 				/* auto_linklocal 0->1 transition */
 
 				/* If no link-local address on ifp, configure */
 				ND_IFINFO(ifp)->flags |= ND6_IFF_AUTO_LINKLOCAL;
 				in6_ifattach(ifp, NULL);
 			} else if (!(ND.flags & ND6_IFF_IFDISABLED) &&
 			    ifp->if_flags & IFF_UP) {
 				/*
 				 * When the IF already has
 				 * ND6_IFF_AUTO_LINKLOCAL, no link-local
 				 * address is assigned, and IFF_UP, try to
 				 * assign one.
 				 */
 				NET_EPOCH_ENTER(et);
 				CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead,
 				    ifa_link) {
 					if (ifa->ifa_addr->sa_family !=
 					    AF_INET6)
 						continue;
 					ia = (struct in6_ifaddr *)ifa;
 					if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
 						break;
 				}
 				NET_EPOCH_EXIT(et);
 				/*
 				 * The scan above stops at the first link-local
 				 * address, so ifa is NULL exactly when the
 				 * interface has none.
 				 */
 				if (ifa == NULL)
 					/* No LLA is configured. */
 					in6_ifattach(ifp, NULL);
 			}
 		}
 		/* Finally store the (possibly adjusted) requested flags. */
 		ND_IFINFO(ifp)->flags = ND.flags;
 		break;
 	}
 #undef ND
 	case SIOCSNDFLUSH_IN6:	/* XXX: the ioctl name is confusing... */
 		/* sync kernel routing table with the default router list */
 		defrouter_reset();
 		defrouter_select_fib(RT_ALL_FIBS);
 		break;
 	case SIOCSPFXFLUSH_IN6:
 	{
 		/* flush all the prefix advertised by routers */
 		struct in6_ifaddr *ia, *ia_next;
 		struct nd_prefix *pr, *next;
 		struct nd_prhead prl;
 
 		LIST_INIT(&prl);
 
 		/* Collect the RA-derived prefixes under the ND6 write lock. */
 		ND6_WLOCK();
 		LIST_FOREACH_SAFE(pr, &V_nd_prefix, ndpr_entry, next) {
 			if (pr->ndpr_raf_ra_derived)
 				nd6_prefix_unlink(pr, &prl);
 		}
 		ND6_WUNLOCK();
 
 		/*
 		 * For each collected prefix, purge the autoconfigured
 		 * addresses that reference it, then delete the prefix.
 		 */
 		while ((pr = LIST_FIRST(&prl)) != NULL) {
 			LIST_REMOVE(pr, ndpr_entry);
 			/* XXXRW: in6_ifaddrhead locking. */
 			CK_STAILQ_FOREACH_SAFE(ia, &V_in6_ifaddrhead, ia_link,
 			    ia_next) {
 				if ((ia->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 					continue;
 
 				if (ia->ia6_ndpr == pr)
 					in6_purgeaddr(&ia->ia_ifa);
 			}
 			nd6_prefix_del(pr);
 		}
 		break;
 	}
 	case SIOCSRTRFLUSH_IN6:
 	{
 		/* flush all the default routers */
 
 		defrouter_reset();
 		nd6_defrouter_flush_all();
 		defrouter_select_fib(RT_ALL_FIBS);
 		break;
 	}
 	case SIOCGNBRINFO_IN6:
 	{
 		struct llentry *ln;
 		struct in6_addr nb_addr = nbi->addr; /* make local for safety */
 
 		if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
 			return (error);
 
 		NET_EPOCH_ENTER(et);
 		ln = nd6_lookup(&nb_addr, LLE_SF(AF_INET6, 0), ifp);
 		NET_EPOCH_EXIT(et);
 
 		if (ln == NULL) {
 			error = EINVAL;
 			break;
 		}
 		nbi->state = ln->ln_state;
 		nbi->asked = ln->la_asked;
 		nbi->isrouter = ln->ln_router;
 		if (ln->la_expire == 0)
 			nbi->expire = 0;
 		else
 			/* Shift by (time_second - time_uptime) for export. */
 			nbi->expire = ln->la_expire + ln->lle_remtime / hz +
 			    (time_second - time_uptime);
 		LLE_RUNLOCK(ln);
 		break;
 	}
 	case SIOCGDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		ndif->ifindex = V_nd6_defifindex;
 		break;
 	case SIOCSDEFIFACE_IN6:	/* XXX: should be implemented as a sysctl? */
 		return (nd6_setdefaultiface(ndif->ifindex));
 	}
 	return (error);
 }
 
 /*
  * Calculates new isRouter value based on provided parameters and
  * returns it.
  *
  * type      - ICMP6 type; only the low 8 bits are examined.
  * code      - ICMP6 code; only consulted for redirects.
  * is_new    - non-zero if the neighbor cache entry was just created.
  * old_addr  - non-zero if the entry already had a link-layer address.
  * new_addr  - non-zero if the packet carried a link-layer address.
  * ln_router - current IsRouter value, returned unchanged when no rule
  *             below applies.
  *
  * Rules per message type:
  *   RS:       always clear.
  *   NS:       clear if the entry is new, otherwise keep.
  *   redirect: set for a redirect to a better router; otherwise clear if
  *             the entry is new, else keep.  [RFC 2461, sec 8.3]
  *   RA:       set if a link-layer address is known: for an existing entry
  *             either the old or the new address counts, for a new entry
  *             only the new address does.  An existing entry without any
  *             link-layer address is deliberately left unchanged.
  *   other:    keep.
  */
 static int
 nd6_is_router(int type, int code, int is_new, int old_addr, int new_addr,
     int ln_router)
 {
 	const int icmp6_type = type & 0xff;
 
 	if (icmp6_type == ND_ROUTER_SOLICIT) {
 		/* is_router flag must always be cleared. */
 		return (0);
 	}
 
 	if (icmp6_type == ND_NEIGHBOR_SOLICIT) {
 		/* New entry must have is_router flag cleared. */
 		return (is_new ? 0 : ln_router);
 	}
 
 	if (icmp6_type == ND_REDIRECT) {
 		/* A redirect to a better router always sets the flag. */
 		if (code == ND_REDIRECT_ROUTER)
 			return (1);
 		return (is_new ? 0 : ln_router);
 	}
 
 	if (icmp6_type == ND_ROUTER_ADVERT) {
 		/* Mark an entry with lladdr as a router. */
 		if (is_new ? new_addr : (old_addr || new_addr))
 			return (1);
 		return (ln_router);
 	}
 
 	return (ln_router);
 }
 
 /*
  * Create neighbor cache entry and cache link-layer address,
  * on reception of inbound ND6 packets.  (RS/RA/NS/redirect)
  *
  * ifp       - receiving interface (must not be NULL)
  * from      - source address of the packet (must not be NULL); nothing is
  *             done for the unspecified address
  * lladdr    - link-layer address carried by the packet, or NULL if none
  * lladdrlen - length of lladdr; not examined here, validation is left to
  *             the caller
  * type - ICMP6 type
  * code - type dependent information
  *
  * Must be called inside the network epoch with the AFDATA lock not held
  * (both asserted).  Entries flagged LLE_STATIC are never modified.
  */
 void
 nd6_cache_lladdr(struct ifnet *ifp, struct in6_addr *from, char *lladdr,
     int lladdrlen, int type, int code)
 {
 	struct llentry *ln = NULL, *ln_tmp;
 	int is_newentry;
 	int do_update;
 	int olladdr;
 	int llchange;
 	int flags;
 	uint16_t router = 0;
 	struct mbuf *chain = NULL;
 	u_char linkhdr[LLE_MAX_LINKHDR];
 	size_t linkhdrsize;
 	int lladdr_off;
 
 	NET_EPOCH_ASSERT();
 	IF_AFDATA_UNLOCK_ASSERT(ifp);
 
 	KASSERT(ifp != NULL, ("%s: ifp == NULL", __func__));
 	KASSERT(from != NULL, ("%s: from == NULL", __func__));
 
 	/* nothing must be updated for unspecified address */
 	if (IN6_IS_ADDR_UNSPECIFIED(from))
 		return;
 
 	/*
 	 * Validation about ifp->if_addrlen and lladdrlen must be done in
 	 * the caller.
 	 *
 	 * XXX If the link does not have link-layer address, what should
 	 * we do? (ifp->if_addrlen == 0)
 	 * Spec says nothing in sections for RA, RS and NA.  There's small
 	 * description on it in NS section (RFC 2461 7.2.3).
 	 */
 	/*
 	 * The lookup asks for LLE_EXCLUSIVE only when there is an address to
 	 * record.  "flags" keeps track of which lock mode is held so that
 	 * the matching unlock is used further down.
 	 */
 	flags = lladdr ? LLE_EXCLUSIVE : 0;
 	ln = nd6_lookup(from, LLE_SF(AF_INET6, flags), ifp);
 	is_newentry = 0;
 	if (ln == NULL) {
 		/* Whatever entry we end up with below is write-locked. */
 		flags |= LLE_EXCLUSIVE;
 		ln = nd6_alloc(from, 0, ifp);
 		if (ln == NULL)
 			return;
 
 		/*
 		 * Since we already know all the data for the new entry,
 		 * fill it before insertion.
 		 */
 		if (lladdr != NULL) {
 			linkhdrsize = sizeof(linkhdr);
 			if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
 			    linkhdr, &linkhdrsize, &lladdr_off) != 0) {
 				lltable_free_entry(LLTABLE6(ifp), ln);
 				return;
 			}
 			lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize,
 			    lladdr_off);
 		}
 
 		IF_AFDATA_WLOCK(ifp);
 		LLE_WLOCK(ln);
 		/* Prefer any existing lle over newly-created one */
 		ln_tmp = nd6_lookup(from, LLE_SF(AF_INET6, LLE_EXCLUSIVE), ifp);
 		if (ln_tmp == NULL)
 			lltable_link_entry(LLTABLE6(ifp), ln);
 		IF_AFDATA_WUNLOCK(ifp);
 		if (ln_tmp == NULL) {
 			/* No existing lle, mark as new entry (6,7) */
 			is_newentry = 1;
 			if (lladdr != NULL) {	/* (7) */
 				nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 				EVENTHANDLER_INVOKE(lle_event, ln,
 				    LLENTRY_RESOLVED);
 			}
 		} else {
 			/* Lost the race: drop ours and use the linked one. */
 			lltable_free_entry(LLTABLE6(ifp), ln);
 			ln = ln_tmp;
 			ln_tmp = NULL;
 		}
 	} 
 	/* do nothing if static ndp is set */
 	if ((ln->la_flags & LLE_STATIC)) {
 		if (flags & LLE_EXCLUSIVE)
 			LLE_WUNLOCK(ln);
 		else
 			LLE_RUNLOCK(ln);
 		return;
 	}
 
 	/*
 	 * olladdr:  the entry already holds a valid link-layer address.
 	 * llchange: the packet's address differs from it, or supplies one
 	 *           where there was none.
 	 */
 	olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
 	if (olladdr && lladdr) {
 		llchange = bcmp(lladdr, ln->ll_addr,
 		    ifp->if_addrlen);
 	} else if (!olladdr && lladdr)
 		llchange = 1;
 	else
 		llchange = 0;
 
 	/*
 	 * newentry olladdr  lladdr  llchange	(*=record)
 	 *	0	n	n	--	(1)
 	 *	0	y	n	--	(2)
 	 *	0	n	y	y	(3) * STALE
 	 *	0	y	y	n	(4) *
 	 *	0	y	y	y	(5) * STALE
 	 *	1	--	n	--	(6)   NOSTATE(= PASSIVE)
 	 *	1	--	y	--	(7) * STALE
 	 */
 
 	do_update = 0;
 	if (is_newentry == 0 && llchange != 0) {
 		do_update = 1;	/* (3,5) */
 
 		/*
 		 * Record source link-layer address
 		 * XXX is it dependent to ifp->if_type?
 		 */
 		if (!nd6_try_set_entry_addr(ifp, ln, lladdr)) {
 			/* Entry was deleted */
 			LLE_WUNLOCK(ln);
 			return;
 		}
 
 		nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 
 		/* Take over queued packets; they are sent after unlocking. */
 		if (ln->la_hold != NULL)
 			chain = nd6_grab_holdchain(ln);
 	}
 
 	/* Calculates new router status */
 	router = nd6_is_router(type, code, is_newentry, olladdr,
 	    lladdr != NULL ? 1 : 0, ln->ln_router);
 
 	ln->ln_router = router;
 	/* Mark non-router redirects with special flag */
 	if ((type & 0xFF) == ND_REDIRECT && code != ND_REDIRECT_ROUTER)
 		ln->la_flags |= LLE_REDIRECT;
 
 	/* Release the entry in the mode it was acquired in. */
 	if (flags & LLE_EXCLUSIVE)
 		LLE_WUNLOCK(ln);
 	else
 		LLE_RUNLOCK(ln);
 
 	/* Flush held packets now that the entry lock has been dropped. */
 	if (chain != NULL)
 		nd6_flush_holdchain(ifp, ln, chain);
 	if (do_update)
 		nd6_flush_children_holdchain(ifp, ln);
 
 	/*
 	 * When the link-layer address of a router changes, select the
 	 * best router again.  In particular, when the neighbor entry is newly
 	 * created, it might affect the selection policy.
 	 * Question: can we restrict the first condition to the "is_newentry"
 	 * case?
 	 * XXX: when we hear an RA from a new router with the link-layer
 	 * address option, defrouter_select_fib() is called twice, since
 	 * defrtrlist_update called the function as well.  However, I believe
 	 * we can compromise the overhead, since it only happens the first
 	 * time.
 	 * XXX: although defrouter_select_fib() should not have a bad effect
 	 * for those are not autoconfigured hosts, we explicitly avoid such
 	 * cases for safety.
 	 */
 	if ((do_update || is_newentry) && router &&
 	    ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV) {
 		/*
 		 * guaranteed recursion
 		 */
 		defrouter_select_fib(ifp->if_fib);
 	}
 }
 
 /*
  * Periodic ND6 timer.  arg is the vnet the timer belongs to.
  *
  * Re-arms itself every ND6_SLOWTIMER_INTERVAL seconds and, for every
  * interface with IPv6 data, counts down recalctm; when it runs out the
  * reachable time is re-randomized from basereachable.
  */
 static void
 nd6_slowtimo(void *arg)
 {
 	struct epoch_tracker et;
 	CURVNET_SET((struct vnet *) arg);
 	struct nd_ifinfo *ndi;
 	struct ifnet *ifp;
 
 	/* Schedule the next run first. */
 	callout_reset(&V_nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
 	    nd6_slowtimo, curvnet);
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (ifp->if_afdata[AF_INET6] == NULL)
 			continue;
 
 		ndi = ND_IFINFO(ifp);
 		/* basereachable == 0 means not initialized yet. */
 		if (ndi->basereachable == 0)
 			continue;
 
 		ndi->recalctm -= ND6_SLOWTIMER_INTERVAL;
 		if (ndi->recalctm > 0)
 			continue;
 
 		/*
 		 * Since reachable time rarely changes by router
 		 * advertisements, we SHOULD insure that a new random
 		 * value gets recomputed at least once every few hours.
 		 * (RFC 2461, 6.3.4)
 		 */
 		ndi->recalctm = V_nd6_recalc_reachtm_interval;
 		ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
 	}
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Detach and return the chain of packets held on ln, leaving the entry
  * with an empty hold queue.  ln must be write-locked (asserted).
  *
  * As a side effect a STALE entry is moved to DELAY.
  */
 struct mbuf *
 nd6_grab_holdchain(struct llentry *ln)
 {
 	struct mbuf *held;
 
 	LLE_WLOCK_ASSERT(ln);
 
 	held = ln->la_hold;
 	ln->la_hold = NULL;
 	ln->la_numheld = 0;
 
 	if (ln->ln_state != ND6_LLINFO_STALE)
 		return (held);
 
 	/*
 	 * The first time we send a packet to a neighbor whose entry is
 	 * STALE, we have to change the state to DELAY and set a timer to
 	 * expire in DELAY_FIRST_PROBE_TIME seconds to ensure we do
 	 * neighbor unreachability detection on expiration.
 	 * (RFC 2461 7.3.3)
 	 */
 	nd6_llinfo_setstate(ln, ND6_LLINFO_DELAY);
 
 	return (held);
 }
 
 int
 nd6_output_ifp(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *m,
     struct sockaddr_in6 *dst, struct route *ro)
 {
 	int error;
 	int ip6len;
 	struct ip6_hdr *ip6;
 	struct m_tag *mtag;
 
 #ifdef MAC
 	mac_netinet6_nd6_send(ifp, m);
 #endif
 
 	/*
 	 * If called from nd6_ns_output() (NS), nd6_na_output() (NA),
 	 * icmp6_redirect_output() (REDIRECT) or from rip6_output() (RS, RA
 	 * as handled by rtsol and rtadvd), mbufs will be tagged for SeND
 	 * to be diverted to user space.  When re-injected into the kernel,
 	 * send_output() will directly dispatch them to the outgoing interface.
 	 */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_find(m, PACKET_TAG_ND_OUTGOING, NULL);
 		if (mtag != NULL) {
 			ip6 = mtod(m, struct ip6_hdr *);
 			ip6len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen);
 			/* Use the SEND socket */
 			error = send_sendso_input_hook(m, ifp, SND_OUT,
 			    ip6len);
 			/* -1 == no app on SEND socket */
 			if (error == 0 || error != -1)
 			    return (error);
 		}
 	}
 
 	m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 	IP_PROBE(send, NULL, NULL, mtod(m, struct ip6_hdr *), ifp, NULL,
 	    mtod(m, struct ip6_hdr *));
 
 	if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 		origifp = ifp;
 
 	error = (*ifp->if_output)(origifp, m, (struct sockaddr *)dst, ro);
 	return (error);
 }
 
 /*
  * Lookup link header for @sa_dst address. Stores found
  * data in @desten buffer. Copy of lle ln_flags can be also
  * saved in @pflags if @pflags is non-NULL.
  *
  * The address family used for the lookup is taken from the upper 16 bits
  * of @gw_flags.  When @plle is non-NULL, a referenced pointer to the
  * entry is stored there on the fast path.
  *
  * If destination LLE does not exists or lle state modification
  * is required, call "slow" version.
  *
  * Return values:
  * - 0 on success (address copied to buffer).
  * - EWOULDBLOCK (no local error, but address is still unresolved)
  * - other errors (alloc failure, etc)
  */
 int
 nd6_resolve(struct ifnet *ifp, int gw_flags, struct mbuf *m,
     const struct sockaddr *sa_dst, u_char *desten, uint32_t *pflags,
     struct llentry **plle)
 {
 	const struct sockaddr_in6 *dst6;
 	struct llentry *ln;
 	int family, iftype, lookup_flags;
 
 	NET_EPOCH_ASSERT();
 
 	if (pflags != NULL)
 		*pflags = 0;
 
 	dst6 = (const struct sockaddr_in6 *)sa_dst;
 
 	/* discard the packet if IPv6 operation is disabled on the interface */
 	if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0) {
 		m_freem(m);
 		return (ENETDOWN); /* better error? */
 	}
 
 	/* Multicast is mapped directly, but only on Ethernet-like links. */
 	if (m != NULL && (m->m_flags & M_MCAST) != 0) {
 		iftype = ifp->if_type;
 		if (iftype == IFT_ETHER || iftype == IFT_L2VLAN ||
 		    iftype == IFT_BRIDGE) {
 			ETHER_MAP_IPV6_MULTICAST(&dst6->sin6_addr,
 						 desten);
 			return (0);
 		}
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	family = gw_flags >> 16;
 	lookup_flags = (plle != NULL) ? LLE_EXCLUSIVE : LLE_UNLOCKED;
 	ln = nd6_lookup(&dst6->sin6_addr, LLE_SF(family, lookup_flags), ifp);
 	if (ln != NULL) {
 		if ((ln->r_flags & RLLE_VALID) != 0) {
 			/* Entry found, let's copy lle info */
 			bcopy(ln->r_linkdata, desten, ln->r_hdrlen);
 			if (pflags != NULL)
 				*pflags = LLE_VALID |
 				    (ln->r_flags & RLLE_IFADDR);
 			llentry_provide_feedback(ln);
 			if (plle != NULL) {
 				LLE_ADDREF(ln);
 				*plle = ln;
 				LLE_WUNLOCK(ln);
 			}
 			return (0);
 		}
 		/* Not usable: drop the lock taken for the plle case. */
 		if (plle != NULL)
 			LLE_WUNLOCK(ln);
 	}
 
 	return (nd6_resolve_slow(ifp, family, 0, m, dst6, desten, pflags, plle));
 }
 
 /*
  * Finds or creates a new llentry for @addr and @family.
  * Returns wlocked llentry or NULL.
  *
  * For @family other than AF_INET6 the returned entry is the child entry
  * for that family, attached to the AF_INET6 parent entry of @addr.
  *
  * Child LLEs.
  *
  * Do not have their own state machine (gets marked as static)
  *  settimer bails out for child LLEs just in case.
  *
  * Locking order: parent lle gets locked first, then goes the child.
  */
 static __noinline struct llentry *
 nd6_get_llentry(struct ifnet *ifp, const struct in6_addr *addr, int family)
 {
 	struct llentry *parent, *kid, *found;
 
 	kid = NULL;
 	parent = nd6_alloc(addr, 0, ifp);
 	if (parent == NULL) {
 		char ip6buf[INET6_ADDRSTRLEN];
 
 		log(LOG_DEBUG,
 		    "nd6_get_llentry: can't allocate llinfo for %s "
 		    "(ln=%p)\n",
 		    ip6_sprintf(ip6buf, addr), parent);
 		return (NULL);
 	}
 
 	/* A non-IPv6 family needs a child entry next to the parent. */
 	if (family != AF_INET6) {
 		kid = nd6_alloc(addr, 0, ifp);
 		if (kid == NULL) {
 			lltable_free_entry(LLTABLE6(ifp), parent);
 			return (NULL);
 		}
 		kid->r_family = family;
 		kid->la_flags |= LLE_CHILD | LLE_STATIC;
 		kid->ln_state = ND6_LLINFO_INCOMPLETE;
 	}
 
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(parent);
 	/* Prefer any existing entry over newly-created one */
 	found = nd6_lookup(addr, LLE_SF(AF_INET6, LLE_EXCLUSIVE), ifp);
 	if (found != NULL) {
 		lltable_free_entry(LLTABLE6(ifp), parent);
 		parent = found;
 	} else
 		lltable_link_entry(LLTABLE6(ifp), parent);
 
 	if (kid != NULL) {
 		/* Check if child lle for the same family exists */
 		found = llentry_lookup_family(parent, kid->r_family);
 		LLE_WLOCK(kid);
 		if (found != NULL) {
 			/* child lle already exists, free newly-created one */
 			lltable_free_entry(LLTABLE6(ifp), kid);
 			kid = found;
 		} else {
 			/* Attach */
 			lltable_link_child_entry(parent, kid);
 		}
 		/* From here on the child is what gets returned. */
 		LLE_WUNLOCK(parent);
 		parent = kid;
 	}
 	IF_AFDATA_WUNLOCK(ifp);
 	return (parent);
 }
 
 /*
  * Do L2 address resolution for the @dst address.  On success the
  * link-layer data is stored in the @desten buffer.  A copy of the lle
  * la_flags is also saved in @pflags if @pflags is non-NULL, and a
  * referenced pointer to the entry is stored in @plle if @plle is
  * non-NULL.
  *
  * Heavy version.
  * The function assumes that the destination LLE does not exist,
  * is invalid or stale, so the LLE_EXCLUSIVE lock needs to be acquired.
  *
  * @flags: when LLE_ADDRONLY is set, ll_addr (if_addrlen bytes) is copied
  * to @desten; otherwise r_linkdata (r_hdrlen bytes) is copied.
  * @m: packet to queue on the entry while resolution is pending; callers
  * in this file may pass NULL.
  *
  * Return values:
  * - 0 if the entry is past the INCOMPLETE state and data was copied.
  * - ENOBUFS if no entry could be found or created (@m is freed).
  * - EWOULDBLOCK if resolution is still pending (@m, if any, was queued).
  *
  * Set noinline to be dtrace-friendly
  */
 static __noinline int
 nd6_resolve_slow(struct ifnet *ifp, int family, int flags, struct mbuf *m,
     const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags,
     struct llentry **plle)
 {
 	struct llentry *lle = NULL;
 	struct in6_addr *psrc, src;
 	int send_ns, ll_len;
 	char *lladdr;
 
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * Address resolution or Neighbor Unreachability Detection
 	 * for the next hop.
 	 * At this point, the destination of the packet must be a unicast
 	 * or an anycast address (i.e. not a multicast).
 	 */
 	lle = nd6_lookup(&dst->sin6_addr, LLE_SF(family, LLE_EXCLUSIVE), ifp);
 	if ((lle == NULL) && nd6_is_addr_neighbor(dst, ifp))  {
 		/*
 		 * Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
 		 * the condition below is not very efficient.  But we believe
 		 * it is tolerable, because this should be a rare case.
 		 */
 		lle = nd6_get_llentry(ifp, &dst->sin6_addr, family);
 	}
 
 	/* Neither an existing nor a new entry: drop the packet. */
 	if (lle == NULL) {
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* Both lookup paths above hand back the entry write-locked. */
 	LLE_WLOCK_ASSERT(lle);
 
 	/*
 	 * The first time we send a packet to a neighbor whose entry is
 	 * STALE, we have to change the state to DELAY and set a timer to
 	 * expire in DELAY_FIRST_PROBE_TIME seconds, so that neighbor
 	 * unreachability detection is performed on expiration.
 	 * (RFC 2461 7.3.3)
 	 * Child entries are excluded from this transition.
 	 */
 	if ((!(lle->la_flags & LLE_CHILD)) && (lle->ln_state == ND6_LLINFO_STALE))
 		nd6_llinfo_setstate(lle, ND6_LLINFO_DELAY);
 
 	/*
 	 * If the neighbor cache entry has a state other than INCOMPLETE
 	 * (i.e. its link-layer address is already resolved), just
 	 * send the packet.
 	 */
 	if (lle->ln_state > ND6_LLINFO_INCOMPLETE) {
 		/* Pick the bare address or the full prepend data. */
 		if (flags & LLE_ADDRONLY) {
 			lladdr = lle->ll_addr;
 			ll_len = ifp->if_addrlen;
 		} else {
 			lladdr = lle->r_linkdata;
 			ll_len = lle->r_hdrlen;
 		}
 		bcopy(lladdr, desten, ll_len);
 		if (pflags != NULL)
 			*pflags = lle->la_flags;
 		/* Hand the caller its own reference before unlocking. */
 		if (plle) {
 			LLE_ADDREF(lle);
 			*plle = lle;
 		}
 		LLE_WUNLOCK(lle);
 		return (0);
 	}
 
 	/*
 	 * There is a neighbor cache entry, but no ethernet address
 	 * response yet.  Append this latest packet to the end of the
 	 * packet queue in the mbuf.  When it exceeds nd6_maxqueuelen,
 	 * the oldest packet in the queue will be removed.
 	 */
 	if (m != NULL) {
 		size_t dropped;
 
 		dropped = lltable_append_entry_queue(lle, m, V_nd6_maxqueuelen);
 		ICMP6STAT_ADD(icp6s_dropped, dropped);
 	}
 
 	/*
 	 * If there has been no NS for the neighbor after entering the
 	 * INCOMPLETE state, send the first solicitation.
 	 * Note that for newly-created lle la_asked will be 0,
 	 * so we will transition from ND6_LLINFO_NOSTATE to
 	 * ND6_LLINFO_INCOMPLETE state here.
 	 */
 	psrc = NULL;
 	send_ns = 0;
 
 	/*
 	 * If we have child lle, switch to the parent to send NS.
 	 * The child is unlocked before the parent is locked, so the two
 	 * locks are never held together here.
 	 */
 	if (lle->la_flags & LLE_CHILD) {
 		struct llentry *lle_parent = lle->lle_parent;
 		LLE_WUNLOCK(lle);
 		lle = lle_parent;
 		LLE_WLOCK(lle);
 	}
 	if (lle->la_asked == 0) {
 		lle->la_asked++;
 		send_ns = 1;
 		psrc = nd6_llinfo_get_holdsrc(lle, &src);
 
 		nd6_llinfo_setstate(lle, ND6_LLINFO_INCOMPLETE);
 	}
 	/* The solicitation itself is sent only after the entry is unlocked. */
 	LLE_WUNLOCK(lle);
 	if (send_ns != 0)
 		nd6_ns_output(ifp, psrc, NULL, &dst->sin6_addr, NULL);
 
 	return (EWOULDBLOCK);
 }
 
 /*
  * Address-only front end to nd6_resolve_slow(): resolve the link-layer
  * address of @dst on @ifp and store it in the @desten buffer.
  * LLE_ADDRONLY is always added to @flags; no packet is queued and no
  * entry reference is requested.  @pflags is passed through unchanged so
  * the entry flags can be reported when it is non-NULL.
  *
  * Return values:
  * - 0 on success (address copied to buffer).
  * - EWOULDBLOCK (no local error, but address is still unresolved)
  * - other errors (alloc failure, etc)
  */
 int
 nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
     char *desten, uint32_t *pflags)
 {
 	const struct sockaddr_in6 *dst6;
 
 	dst6 = (const struct sockaddr_in6 *)dst;
 	return (nd6_resolve_slow(ifp, AF_INET6, flags | LLE_ADDRONLY, NULL,
 	    dst6, desten, pflags, NULL));
 }
 
 int
 nd6_flush_holdchain(struct ifnet *ifp, struct llentry *lle, struct mbuf *chain)
 {
 	struct mbuf *m, *m_head;
 	struct sockaddr_in6 dst6;
 	int error = 0;
 
 	NET_EPOCH_ASSERT();
 
 	struct route_in6 ro = {
 		.ro_prepend = lle->r_linkdata,
 		.ro_plen = lle->r_hdrlen,
 	};
 
 	lltable_fill_sa_entry(lle, (struct sockaddr *)&dst6);
 	m_head = chain;
 
 	while (m_head) {
 		m = m_head;
 		m_head = m_head->m_nextpkt;
 		m->m_nextpkt = NULL;
 		error = nd6_output_ifp(ifp, ifp, m, &dst6, (struct route *)&ro);
 	}
 
 	/*
 	 * XXX
 	 * note that intermediate errors are blindly ignored
 	 */
 	return (error);
 }
 
 /*
  * Walk the lle_children list of @lle and send out whatever packets each
  * child entry has on hold.  The return value of nd6_flush_holdchain()
  * is not examined.
  */
 __noinline void
 nd6_flush_children_holdchain(struct ifnet *ifp, struct llentry *lle)
 {
 	struct llentry *child;
 	struct mbuf *held;
 
 	NET_EPOCH_ASSERT();
 
 	CK_SLIST_FOREACH(child, &lle->lle_children, lle_child_next) {
 		/* Take the hold queue under the child's lock ... */
 		LLE_WLOCK(child);
 		held = nd6_grab_holdchain(child);
 		LLE_WUNLOCK(child);
 		/* ... and transmit it with the lock dropped. */
 		(void)nd6_flush_holdchain(ifp, child, held);
 	}
 }
 
 /*
  * Report whether a neighbor cache is maintained for @ifp.
  * Returns 1 for the interface types listed below and 0 for all others.
  */
 static int
 nd6_need_cache(struct ifnet *ifp)
 {
 	const int type = ifp->if_type;
 
 	/*
 	 * XXX: a neighbor cache is currently made only for the link
 	 * types tested here.
 	 *
 	 * RFC2893 says:
 	 * - unidirectional tunnels needs no ND
 	 */
 	if (type == IFT_ETHER || type == IFT_IEEE1394 ||
 	    type == IFT_L2VLAN || type == IFT_INFINIBAND ||
 	    type == IFT_BRIDGE || type == IFT_PROPVIRTUAL)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Add a permanent ND6 link-layer record for the given
  * interface address.
  *
  * Very similar to IPv4 arp_ifinit(), but:
  * 1) IPv6 DAD is performed in different place
  * 2) It is called by IPv6 protocol stack in contrast to
  * arp_ifinit() which is typically called in SIOCSIFADDR
  * driver ioctl handler.
  *
  * Returns 0 when the interface type keeps no neighbor cache or when the
  * entry was installed, and ENOBUFS when the entry cannot be allocated.
  */
 int
 nd6_add_ifa_lle(struct in6_ifaddr *ia)
 {
 	struct ifnet *ifp;
 	struct llentry *ln, *ln_tmp;
 	struct sockaddr *dst;
 
 	ifp = ia->ia_ifa.ifa_ifp;
 	/* Nothing to do on interfaces without a neighbor cache. */
 	if (nd6_need_cache(ifp) == 0)
 		return (0);
 
 	dst = (struct sockaddr *)&ia->ia_addr;
 	ln = lltable_alloc_entry(LLTABLE6(ifp), LLE_IFADDR, dst);
 	if (ln == NULL)
 		return (ENOBUFS);
 
 	/* Replace any existing entry for the address under the table lock. */
 	IF_AFDATA_WLOCK(ifp);
 	LLE_WLOCK(ln);
 	/* Unlink any entry if exists */
 	ln_tmp = lla_lookup(LLTABLE6(ifp), LLE_SF(AF_INET6, LLE_EXCLUSIVE), dst);
 	if (ln_tmp != NULL)
 		lltable_unlink_entry(LLTABLE6(ifp), ln_tmp);
 	lltable_link_entry(LLTABLE6(ifp), ln);
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/*
 	 * Event handlers run after the table lock is dropped; the new
 	 * entry is still write-locked at this point.
 	 */
 	if (ln_tmp != NULL)
 		EVENTHANDLER_INVOKE(lle_event, ln_tmp, LLENTRY_EXPIRED);
 	EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 
 	LLE_WUNLOCK(ln);
 	/*
 	 * Dispose of the displaced entry last.
 	 * NOTE(review): presumably lla_lookup() with LLE_EXCLUSIVE returned
 	 * it write-locked and llentry_free() consumes that lock -- confirm.
 	 */
 	if (ln_tmp != NULL)
 		llentry_free(ln_tmp);
 
 	return (0);
 }
 
 /*
  * Removes either all lle entries for given @ia, or lle
  * corresponding to @ia address.
  */
 void
 nd6_rem_ifa_lle(struct in6_ifaddr *ia, int all)
 {
 	struct sockaddr_in6 mask, addr;
 	struct sockaddr *saddr, *smask;
 	struct ifnet *ifp;
 
 	ifp = ia->ia_ifa.ifa_ifp;
 	memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr));
 	memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask));
 	saddr = (struct sockaddr *)&addr;
 	smask = (struct sockaddr *)&mask;
 
 	if (all != 0)
 		lltable_prefix_free(AF_INET6, saddr, smask, LLE_STATIC);
 	else
 		lltable_delete_addr(LLTABLE6(ifp), LLE_IFADDR, saddr);
 }
 
 /*
  * Sysctl handler that exports the RA-derived entries of the prefix list.
  *
  * Write requests are rejected with EPERM.  For every prefix on
  * V_nd_prefix whose ndpr_raf_ra_derived flag is set, one struct
  * in6_prefix is copied out, followed by one struct sockaddr_in6 for each
  * entry on that prefix's ndpr_advrtrs list.  The walk is done with
  * ND6_RLOCK() held.
  *
  * Returns 0 on success, otherwise the first error returned by
  * sysctl_wire_old_buffer() or SYSCTL_OUT().
  */
 static int
 nd6_sysctl_prlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_prefix p;
 	struct sockaddr_in6 s6;
 	struct nd_prefix *pr;
 	struct nd_pfxrouter *pfr;
 	time_t maxexpire;
 	int error;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	/* This node is read-only. */
 	if (req->newptr)
 		return (EPERM);
 
 	/*
 	 * NOTE(review): presumably the output buffer is wired up front so
 	 * that SYSCTL_OUT() can be used while ND6_RLOCK() is held -- confirm.
 	 */
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	/* Fields that stay the same for every record emitted below. */
 	bzero(&p, sizeof(p));
 	p.origin = PR_ORIG_RA;
 	bzero(&s6, sizeof(s6));
 	s6.sin6_family = AF_INET6;
 	s6.sin6_len = sizeof(s6);
 
 	ND6_RLOCK();
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		/* Only prefixes marked as RA-derived are reported. */
 		if (!pr->ndpr_raf_ra_derived)
 			continue;
 		p.prefix = pr->ndpr_prefix;
 		if (sa6_recoverscope(&p.prefix)) {
 			log(LOG_ERR, "scope error in prefix list (%s)\n",
 			    ip6_sprintf(ip6buf, &p.prefix.sin6_addr));
 			/* XXX: press on... */
 		}
 		p.raflags = pr->ndpr_raf;
 		p.prefixlen = pr->ndpr_plen;
 		p.vltime = pr->ndpr_vltime;
 		p.pltime = pr->ndpr_pltime;
 		p.if_index = pr->ndpr_ifp->if_index;
 		/* An infinite valid lifetime is reported as expire == 0. */
 		if (pr->ndpr_vltime == ND6_INFINITE_LIFETIME)
 			p.expire = 0;
 		else {
 			/*
 			 * XXX: we assume time_t is signed.
 			 * maxexpire is all ones with the top bit cleared,
 			 * i.e. the largest positive time_t; it caps the
 			 * reported expiry when the sum would exceed it.
 			 */
 			maxexpire = (-1) &
 			    ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1));
 			/*
 			 * NOTE(review): the (time_second - time_uptime)
 			 * term presumably converts an uptime-based stamp
 			 * to wall-clock time -- confirm.
 			 */
 			if (pr->ndpr_vltime < maxexpire - pr->ndpr_lastupdate)
 				p.expire = pr->ndpr_lastupdate +
 				    pr->ndpr_vltime +
 				    (time_second - time_uptime);
 			else
 				p.expire = maxexpire;
 		}
 		p.refcnt = pr->ndpr_addrcnt;
 		p.flags = pr->ndpr_stateflags;
 		/*
 		 * Count the advertising routers first so the header record
 		 * tells the reader how many sockaddr_in6 entries follow.
 		 */
 		p.advrtrs = 0;
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry)
 			p.advrtrs++;
 		error = SYSCTL_OUT(req, &p, sizeof(p));
 		if (error != 0)
 			break;
 		/* Then emit one address per advertising router. */
 		LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 			s6.sin6_addr = pfr->router->rtaddr;
 			if (sa6_recoverscope(&s6))
 				log(LOG_ERR,
 				    "scope error in prefix list (%s)\n",
 				    ip6_sprintf(ip6buf, &pfr->router->rtaddr));
 			error = SYSCTL_OUT(req, &s6, sizeof(s6));
 			if (error != 0)
 				goto out;
 		}
 	}
 out:
 	/* Both the break and the goto above land here with the lock held. */
 	ND6_RUNLOCK();
 	return (error);
 }
 /* Read-only opaque node whose contents are produced by nd6_sysctl_prlist(). */
 SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_PRLIST, nd6_prlist,
 	CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	NULL, 0, nd6_sysctl_prlist, "S,in6_prefix",
 	"NDP prefix list");
 /* Per-vnet read/write integer backed by the nd6_maxqueuelen variable. */
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXQLEN, nd6_maxqueuelen,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxqueuelen), 1, "");
 /* Per-vnet read/write integer backed by the nd6_gctimer variable. */
 SYSCTL_INT(_net_inet6_icmp6, OID_AUTO, nd6_gctimer,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_gctimer), (60 * 60 * 24), "");
diff --git a/sys/netinet6/nd6.h b/sys/netinet6/nd6.h
index d653a432dbe4..76e91905bf5e 100644
--- a/sys/netinet6/nd6.h
+++ b/sys/netinet6/nd6.h
@@ -1,437 +1,437 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6.h,v 1.76 2001/12/18 02:10:31 itojun Exp $
  * $FreeBSD$
  */
 
 #ifndef _NETINET6_ND6_H_
 #define _NETINET6_ND6_H_
 
 /* see net/route.h, or net/if_inarp.h */
 #ifndef RTF_ANNOUNCE
 #define RTF_ANNOUNCE	RTF_PROTO2
 #endif
 
 #include <sys/queue.h>
 #include <sys/callout.h>
 
 struct llentry;
 
 #define ND6_LLINFO_NOSTATE	-2
 /*
  * We don't need the WAITDELETE state any more, but we keep the definition
  * in a comment line instead of removing it. This is necessary to avoid
  * unintentionally reusing the value for another purpose, which might
  * affect backward compatibility with old applications.
  * (20000711 jinmei@kame.net)
  */
 /* #define ND6_LLINFO_WAITDELETE	-1 */
 #define ND6_LLINFO_INCOMPLETE	0
 #define ND6_LLINFO_REACHABLE	1
 #define ND6_LLINFO_STALE	2
 #define ND6_LLINFO_DELAY	3
 #define ND6_LLINFO_PROBE	4
 
 #define ND6_IS_LLINFO_PROBREACH(n) ((n)->ln_state > ND6_LLINFO_INCOMPLETE)
 #define ND6_LLINFO_PERMANENT(n) (((n)->la_expire == 0) && ((n)->ln_state > ND6_LLINFO_INCOMPLETE))
 
 /*
  * Neighbor Discovery parameters kept for one interface.  The structure
  * is also embedded in struct in6_ndireq, so its layout is visible
  * outside the kernel.
  */
 struct nd_ifinfo {
 	u_int32_t linkmtu;		/* LinkMTU */
 	u_int32_t maxmtu;		/* Upper bound of LinkMTU */
 	u_int32_t basereachable;	/* BaseReachableTime */
 	u_int32_t reachable;		/* Reachable Time */
 	u_int32_t retrans;		/* Retrans Timer */
 	u_int32_t flags;		/* Flags (ND6_IFF_*) */
 	int recalctm;			/* BaseReachable re-calculation timer */
 	u_int8_t chlim;			/* CurHopLimit */
 	u_int8_t initialized; /* Flag to see the entry is initialized */
 	/* the following 3 members are for privacy extension for addrconf */
 	u_int8_t randomseed0[8]; /* upper 64 bits of MD5 digest */
 	u_int8_t randomseed1[8]; /* lower 64 bits (usually the EUI64 IFID) */
 	u_int8_t randomid[8];	/* current random ID */
 };
 
 #define ND6_IFF_PERFORMNUD	0x1
 #define ND6_IFF_ACCEPT_RTADV	0x2
 #define ND6_IFF_PREFER_SOURCE	0x4 /* Not used in FreeBSD. */
 #define ND6_IFF_IFDISABLED	0x8 /* IPv6 operation is disabled due to
 				     * DAD failure.  (XXX: not ND-specific)
 				     */
 #define ND6_IFF_DONT_SET_IFROUTE	0x10
 #define ND6_IFF_AUTO_LINKLOCAL	0x20
 #define	ND6_IFF_NO_RADR		0x40
 #define ND6_IFF_NO_PREFER_IFACE	0x80 /* XXX: not related to ND. */
 #define ND6_IFF_NO_DAD		0x100
 #ifdef EXPERIMENTAL
 /* XXX: not related to ND. */
 #define	ND6_IFF_IPV6_ONLY	0x200 /* draft-ietf-6man-ipv6only-flag */
 #define	ND6_IFF_IPV6_ONLY_MANUAL	0x400
 #define	ND6_IFF_IPV6_ONLY_MASK	(ND6_IFF_IPV6_ONLY|ND6_IFF_IPV6_ONLY_MANUAL)
 #endif
 
 #ifdef _KERNEL
 #define ND_IFINFO(ifp) \
-	(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->nd_ifinfo)
+	(((struct in6_ifextra *)if_getafdata(ifp, AF_INET6))->nd_ifinfo)
 /*
  * Effective IPv6 MTU of (ifp): the ND linkmtu when it is non-zero and
  * below if_mtu; otherwise the ND maxmtu under the same condition;
  * otherwise if_mtu itself.  Note that (ifp) is evaluated several times.
  */
 #define IN6_LINKMTU(ifp) \
 	((ND_IFINFO(ifp)->linkmtu && ND_IFINFO(ifp)->linkmtu < (ifp)->if_mtu) \
 	    ? ND_IFINFO(ifp)->linkmtu \
 	    : ((ND_IFINFO(ifp)->maxmtu && ND_IFINFO(ifp)->maxmtu < (ifp)->if_mtu) \
 		? ND_IFINFO(ifp)->maxmtu : (ifp)->if_mtu))
 #endif
 
 struct in6_nbrinfo {
 	char ifname[IFNAMSIZ];	/* if name, e.g. "en0" */
 	struct in6_addr addr;	/* IPv6 address of the neighbor */
 	long	asked;		/* number of queries already sent for this addr */
 	int	isrouter;	/* if it acts as a router */
 	int	state;		/* reachability state */
 	int	expire;		/* lifetime for NDP state transition */
 };
 
 /* Sysctls, shared with user space. */
 struct	in6_defrouter {
 	struct	sockaddr_in6 rtaddr;
 	u_char	flags;
 	u_short	rtlifetime;
 	u_long	expire;
 	u_short if_index;
 };
 
 /*
  * One prefix record as exported through sysctl.  In the exported stream
  * each record is followed by 'advrtrs' struct sockaddr_in6 entries, one
  * per advertising router.
  */
 struct in6_prefix {
 	struct	sockaddr_in6 prefix;
 	struct prf_ra raflags;
 	u_char	prefixlen;
 	u_char	origin;
 	u_int32_t vltime;	/* valid lifetime */
 	u_int32_t pltime;	/* preferred lifetime */
 	time_t expire;		/* 0 when the valid lifetime is infinite */
 	u_int32_t flags;
 	int refcnt;
 	u_short if_index;
 	u_short advrtrs; /* number of advertisement routers */
 	/* struct sockaddr_in6 advrtr[] */
 };
 
 #ifdef _KERNEL
 /*
  * Variant of struct in6_ndireq whose 'ndi' payload differs from struct
  * nd_ifinfo: it has no 'initialized' flag or privacy-extension fields
  * and carries a 'receivedra' byte instead.
  * NOTE(review): presumably the older request layout kept for
  * compatibility -- confirm against the ioctl handler.
  */
 struct	in6_ondireq {
 	char ifname[IFNAMSIZ];
 	struct {
 		u_int32_t linkmtu;	/* LinkMTU */
 		u_int32_t maxmtu;	/* Upper bound of LinkMTU */
 		u_int32_t basereachable; /* BaseReachableTime */
 		u_int32_t reachable;	/* Reachable Time */
 		u_int32_t retrans;	/* Retrans Timer */
 		u_int32_t flags;	/* Flags */
 		int recalctm;		/* BaseReachable re-calculation timer */
 		u_int8_t chlim;		/* CurHopLimit */
 		u_int8_t receivedra;
 	} ndi;
 };
 #endif
 
 struct	in6_ndireq {
 	char ifname[IFNAMSIZ];
 	struct nd_ifinfo ndi;
 };
 
 struct	in6_ndifreq {
 	char ifname[IFNAMSIZ];
 	u_long ifindex;
 };
 
 /* Prefix status */
 #define NDPRF_ONLINK		0x1
 #define NDPRF_DETACHED		0x2
 
 /* protocol constants */
 #define MAX_RTR_SOLICITATION_DELAY	1	/* 1sec */
 #define RTR_SOLICITATION_INTERVAL	4	/* 4sec */
 #define MAX_RTR_SOLICITATIONS		3
 
 #define ND6_INFINITE_LIFETIME		0xffffffff
 
 #ifdef _KERNEL
 /* node constants */
 #define MAX_REACHABLE_TIME		3600000	/* msec */
 #define REACHABLE_TIME			30000	/* msec */
 #define RETRANS_TIMER			1000	/* msec */
 #define MIN_RANDOM_FACTOR		512	/* 1024 * 0.5 */
 #define MAX_RANDOM_FACTOR		1536	/* 1024 * 1.5 */
 #define DEF_TEMP_VALID_LIFETIME		604800	/* 1 week */
 #define DEF_TEMP_PREFERRED_LIFETIME	86400	/* 1 day */
 #define TEMPADDR_REGEN_ADVANCE		5	/* sec */
 #define MAX_TEMP_DESYNC_FACTOR		600	/* 10 min */
 /*
  * Randomize (x): scale (x) >> 10 by MIN_RANDOM_FACTOR, add arc4random()
  * masked with the scaled MIN..MAX factor span, and divide by 1000.
  * NOTE(review): (x) is presumably a base reachable time in msec and the
  * result a reachable time in seconds -- confirm with the callers.
  *
  * The argument is parenthesized so that expressions with operators of
  * lower precedence than '>>' (e.g. '|' or '?:') are shifted as a whole.
  * (x) is still evaluated twice, so it must be free of side effects.
  */
 #define ND_COMPUTE_RTIME(x) \
 		(((MIN_RANDOM_FACTOR * ((x) >> 10)) + (arc4random() & \
 		((MAX_RANDOM_FACTOR - MIN_RANDOM_FACTOR) * ((x) >> 10)))) / 1000)
 
 /*
  * Kernel record for one default router.  Records are chained through
  * dr_entry; struct nd_pfxrouter entries point at them.
  */
 struct nd_defrouter {
 	TAILQ_ENTRY(nd_defrouter) dr_entry;
 	struct in6_addr rtaddr;	/* address of the router */
 	u_char	raflags;	/* flags on RA message */
 	u_short	rtlifetime;
 	u_long	expire;
 	struct ifnet *ifp;
 	int	installed;	/* is installed into kernel routing table */
 	u_int	refcnt;		/* presumably dropped by defrouter_rele() */
 };
 
 struct nd_prefixctl {
 	struct ifnet *ndpr_ifp;
 
 	/* prefix */
 	struct sockaddr_in6 ndpr_prefix;
 	u_char	ndpr_plen;
 
 	u_int32_t ndpr_vltime;	/* advertised valid lifetime */
 	u_int32_t ndpr_pltime;	/* advertised preferred lifetime */
 
 	struct prf_ra ndpr_flags;
 };
 
 LIST_HEAD(nd_prhead, nd_prefix);
 /*
  * Kernel state for one prefix.  Entries are chained through ndpr_entry
  * on a struct nd_prhead list (the per-vnet list is V_nd_prefix).
  */
 struct nd_prefix {
 	struct ifnet *ndpr_ifp;
 	LIST_ENTRY(nd_prefix) ndpr_entry;
 	struct sockaddr_in6 ndpr_prefix;	/* prefix */
 	struct in6_addr ndpr_mask; /* netmask derived from the prefix */
 
 	u_int32_t ndpr_vltime;	/* advertised valid lifetime */
 	u_int32_t ndpr_pltime;	/* advertised preferred lifetime */
 
 	time_t ndpr_expire;	/* expiration time of the prefix */
 	time_t ndpr_preferred;	/* preferred time of the prefix */
 	time_t ndpr_lastupdate; /* reception time of last advertisement */
 
 	struct prf_ra ndpr_flags;	/* accessed via the ndpr_raf* macros */
 	u_int32_t ndpr_stateflags; /* actual state flags */
 	/* list of routers that advertise the prefix: */
 	LIST_HEAD(pr_rtrhead, nd_pfxrouter) ndpr_advrtrs;
 	u_char	ndpr_plen;
 	int	ndpr_addrcnt;	/* count of derived addresses */
 	/* presumably managed by nd6_prefix_ref()/nd6_prefix_rele() */
 	volatile u_int ndpr_refcnt;
 };
 
 #define ndpr_raf		ndpr_flags
 #define ndpr_raf_onlink		ndpr_flags.onlink
 #define ndpr_raf_auto		ndpr_flags.autonomous
 #define ndpr_raf_ra_derived	ndpr_flags.ra_derived
 #define ndpr_raf_router		ndpr_flags.router
 
 /*
  * Element of a prefix's ndpr_advrtrs list: ties the prefix to one
  * default router that advertises it.
  */
 struct nd_pfxrouter {
 	LIST_ENTRY(nd_pfxrouter) pfr_entry;
 	struct nd_defrouter *router;	/* the advertising router */
 };
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_IP6NDP);
 #endif
 
 /* nd6.c */
 VNET_DECLARE(int, nd6_prune);
 VNET_DECLARE(int, nd6_delay);
 VNET_DECLARE(int, nd6_umaxtries);
 VNET_DECLARE(int, nd6_mmaxtries);
 VNET_DECLARE(int, nd6_useloopback);
 VNET_DECLARE(int, nd6_maxnudhint);
 VNET_DECLARE(int, nd6_gctimer);
 VNET_DECLARE(struct nd_prhead, nd_prefix);
 VNET_DECLARE(int, nd6_debug);
 VNET_DECLARE(int, nd6_onlink_ns_rfc4861);
 #define	V_nd6_prune			VNET(nd6_prune)
 #define	V_nd6_delay			VNET(nd6_delay)
 #define	V_nd6_umaxtries			VNET(nd6_umaxtries)
 #define	V_nd6_mmaxtries			VNET(nd6_mmaxtries)
 #define	V_nd6_useloopback		VNET(nd6_useloopback)
 #define	V_nd6_maxnudhint		VNET(nd6_maxnudhint)
 #define	V_nd6_gctimer			VNET(nd6_gctimer)
 #define	V_nd_prefix			VNET(nd_prefix)
 #define	V_nd6_debug			VNET(nd6_debug)
 #define	V_nd6_onlink_ns_rfc4861		VNET(nd6_onlink_ns_rfc4861)
 
 /* Lock for the prefix and default router lists. */
 VNET_DECLARE(struct rwlock, nd6_lock);
 VNET_DECLARE(uint64_t, nd6_list_genid);
 #define	V_nd6_lock			VNET(nd6_lock)
 #define	V_nd6_list_genid		VNET(nd6_list_genid)
 
 #define	ND6_RLOCK()			rw_rlock(&V_nd6_lock)
 #define	ND6_RUNLOCK()			rw_runlock(&V_nd6_lock)
 #define	ND6_WLOCK()			rw_wlock(&V_nd6_lock)
 #define	ND6_WUNLOCK()			rw_wunlock(&V_nd6_lock)
 #define	ND6_TRY_UPGRADE()		rw_try_upgrade(&V_nd6_lock)
 #define	ND6_WLOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_WLOCKED)
 #define	ND6_RLOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_RLOCKED)
 #define	ND6_LOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_LOCKED)
 #define	ND6_UNLOCK_ASSERT()		rw_assert(&V_nd6_lock, RA_UNLOCKED)
 
 /* Mutex for prefix onlink/offlink transitions. */
 VNET_DECLARE(struct mtx, nd6_onlink_mtx);
 #define	V_nd6_onlink_mtx		VNET(nd6_onlink_mtx)
 
 #define	ND6_ONLINK_LOCK()		mtx_lock(&V_nd6_onlink_mtx)
 #define	ND6_ONLINK_TRYLOCK()		mtx_trylock(&V_nd6_onlink_mtx)
 #define	ND6_ONLINK_UNLOCK()		mtx_unlock(&V_nd6_onlink_mtx)
 #define	ND6_ONLINK_LOCK_ASSERT()	mtx_assert(&V_nd6_onlink_mtx, MA_OWNED)
 #define	ND6_ONLINK_UNLOCK_ASSERT()	mtx_assert(&V_nd6_onlink_mtx, MA_NOTOWNED)
 
 #define nd6log(x)	do { if (V_nd6_debug) log x; } while (/*CONSTCOND*/ 0)
 
 /* nd6_rtr.c */
 VNET_DECLARE(int, nd6_defifindex);
 VNET_DECLARE(int, ip6_desync_factor);	/* seconds */
 VNET_DECLARE(u_int32_t, ip6_temp_preferred_lifetime); /* seconds */
 VNET_DECLARE(u_int32_t, ip6_temp_valid_lifetime); /* seconds */
 VNET_DECLARE(int, ip6_temp_regen_advance); /* seconds */
 #define	V_nd6_defifindex		VNET(nd6_defifindex)
 #define	V_ip6_desync_factor		VNET(ip6_desync_factor)
 #define	V_ip6_temp_preferred_lifetime	VNET(ip6_temp_preferred_lifetime)
 #define	V_ip6_temp_valid_lifetime	VNET(ip6_temp_valid_lifetime)
 #define	V_ip6_temp_regen_advance	VNET(ip6_temp_regen_advance)
 
 /*
  * Pointers to the options found in an ND message.  The same storage can
  * be reached by index through nd_opt_array[] or by name through
  * nd_opt_each: the first sixteen members of nd_opt_each (zero through
  * __res15) line up with nd_opt_array[0..15].  The members after
  * __res15 (search, last, done, pi_end) have no array counterpart.
  * The nd_opts_* macros that follow are shorthands for the named members.
  */
 union nd_opts {
 	struct nd_opt_hdr *nd_opt_array[16];	/* max = ND_OPT_NONCE */
 	struct {
 		struct nd_opt_hdr *zero;
 		struct nd_opt_hdr *src_lladdr;
 		struct nd_opt_hdr *tgt_lladdr;
 		struct nd_opt_prefix_info *pi_beg; /* multiple opts, start */
 		struct nd_opt_rd_hdr *rh;
 		struct nd_opt_mtu *mtu;
 		struct nd_opt_hdr *__res6;
 		struct nd_opt_hdr *__res7;
 		struct nd_opt_hdr *__res8;
 		struct nd_opt_hdr *__res9;
 		struct nd_opt_hdr *__res10;
 		struct nd_opt_hdr *__res11;
 		struct nd_opt_hdr *__res12;
 		struct nd_opt_hdr *__res13;
 		struct nd_opt_nonce *nonce;
 		struct nd_opt_hdr *__res15;
 		struct nd_opt_hdr *search;	/* multiple opts */
 		struct nd_opt_hdr *last;	/* multiple opts */
 		int done;
 		struct nd_opt_prefix_info *pi_end;/* multiple opts, end */
 	} nd_opt_each;
 };
 #define nd_opts_src_lladdr	nd_opt_each.src_lladdr
 #define nd_opts_tgt_lladdr	nd_opt_each.tgt_lladdr
 #define nd_opts_pi		nd_opt_each.pi_beg
 #define nd_opts_pi_end		nd_opt_each.pi_end
 #define nd_opts_rh		nd_opt_each.rh
 #define nd_opts_mtu		nd_opt_each.mtu
 #define nd_opts_nonce		nd_opt_each.nonce
 #define nd_opts_search		nd_opt_each.search
 #define nd_opts_last		nd_opt_each.last
 #define nd_opts_done		nd_opt_each.done
 
 /* XXX: need nd6_var.h?? */
 /* nd6.c */
 void nd6_init(void);
 #ifdef VIMAGE
 void nd6_destroy(void);
 #endif
 struct nd_ifinfo *nd6_ifattach(struct ifnet *);
 void nd6_ifdetach(struct ifnet *, struct nd_ifinfo *);
 int nd6_is_addr_neighbor(const struct sockaddr_in6 *, struct ifnet *);
 void nd6_option_init(void *, int, union nd_opts *);
 struct nd_opt_hdr *nd6_option(union nd_opts *);
 int nd6_options(union nd_opts *);
 struct llentry *nd6_lookup(const struct in6_addr *, int, struct ifnet *);
 void nd6_setmtu(struct ifnet *);
 void nd6_llinfo_setstate(struct llentry *lle, int newstate);
 void nd6_timer(void *);
 void nd6_purge(struct ifnet *);
 int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst,
     char *desten, uint32_t *pflags);
 int nd6_resolve(struct ifnet *, int, struct mbuf *,
     const struct sockaddr *, u_char *, uint32_t *, struct llentry **);
 int nd6_ioctl(u_long, caddr_t, struct ifnet *);
 void nd6_cache_lladdr(struct ifnet *, struct in6_addr *,
 	char *, int, int, int);
 bool nd6_try_set_entry_addr(struct ifnet *ifp, struct llentry *lle, char *lladdr);
 struct mbuf *nd6_grab_holdchain(struct llentry *);
 int nd6_flush_holdchain(struct ifnet *, struct llentry *, struct mbuf *);
 void nd6_flush_children_holdchain(struct ifnet *, struct llentry *);
 int nd6_add_ifa_lle(struct in6_ifaddr *);
 void nd6_rem_ifa_lle(struct in6_ifaddr *, int);
 int nd6_output_ifp(struct ifnet *, struct ifnet *, struct mbuf *,
     struct sockaddr_in6 *, struct route *);
 
 struct rib_head;
 struct rib_cmd_info;
 void nd6_subscription_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
     void *arg);
 
 /* nd6_nbr.c */
 void nd6_na_input(struct mbuf *, int, int);
 void nd6_na_output(struct ifnet *, const struct in6_addr *,
 	const struct in6_addr *, u_long, int, struct sockaddr *);
 void nd6_ns_input(struct mbuf *, int, int);
 void nd6_ns_output(struct ifnet *, const struct in6_addr *,
 	const struct in6_addr *, const struct in6_addr *, uint8_t *);
 caddr_t nd6_ifptomac(struct ifnet *);
 void nd6_dad_init(void);
 void nd6_dad_start(struct ifaddr *, int);
 void nd6_dad_stop(struct ifaddr *);
 
 /* nd6_rtr.c */
 void nd6_rs_input(struct mbuf *, int, int);
 void nd6_ra_input(struct mbuf *, int, int);
 void nd6_ifnet_link_event(void *, struct ifnet *, int);
 struct nd_defrouter *defrouter_lookup(const struct in6_addr *, struct ifnet *);
 struct nd_defrouter *defrouter_lookup_locked(const struct in6_addr *,
     struct ifnet *);
 void defrouter_reset(void);
 void defrouter_select_fib(int fibnum);
 void defrouter_rele(struct nd_defrouter *);
 bool defrouter_remove(struct in6_addr *, struct ifnet *);
 bool nd6_defrouter_list_empty(void);
 void nd6_defrouter_flush_all(void);
 void nd6_defrouter_purge(struct ifnet *);
 void nd6_defrouter_timer(void);
 void nd6_defrouter_init(void);
 int nd6_prelist_add(struct nd_prefixctl *, struct nd_defrouter *,
     struct nd_prefix **);
 void nd6_prefix_unlink(struct nd_prefix *, struct nd_prhead *);
 void nd6_prefix_del(struct nd_prefix *);
 void nd6_prefix_ref(struct nd_prefix *);
 void nd6_prefix_rele(struct nd_prefix *);
 int nd6_prefix_offlink(struct nd_prefix *);
 void pfxlist_onlink_check(void);
 struct nd_prefix *nd6_prefix_lookup(struct nd_prefixctl *);
 void rt6_flush(struct in6_addr *, struct ifnet *);
 int nd6_setdefaultiface(int);
 int in6_tmpifadd(const struct in6_ifaddr *, int, int);
 
 #endif /* _KERNEL */
 
 #endif /* _NETINET6_ND6_H_ */
diff --git a/sys/netinet6/nd6_nbr.c b/sys/netinet6/nd6_nbr.c
index cd7119c9ccbc..a4fb5f75fa93 100644
--- a/sys/netinet6/nd6_nbr.c
+++ b/sys/netinet6/nd6_nbr.c
@@ -1,1613 +1,1614 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6_nbr.c,v 1.86 2002/01/21 02:33:04 jinmei Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/libkern.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/errno.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
 #include <sys/callout.h>
 #include <sys/refcount.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <net/if_llatbl.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_carp.h>
 #include <netinet6/send.h>
 
 /*
  * View a generic sockaddr pointer as a link-level sockaddr.  The argument
  * is parenthesized so that an expression argument such as "p + 1" is cast
  * as a whole instead of the cast binding to its first operand only.
  */
 #define SDL(s) ((struct sockaddr_dl *)(s))
 
 /* Forward declaration; struct dadq is defined further down in this file. */
 struct dadq;
 static struct dadq *nd6_dad_find(struct ifaddr *, struct nd_opt_nonce *);
 static void nd6_dad_add(struct dadq *dp);
 static void nd6_dad_del(struct dadq *dp);
 static void nd6_dad_rele(struct dadq *);
 static void nd6_dad_starttimer(struct dadq *, int);
 static void nd6_dad_stoptimer(struct dadq *);
 static void nd6_dad_timer(void *);
 static void nd6_dad_duplicated(struct ifaddr *, struct dadq *);
 static void nd6_dad_ns_output(struct dadq *);
 static void nd6_dad_ns_input(struct ifaddr *, struct nd_opt_nonce *);
 static void nd6_dad_na_input(struct ifaddr *);
 /*
  * FIB-aware NA/NS senders.  The trailing u_int is the FIB number that is
  * stamped on the outgoing mbuf.
  */
 static void nd6_na_output_fib(struct ifnet *, const struct in6_addr *,
     const struct in6_addr *, u_long, int, struct sockaddr *, u_int);
 static void nd6_ns_output_fib(struct ifnet *, const struct in6_addr *,
     const struct in6_addr *, const struct in6_addr *, uint8_t *, u_int);
 
 /*
  * Looks up a published (proxy) neighbor entry for an address and fills in
  * the sockaddr_dl to advertise for it.
  */
 static struct ifaddr *nd6_proxy_fill_sdl(struct ifnet *,
     const struct in6_addr *, struct sockaddr_dl *);
 
 /* Per-VNET switch for Enhanced DAD; initialized to 1 (enabled). */
 VNET_DEFINE_STATIC(int, dad_enhanced) = 1;
 #define	V_dad_enhanced			VNET(dad_enhanced)
 
 /* Exported read-write as net.inet6.ip6.dad_enhanced. */
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_INT(_net_inet6_ip6, OID_AUTO, dad_enhanced, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(dad_enhanced), 0,
     "Enable Enhanced DAD, which adds a random nonce to NS messages for DAD.");
 
 VNET_DEFINE_STATIC(int, dad_maxtry) = 15;	/* max # of *tries* to
 						   transmit DAD packet */
 #define	V_dad_maxtry			VNET(dad_maxtry)
 
 /*
  * Input a Neighbor Solicitation Message.
  *
  * Based on RFC 2461
  * Based on RFC 2462 (duplicate address detection)
  *
  *        m - received packet; the receiving interface is taken from
  *            m->m_pkthdr.rcvif.
  *      off - byte offset from the start of the IPv6 header to the NS
  *            message.
  * icmp6len - length of the ICMPv6 message that starts at off, i.e. the
  *            fixed NS header plus any ND options.
  *
  * Every exit path that still holds the mbuf releases it with m_freem(),
  * and an interface address reference obtained along the way is dropped
  * with ifa_free() before returning.
  */
 void
 nd6_ns_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct nd_neighbor_solicit *nd_ns;
 	struct in6_addr daddr6, myaddr6, saddr6, taddr6;
 	struct ifaddr *ifa;
 	struct sockaddr_dl proxydl;
 	union nd_opts ndopts;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	char *lladdr;
 	int anycast, lladdrlen, proxy, rflag, tentative, tlladdr;
 
 	/* The cleanup labels test ifa, so it must be valid from the start. */
 	ifa = NULL;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	ifp = m->m_pkthdr.rcvif;
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (__predict_false(ip6->ip6_hlim != 255)) {
 		ICMP6STAT_INC(icp6s_invlhlim);
 		nd6log((LOG_ERR,
 		    "nd6_ns_input: invalid hlim (%d) from %s to %s on %s\n",
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		/*
 		 * saddr6/daddr6/taddr6 are not assigned yet, so use the
 		 * label that skips the address logging done under "bad".
 		 */
 		goto bads;
 	}
 
 	/* Make the whole NS message contiguous before looking into it. */
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	/* Re-read the header pointer; m may have changed above. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_ns = (struct nd_neighbor_solicit *)((caddr_t)ip6 + off);
 
 	saddr6 = ip6->ip6_src;
 	daddr6 = ip6->ip6_dst;
 	taddr6 = nd_ns->nd_ns_target;
 	if (in6_setscope(&taddr6, ifp, NULL) != 0)
 		goto bad;
 
 	/*
 	 * Advertise the router flag when forwarding is enabled, unless the
 	 * interface accepts RAs and V_ip6_norbit_raif is set.
 	 */
 	rflag = (V_ip6_forwarding) ? ND_NA_FLAG_ROUTER : 0;
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV && V_ip6_norbit_raif)
 		rflag = 0;
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
 		/* dst has to be a solicited node multicast address. */
 		if (daddr6.s6_addr16[0] == IPV6_ADDR_INT16_MLL &&
 		    /* don't check ifindex portion */
 		    daddr6.s6_addr32[1] == 0 &&
 		    daddr6.s6_addr32[2] == IPV6_ADDR_INT32_ONE &&
 		    daddr6.s6_addr8[12] == 0xff) {
 			; /* good */
 		} else {
 			nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
 			    "(wrong ip6 dst)\n"));
 			goto bad;
 		}
 	} else if (!V_nd6_onlink_ns_rfc4861) {
 		struct sockaddr_in6 src_sa6;
 
 		/*
 		 * According to recent IETF discussions, it is not a good idea
 		 * to accept a NS from an address which would not be deemed
 		 * to be a neighbor otherwise.  This point is expected to be
 		 * clarified in future revisions of the specification.
 		 */
 		bzero(&src_sa6, sizeof(src_sa6));
 		src_sa6.sin6_family = AF_INET6;
 		src_sa6.sin6_len = sizeof(src_sa6);
 		src_sa6.sin6_addr = saddr6;
 		if (nd6_is_addr_neighbor(&src_sa6, ifp) == 0) {
 			nd6log((LOG_INFO, "nd6_ns_input: "
 				"NS packet from non-neighbor\n"));
 			goto bad;
 		}
 	}
 
 	if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
 		nd6log((LOG_INFO, "nd6_ns_input: bad NS target (multicast)\n"));
 		goto bad;
 	}
 
 	/* What follows the fixed NS header is the ND option area. */
 	icmp6len -= sizeof(*nd_ns);
 	nd6_option_init(nd_ns + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "nd6_ns_input: invalid ND option, ignored\n"));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	lladdr = NULL;
 	lladdrlen = 0;
 	if (ndopts.nd_opts_src_lladdr) {
 		/* The address follows the option header; length is in 8-byte units. */
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && lladdr) {
 		nd6log((LOG_INFO, "nd6_ns_input: bad DAD packet "
 		    "(link-layer address option)\n"));
 		goto bad;
 	}
 
 	/*
 	 * Attaching target link-layer address to the NA?
 	 * (RFC 2461 7.2.4)
 	 *
 	 * NS IP dst is unicast/anycast			MUST NOT add
 	 * NS IP dst is solicited-node multicast	MUST add
 	 *
 	 * In implementation, we add target link-layer address by default.
 	 * We do not add one in MUST NOT cases.
 	 */
 	if (!IN6_IS_ADDR_MULTICAST(&daddr6))
 		tlladdr = 0;
 	else
 		tlladdr = 1;
 
 	/*
 	 * Target address (taddr6) must be either:
 	 * (1) Valid unicast/anycast address for my receiving interface,
 	 * (2) Unicast address for which I'm offering proxy service, or
 	 * (3) "tentative" address on which DAD is being performed.
 	 */
 	/* (1) and (3) check. */
 	if (ifp->if_carp)
 		ifa = (*carp_iamatch6_p)(ifp, &taddr6);
 	else
 		ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
 
 	/* (2) check. */
 	proxy = 0;
 	if (ifa == NULL) {
 		if ((ifa = nd6_proxy_fill_sdl(ifp, &taddr6, &proxydl)) != NULL)
 			proxy = 1;
 	}
 	if (ifa == NULL) {
 		/*
 		 * We've got an NS packet, and we don't have that address
 		 * assigned for us.  We MUST silently ignore it.
 		 * See RFC2461 7.2.3.
 		 */
 		goto freeit;
 	}
 	myaddr6 = *IFA_IN6(ifa);
 	anycast = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST;
 	tentative = ((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE;
 	if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DUPLICATED)
 		goto freeit;
 
 	/*
 	 * The option length must equal the 2-byte option header plus the
 	 * interface's address length, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "nd6_ns_input: lladdrlen mismatch for %s "
 		    "(if %d, NS packet %d)\n",
 		    ip6_sprintf(ip6bufs, &taddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	/* The sender claims to use the very address we matched above. */
 	if (IN6_ARE_ADDR_EQUAL(&myaddr6, &saddr6)) {
 		nd6log((LOG_INFO, "nd6_ns_input: duplicate IP6 address %s\n",
 		    ip6_sprintf(ip6bufs, &saddr6)));
 		goto freeit;
 	}
 
 	/*
 	 * We have neighbor solicitation packet, with target address equals to
 	 * one of my tentative address.
 	 *
 	 * src addr	how to process?
 	 * ---		---
 	 * multicast	of course, invalid (rejected in ip6_input)
 	 * unicast	somebody is doing address resolution -> ignore
 	 * unspec	dup address detection
 	 *
 	 * The processing is defined in RFC 2462.
 	 */
 	if (tentative) {
 		/*
 		 * If source address is unspecified address, it is for
 		 * duplicate address detection.
 		 *
 		 * If not, the packet is for address resolution;
 		 * silently ignore it.
 		 */
 		if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
 			nd6_dad_ns_input(ifa, ndopts.nd_opts_nonce);
 
 		goto freeit;
 	}
 
 	/*
 	 * If the source address is unspecified address, entries must not
 	 * be created or updated.
 	 * It looks that sender is performing DAD.  Output NA toward
 	 * all-node multicast address, to tell the sender that I'm using
 	 * the address.
 	 * S bit ("solicited") must be zero.
 	 */
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6)) {
 		struct in6_addr in6_all;
 
 		in6_all = in6addr_linklocal_allnodes;
 		if (in6_setscope(&in6_all, ifp, NULL) != 0)
 			goto bad;
 		nd6_na_output_fib(ifp, &in6_all, &taddr6,
 		    ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
 		    rflag, tlladdr, proxy ? (struct sockaddr *)&proxydl : NULL,
 		    M_GETFIB(m));
 		goto freeit;
 	}
 
 	/* Regular address resolution: record the sender's link-layer address. */
 	nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen,
 	    ND_NEIGHBOR_SOLICIT, 0);
 
 	/* Answer with a solicited NA, reusing the FIB of the received packet. */
 	nd6_na_output_fib(ifp, &saddr6, &taddr6,
 	    ((anycast || proxy || !tlladdr) ? 0 : ND_NA_FLAG_OVERRIDE) |
 	    rflag | ND_NA_FLAG_SOLICITED, tlladdr,
 	    proxy ? (struct sockaddr *)&proxydl : NULL, M_GETFIB(m));
 	/* Silent discard (also the normal completion path): no error counter. */
  freeit:
 	if (ifa != NULL)
 		ifa_free(ifa);
 	m_freem(m);
 	return;
 
 	/* Malformed NS: log the addresses, then count it as a bad NS. */
  bad:
 	nd6log((LOG_ERR, "nd6_ns_input: src=%s\n",
 		ip6_sprintf(ip6bufs, &saddr6)));
 	nd6log((LOG_ERR, "nd6_ns_input: dst=%s\n",
 		ip6_sprintf(ip6bufs, &daddr6)));
 	nd6log((LOG_ERR, "nd6_ns_input: tgt=%s\n",
 		ip6_sprintf(ip6bufs, &taddr6)));
 	/* Entry point for errors detected before the addresses were copied. */
  bads:
 	ICMP6STAT_INC(icp6s_badns);
 	if (ifa != NULL)
 		ifa_free(ifa);
 	m_freem(m);
 }
 
 /*
  * Proxy lookup helper for NS input.
  *
  * Looks up the neighbor cache entry for taddr6 on ifp.  When the entry is
  * both published (LLE_PUB) and valid (LLE_VALID), *sdl is initialized for
  * ifp and loaded with the entry's link-layer address, and the result of
  * in6ifa_ifpforlinklocal() for ifp is returned.  In every other case NULL
  * is returned and *sdl is left untouched.  The entry's read lock is
  * released before the link-local address lookup.
  */
 static struct ifaddr *
 nd6_proxy_fill_sdl(struct ifnet *ifp, const struct in6_addr *taddr6,
     struct sockaddr_dl *sdl)
 {
 	const int wanted = LLE_PUB | LLE_VALID;
 	struct llentry *lle;
 	int is_proxy;
 
 	lle = nd6_lookup(taddr6, LLE_SF(AF_INET6, 0), ifp);
 	if (lle == NULL)
 		return (NULL);
 
 	is_proxy = ((lle->la_flags & wanted) == wanted);
 	if (is_proxy) {
 		/* Copy the published address out while the entry is locked. */
 		link_init_sdl(ifp, (struct sockaddr *)sdl, ifp->if_type);
 		sdl->sdl_alen = ifp->if_addrlen;
 		bcopy(lle->ll_addr, &sdl->sdl_data, ifp->if_addrlen);
 	}
 	LLE_RUNLOCK(lle);
 
 	if (!is_proxy)
 		return (NULL);
 
 	return ((struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
 	    IN6_IFF_NOTREADY|IN6_IFF_ANYCAST));
 }
 
 /*
  * Output a Neighbor Solicitation Message. Caller specifies:
  *	- ICMP6 header source IP6 address
  *	- ND6 header target IP6 address
  *	- ND6 header source datalink address
  *
  * Based on RFC 2461
  * Based on RFC 2462 (duplicate address detection)
  *
  *    ifp - outgoing interface
  * saddr6 - preferred source address, may be NULL.  It is used only when
  *          it is assigned to ifp; otherwise a source is selected for the
  *          destination.  Ignored when nonce is non-NULL.
  * daddr6 - destination address; NULL means the solicited-node multicast
  *          address derived from taddr6.
  * taddr6 - target address.  Nothing is sent for a multicast target.
  *  nonce - If non-NULL, NS is used for duplicate address detection and
  *          the value (length is ND_OPT_NONCE_LEN) is used as a random nonce.
  * fibnum - FIB number set on the mbuf and used for source selection
  *
  * The function returns nothing; on any failure the packet is dropped.
  */
 static void
 nd6_ns_output_fib(struct ifnet *ifp, const struct in6_addr *saddr6,
     const struct in6_addr *daddr6, const struct in6_addr *taddr6,
     uint8_t *nonce, u_int fibnum)
 {
 	struct mbuf *m;
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;
 	struct nd_neighbor_solicit *nd_ns;
 	struct ip6_moptions im6o;
 	int icmp6len;
 	int maxlen;
 
 	NET_EPOCH_ASSERT();
 
 	if (IN6_IS_ADDR_MULTICAST(taddr6))
 		return;
 
 	/* estimate the size of message */
 	maxlen = sizeof(*ip6) + sizeof(*nd_ns);
 	maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
 	KASSERT(max_linkhdr + maxlen <= MCLBYTES, (
 	    "%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)",
 	    __func__, max_linkhdr, maxlen, MCLBYTES));
 
 	/* Use a cluster only when the packet does not fit a header mbuf. */
 	if (max_linkhdr + maxlen > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return;
 	M_SETFIB(m, fibnum);
 
 	/*
 	 * A NULL daddr6 is turned into a multicast destination below, so it
 	 * gets the multicast options as well.
 	 */
 	if (daddr6 == NULL || IN6_IS_ADDR_MULTICAST(daddr6)) {
 		m->m_flags |= M_MCAST;
 		im6o.im6o_multicast_ifp = ifp;
 		im6o.im6o_multicast_hlim = 255;
 		im6o.im6o_multicast_loop = 0;
 	}
 
 	icmp6len = sizeof(*nd_ns);
 	m->m_pkthdr.len = m->m_len = sizeof(*ip6) + icmp6len;
 	m->m_data += max_linkhdr;	/* or M_ALIGN() equivalent? */
 
 	/* fill neighbor solicitation packet */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	/* ip6->ip6_plen will be set later */
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	if (daddr6)
 		ip6->ip6_dst = *daddr6;
 	else {
 		/*
 		 * Build the destination from the solicited-node multicast
 		 * constants plus the last 32 bits of the target, with byte
 		 * 12 forced to 0xff.
 		 */
 		ip6->ip6_dst.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
 		ip6->ip6_dst.s6_addr16[1] = 0;
 		ip6->ip6_dst.s6_addr32[1] = 0;
 		ip6->ip6_dst.s6_addr32[2] = IPV6_ADDR_INT32_ONE;
 		ip6->ip6_dst.s6_addr32[3] = taddr6->s6_addr32[3];
 		ip6->ip6_dst.s6_addr8[12] = 0xff;
 		if (in6_setscope(&ip6->ip6_dst, ifp, NULL) != 0)
 			goto bad;
 	}
 	if (nonce == NULL) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		struct ifaddr *ifa = NULL;
 
 		/*
 		 * RFC2461 7.2.2:
 		 * "If the source address of the packet prompting the
 		 * solicitation is the same as one of the addresses assigned
 		 * to the outgoing interface, that address SHOULD be placed
 		 * in the IP Source Address of the outgoing solicitation.
 		 * Otherwise, any one of the addresses assigned to the
 		 * interface should be used."
 		 *
 		 * We use the source address for the prompting packet
 		 * (saddr6), if saddr6 belongs to the outgoing interface.
 		 * Otherwise, we perform the source address selection as usual.
 		 */
 		if (saddr6 != NULL)
 			ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, saddr6);
 		if (ifa == NULL) {
 			int error;
 			struct in6_addr dst6, src6;
 			uint32_t scopeid;
 
 			in6_splitscope(&ip6->ip6_dst, &dst6, &scopeid);
 			error = in6_selectsrc_addr(fibnum, &dst6,
 			    scopeid, ifp, &src6, NULL);
 			if (error) {
 				nd6log((LOG_DEBUG, "%s: source can't be "
 				    "determined: dst=%s, error=%d\n", __func__,
 				    ip6_sprintf(ip6buf, &dst6),
 				    error));
 				goto bad;
 			}
 			ip6->ip6_src = src6;
 		} else
 			ip6->ip6_src = *saddr6;
 
 		if (ifp->if_carp != NULL) {
 			/*
 			 * Check that selected source address belongs to
 			 * CARP addresses.
 			 */
 			if (ifa == NULL)
 				ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp,
 				    &ip6->ip6_src);
 			/*
 			 * Do not send NS for CARP address if we are not
 			 * the CARP master.
 			 */
 			if (ifa != NULL && ifa->ifa_carp != NULL &&
 			    !(*carp_master_p)(ifa)) {
 				log(LOG_DEBUG,
 				    "nd6_ns_output: NS from BACKUP CARP address %s\n",
 				    ip6_sprintf(ip6buf, &ip6->ip6_src));
 				ifa_free(ifa);
 				goto bad;
 			}
 		}
 		/* Drop the reference taken by in6ifa_ifpwithaddr(), if any. */
 		if (ifa != NULL)
 			ifa_free(ifa);
 	} else {
 		/*
 		 * Source address for DAD packet must always be IPv6
 		 * unspecified address. (0::0)
 		 * We actually don't have to 0-clear the address (we did it
 		 * above), but we do so here explicitly to make the intention
 		 * clearer.
 		 */
 		bzero(&ip6->ip6_src, sizeof(ip6->ip6_src));
 	}
 	nd_ns = (struct nd_neighbor_solicit *)(ip6 + 1);
 	nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
 	nd_ns->nd_ns_code = 0;
 	nd_ns->nd_ns_reserved = 0;
 	nd_ns->nd_ns_target = *taddr6;
 	in6_clearscope(&nd_ns->nd_ns_target); /* XXX */
 
 	/*
 	 * Add source link-layer address option.
 	 *
 	 *				spec		implementation
 	 *				---		---
 	 * DAD packet			MUST NOT	do not add the option
 	 * there's no link layer address:
 	 *				impossible	do not add the option
 	 * there's link layer address:
 	 *	Multicast NS		MUST add one	add the option
 	 *	Unicast NS		SHOULD add one	add the option
 	 */
 	if (nonce == NULL) {
 		struct nd_opt_hdr *nd_opt;
 		char *mac;
 		int optlen;
 
 		/* Prefer a CARP-supplied address, else the interface's own. */
 		mac = NULL;
 		if (ifp->if_carp)
 			mac = (*carp_macmatch6_p)(ifp, m, &ip6->ip6_src);
 		if (mac == NULL)
 			mac = nd6_ifptomac(ifp);
 
 		if (mac != NULL) {
 			nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
 			optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
 			/* 8 byte alignments... */
 			optlen = (optlen + 7) & ~7;
 			m->m_pkthdr.len += optlen;
 			m->m_len += optlen;
 			icmp6len += optlen;
 			bzero(nd_opt, optlen);
 			nd_opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR;
 			/* The option length field counts 8-byte units. */
 			nd_opt->nd_opt_len = optlen >> 3;
 			bcopy(mac, nd_opt + 1, ifp->if_addrlen);
 		}
 	}
 	/*
 	 * Add a Nonce option (RFC 3971) to detect looped back NS messages.
 	 * This behavior is documented as Enhanced Duplicate Address
 	 * Detection in RFC 7527.
 	 * net.inet6.ip6.dad_enhanced=0 disables this.
 	 */
 	if (V_dad_enhanced != 0 && nonce != NULL) {
 		int optlen = sizeof(struct nd_opt_hdr) + ND_OPT_NONCE_LEN;
 		struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_ns + 1);
 		/* 8-byte alignment is required. */
 		optlen = (optlen + 7) & ~7;
 
 		m->m_pkthdr.len += optlen;
 		m->m_len += optlen;
 		icmp6len += optlen;
 		bzero((caddr_t)nd_opt, optlen);
 		nd_opt->nd_opt_type = ND_OPT_NONCE;
 		nd_opt->nd_opt_len = optlen >> 3;
 		bcopy(nonce, (caddr_t)(nd_opt + 1), ND_OPT_NONCE_LEN);
 	}
 	/* Lengths are final now: fill in the payload length and checksum. */
 	ip6->ip6_plen = htons((u_short)icmp6len);
 	nd_ns->nd_ns_cksum = 0;
 	nd_ns->nd_ns_cksum =
 	    in6_cksum(m, IPPROTO_ICMPV6, sizeof(*ip6), icmp6len);
 
 	/* When the SeND hook is present, tag the packet with its ND type. */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 			sizeof(unsigned short), M_NOWAIT);
 		if (mtag == NULL)
 			goto bad;
 		*(unsigned short *)(mtag + 1) = nd_ns->nd_ns_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	/* DAD probes carry the unspecified source, which must be allowed. */
 	ip6_output(m, NULL, NULL, (nonce != NULL) ? IPV6_UNSPECSRC : 0,
 	    &im6o, NULL, NULL);
 	icmp6_ifstat_inc(ifp, ifs6_out_msg);
 	icmp6_ifstat_inc(ifp, ifs6_out_neighborsolicit);
 	ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_SOLICIT]);
 
 	return;
 
 	/* Failure before the packet was handed to ip6_output(): drop it. */
   bad:
 	m_freem(m);
 }
 
 #ifndef BURN_BRIDGES
 /*
  * Compatibility entry point: send a Neighbor Solicitation through the
  * default FIB.  All other arguments are handed to nd6_ns_output_fib()
  * unchanged.
  */
 void
 nd6_ns_output(struct ifnet *ifp, const struct in6_addr *saddr6,
     const struct in6_addr *daddr6, const struct in6_addr *taddr6,
     uint8_t *nonce)
 {
 	const u_int fib = RT_DEFAULT_FIB;
 
 	nd6_ns_output_fib(ifp, saddr6, daddr6, taddr6, nonce, fib);
 }
 #endif
 /*
  * Neighbor advertisement input handling.
  *
  * Based on RFC 2461
  * Based on RFC 2462 (duplicate address detection)
  *
  * the following items are not implemented yet:
  * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
  * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
  *
  *        m - received packet; the receiving interface is taken from
  *            m->m_pkthdr.rcvif.
  *      off - byte offset from the start of the IPv6 header to the NA
  *            message.
  * icmp6len - length of the ICMPv6 message that starts at off, i.e. the
  *            fixed NA header plus any ND options.
  *
  * Every exit path that still holds the mbuf releases it with m_freem().
  */
 void
 nd6_na_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct ifaddr *ifa;
 	struct llentry *ln;
 	struct mbuf *chain;
 	struct nd_neighbor_advert *nd_na;
 	struct in6_addr daddr6, taddr6;
 	union nd_opts ndopts;
 	u_char linkhdr[LLE_MAX_LINKHDR];
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	char *lladdr;
 	size_t linkhdrsize;
 	int flags, is_override, is_router, is_solicited;
 	int lladdr_off, lladdrlen, checklink;
 	bool flush_holdchain = false;
 
 	NET_EPOCH_ASSERT();
 
 	/* The cleanup code tests these, so give them defined values first. */
 	chain = NULL;
 	ln = NULL;
 	checklink = 0;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	ifp = m->m_pkthdr.rcvif;
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (__predict_false(ip6->ip6_hlim != 255)) {
 		ICMP6STAT_INC(icp6s_invlhlim);
 		nd6log((LOG_ERR,
 		    "nd6_na_input: invalid hlim (%d) from %s to %s on %s\n",
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	/* Make the whole NA message contiguous before looking into it. */
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	/* Re-read the header pointer; m may have changed above. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_na = (struct nd_neighbor_advert *)((caddr_t)ip6 + off);
 
 	flags = nd_na->nd_na_flags_reserved;
 	is_router = ((flags & ND_NA_FLAG_ROUTER) != 0);
 	is_solicited = ((flags & ND_NA_FLAG_SOLICITED) != 0);
 	is_override = ((flags & ND_NA_FLAG_OVERRIDE) != 0);
 
 	taddr6 = nd_na->nd_na_target;
 	if (in6_setscope(&taddr6, ifp, NULL))
 		goto bad;	/* XXX: impossible */
 
 	if (IN6_IS_ADDR_MULTICAST(&taddr6)) {
 		nd6log((LOG_ERR,
 		    "nd6_na_input: invalid target address %s\n",
 		    ip6_sprintf(ip6bufs, &taddr6)));
 		goto bad;
 	}
 
 	/* A solicited advertisement must not be sent to a multicast address. */
 	daddr6 = ip6->ip6_dst;
 	if (IN6_IS_ADDR_MULTICAST(&daddr6))
 		if (is_solicited) {
 			nd6log((LOG_ERR,
 			    "nd6_na_input: a solicited adv is multicasted\n"));
 			goto bad;
 		}
 
 	/* What follows the fixed NA header is the ND option area. */
 	icmp6len -= sizeof(*nd_na);
 	nd6_option_init(nd_na + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "nd6_na_input: invalid ND option, ignored\n"));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	lladdr = NULL;
 	lladdrlen = 0;
 	if (ndopts.nd_opts_tgt_lladdr) {
 		/* The address follows the option header; length is in 8-byte units. */
 		lladdr = (char *)(ndopts.nd_opts_tgt_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_tgt_lladdr->nd_opt_len << 3;
 	}
 
 	ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp, &taddr6);
 	if (ifa != NULL && ifa->ifa_carp != NULL) {
 		/*
 		 * Silently ignore NAs for CARP addresses if we are not
 		 * the CARP master.
 		 */
 		if (!(*carp_master_p)(ifa)) {
 			log(LOG_DEBUG,
 			    "nd6_na_input: NA for BACKUP CARP address %s\n",
 			    ip6_sprintf(ip6bufs, &taddr6));
 			ifa_free(ifa);
 			goto freeit;
 		}
 	}
 	/*
 	 * Target address matches one of my interface address.
 	 *
 	 * If my address is tentative, this means that there's somebody
 	 * already using the same address as mine.  This indicates DAD failure.
 	 * This is defined in RFC 2462.
 	 *
 	 * Otherwise, process as defined in RFC 2461.
 	 */
 	if (ifa
 	 && (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_TENTATIVE)) {
 		nd6_dad_na_input(ifa);
 		ifa_free(ifa);
 		goto freeit;
 	}
 
 	/* Just for safety, maybe unnecessary. */
 	if (ifa) {
 		ifa_free(ifa);
 		log(LOG_ERR,
 		    "nd6_na_input: duplicate IP6 address %s\n",
 		    ip6_sprintf(ip6bufs, &taddr6));
 		goto freeit;
 	}
 
 	/*
 	 * The option length must equal the 2-byte option header plus the
 	 * interface's address length, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO, "nd6_na_input: lladdrlen mismatch for %s "
 		    "(if %d, NA packet %d)\n", ip6_sprintf(ip6bufs, &taddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	/*
 	 * If no neighbor cache entry is found, NA SHOULD silently be
 	 * discarded.
 	 */
 	/* The entry is looked up exclusively; the exit paths write-unlock it. */
 	ln = nd6_lookup(&taddr6, LLE_SF(AF_INET6, LLE_EXCLUSIVE), ifp);
 	if (ln == NULL) {
 		goto freeit;
 	}
 
 	/*
 	 * Do not try to override static entry.
 	 */
 	if (ln->la_flags & LLE_STATIC)
 		goto freeit;
 
 	if (ln->ln_state == ND6_LLINFO_INCOMPLETE) {
 		/*
 		 * If the link-layer has address, and no lladdr option came,
 		 * discard the packet.
 		 */
 		if (ifp->if_addrlen && lladdr == NULL) {
 			goto freeit;
 		}
 
 		/*
 		 * Record link-layer address, and update the state.
 		 */
 		if (!nd6_try_set_entry_addr(ifp, ln, lladdr))
 			goto freeit;
 
 		/* Remember to flush the children's held packets on exit. */
 		flush_holdchain = true;
 		if (is_solicited)
 			nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE);
 		else
 			nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 		EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED);
 		if ((ln->ln_router = is_router) != 0) {
 			/*
 			 * This means a router's state has changed from
 			 * non-reachable to probably reachable, and might
 			 * affect the status of associated prefixes..
 			 */
 			checklink = 1;
 		}
 	} else {
 		int llchange;
 
 		/*
 		 * Check if the link-layer address has changed or not.
 		 */
 		if (lladdr == NULL)
 			llchange = 0;
 		else {
 			if (ln->la_flags & LLE_VALID) {
 				if (bcmp(lladdr, ln->ll_addr, ifp->if_addrlen))
 					llchange = 1;
 				else
 					llchange = 0;
 			} else
 				llchange = 1;
 		}
 
 		/*
 		 * This is VERY complex.  Look at it with care.
 		 *
 		 * override solicit lladdr llchange	action
 		 *					(L: record lladdr)
 		 *
 		 *	0	0	n	--	(2c)
 		 *	0	0	y	n	(2b) L
 		 *	0	0	y	y	(1)    REACHABLE->STALE
 		 *	0	1	n	--	(2c)   *->REACHABLE
 		 *	0	1	y	n	(2b) L *->REACHABLE
 		 *	0	1	y	y	(1)    REACHABLE->STALE
 		 *	1	0	n	--	(2a)
 		 *	1	0	y	n	(2a) L
 		 *	1	0	y	y	(2a) L *->STALE
 		 *	1	1	n	--	(2a)   *->REACHABLE
 		 *	1	1	y	n	(2a) L *->REACHABLE
 		 *	1	1	y	y	(2a) L *->REACHABLE
 		 */
 		if (!is_override && (lladdr != NULL && llchange)) {  /* (1) */
 			/*
 			 * If state is REACHABLE, make it STALE.
 			 * no other updates should be done.
 			 */
 			if (ln->ln_state == ND6_LLINFO_REACHABLE)
 				nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 			goto freeit;
 		} else if (is_override				   /* (2a) */
 			|| (!is_override && (lladdr != NULL && !llchange)) /* (2b) */
 			|| lladdr == NULL) {			   /* (2c) */
 			/*
 			 * Update link-layer address, if any.
 			 */
 			if (lladdr != NULL) {
 				/* Build the link header, then install it on the entry. */
 				linkhdrsize = sizeof(linkhdr);
 				if (lltable_calc_llheader(ifp, AF_INET6, lladdr,
 				    linkhdr, &linkhdrsize, &lladdr_off) != 0)
 					goto freeit;
 				if (lltable_try_set_entry_addr(ifp, ln, linkhdr,
 				    linkhdrsize, lladdr_off) == 0)
 					goto freeit;
 				EVENTHANDLER_INVOKE(lle_event, ln,
 				    LLENTRY_RESOLVED);
 			}
 
 			/*
 			 * If solicited, make the state REACHABLE.
 			 * If not solicited and the link-layer address was
 			 * changed, make it STALE.
 			 */
 			if (is_solicited)
 				nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE);
 			else {
 				if (lladdr != NULL && llchange)
 					nd6_llinfo_setstate(ln, ND6_LLINFO_STALE);
 			}
 		}
 
 		if (ln->ln_router && !is_router) {
 			/*
 			 * The peer dropped the router flag.
 			 * Remove the sender from the Default Router List and
 			 * update the Destination Cache entries.
 			 */
 			struct ifnet *nd6_ifp;
 
 			nd6_ifp = lltable_get_ifp(ln->lle_tbl);
 			if (!defrouter_remove(&ln->r_l3addr.addr6, nd6_ifp) &&
 			    (ND_IFINFO(nd6_ifp)->flags &
 			     ND6_IFF_ACCEPT_RTADV) != 0)
 				/*
 				 * Even if the neighbor is not in the default
 				 * router list, the neighbor may be used as a
 				 * next hop for some destinations (e.g. redirect
 				 * case). So we must call rt6_flush explicitly.
 				 */
 				rt6_flush(&ip6->ip6_src, ifp);
 		}
 		ln->ln_router = is_router;
 	}
         /* XXX - QL
 	 *  Does this matter?
 	 *  rt->rt_flags &= ~RTF_REJECT;
 	 */
 	/* Reset the probe counter and take over any packets held on the entry. */
 	ln->la_asked = 0;
 	if (ln->la_hold != NULL)
 		chain = nd6_grab_holdchain(ln);
 	/* Common exit: the entry lock is dropped before held packets are flushed. */
  freeit:
 	if (ln != NULL)
 		LLE_WUNLOCK(ln);
 
 	if (chain != NULL)
 		nd6_flush_holdchain(ifp, ln, chain);
 	if (flush_holdchain)
 		nd6_flush_children_holdchain(ifp, ln);
 
 	if (checklink)
 		pfxlist_onlink_check();
 
 	m_freem(m);
 	return;
 
 	/* Malformed NA: unlock if needed, count it as a bad NA and drop it. */
  bad:
 	if (ln != NULL)
 		LLE_WUNLOCK(ln);
 
 	ICMP6STAT_INC(icp6s_badna);
 	m_freem(m);
 }
 
 /*
  * Neighbor advertisement output handling.
  *
  * Based on RFC 2461
  *
  * the following items are not implemented yet:
  * - proxy advertisement delay rule (RFC2461 7.2.8, last paragraph, SHOULD)
  * - anycast advertisement delay rule (RFC2461 7.2.7, SHOULD)
  *
  * ifp - outgoing interface
  * daddr6_0 - destination address; the unspecified address means "reply to
  *            DAD" and is replaced by the link-local all-nodes multicast
  *            address, with the solicited flag cleared
  * taddr6 - target address placed in the NA
  * flags - ND_NA_FLAG_* bits; the override flag is cleared when no target
  *         link-layer address option is attached
  * tlladdr - 1 if include target link-layer address
  * sdl0 - sockaddr_dl (= proxy NA) or NULL
  * fibnum - FIB number set on the mbuf and used for source selection
  *
  * The function returns nothing; on any failure the packet is dropped.
  */
 static void
 nd6_na_output_fib(struct ifnet *ifp, const struct in6_addr *daddr6_0,
     const struct in6_addr *taddr6, u_long flags, int tlladdr,
     struct sockaddr *sdl0, u_int fibnum)
 {
 	struct mbuf *m;
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;
 	struct nd_neighbor_advert *nd_na;
 	struct ip6_moptions im6o;
 	struct in6_addr daddr6, dst6, src6;
 	uint32_t scopeid;
 	int icmp6len, maxlen, error;
 	caddr_t mac = NULL;
 
 	NET_EPOCH_ASSERT();
 
 	daddr6 = *daddr6_0;	/* make a local copy for modification */
 
 	/* estimate the size of message */
 	maxlen = sizeof(*ip6) + sizeof(*nd_na);
 	maxlen += (sizeof(struct nd_opt_hdr) + ifp->if_addrlen + 7) & ~7;
 	KASSERT(max_linkhdr + maxlen <= MCLBYTES, (
 	    "%s: max_linkhdr + maxlen > MCLBYTES (%d + %d > %d)",
 	    __func__, max_linkhdr, maxlen, MCLBYTES));
 
 	/* Use a cluster only when the packet does not fit a header mbuf. */
 	if (max_linkhdr + maxlen > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return;
 	M_SETFIB(m, fibnum);
 
 	icmp6len = sizeof(*nd_na);
 	m->m_pkthdr.len = m->m_len = sizeof(struct ip6_hdr) + icmp6len;
 	m->m_data += max_linkhdr;	/* or M_ALIGN() equivalent? */
 
 	/* fill neighbor advertisement packet */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow = 0;
 	ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc |= IPV6_VERSION;
 	ip6->ip6_nxt = IPPROTO_ICMPV6;
 	ip6->ip6_hlim = 255;
 	if (IN6_IS_ADDR_UNSPECIFIED(&daddr6)) {
 		/* reply to DAD */
 		daddr6.s6_addr16[0] = IPV6_ADDR_INT16_MLL;
 		daddr6.s6_addr16[1] = 0;
 		daddr6.s6_addr32[1] = 0;
 		daddr6.s6_addr32[2] = 0;
 		daddr6.s6_addr32[3] = IPV6_ADDR_INT32_ONE;
 		if (in6_setscope(&daddr6, ifp, NULL))
 			goto bad;
 
 		flags &= ~ND_NA_FLAG_SOLICITED;
 	}
 	ip6->ip6_dst = daddr6;
 
 	/*
 	 * Set up the multicast options only now that the destination is
 	 * final.  An unspecified destination has just been rewritten to the
 	 * all-nodes multicast address; testing the caller's value instead
 	 * would hand ip6_output() an uninitialized im6o for a multicast
 	 * destination.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&daddr6)) {
 		m->m_flags |= M_MCAST;
 		im6o.im6o_multicast_ifp = ifp;
 		im6o.im6o_multicast_hlim = 255;
 		im6o.im6o_multicast_loop = 0;
 	}
 
 	/*
 	 * Select a source whose scope is the same as that of the dest.
 	 */
 	in6_splitscope(&daddr6, &dst6, &scopeid);
 	error = in6_selectsrc_addr(fibnum, &dst6,
 	    scopeid, ifp, &src6, NULL);
 	if (error) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		nd6log((LOG_DEBUG, "nd6_na_output: source can't be "
 		    "determined: dst=%s, error=%d\n",
 		    ip6_sprintf(ip6buf, &daddr6), error));
 		goto bad;
 	}
 	ip6->ip6_src = src6;
 	nd_na = (struct nd_neighbor_advert *)(ip6 + 1);
 	nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
 	nd_na->nd_na_code = 0;
 	nd_na->nd_na_target = *taddr6;
 	in6_clearscope(&nd_na->nd_na_target); /* XXX */
 
 	/*
 	 * "tlladdr" indicates NS's condition for adding tlladdr or not.
 	 * see nd6_ns_input() for details.
 	 * Basically, if NS packet is sent to unicast/anycast addr,
 	 * target lladdr option SHOULD NOT be included.
 	 */
 	if (tlladdr) {
 		/*
 		 * sdl0 != NULL indicates proxy NA.  If we do proxy, use
 		 * lladdr in sdl0.  If we are not proxying (sending NA for
 		 * my address) use lladdr configured for the interface.
 		 */
 		if (sdl0 == NULL) {
 			if (ifp->if_carp)
 				mac = (*carp_macmatch6_p)(ifp, m, taddr6);
 			if (mac == NULL)
 				mac = nd6_ifptomac(ifp);
 		} else if (sdl0->sa_family == AF_LINK) {
 			struct sockaddr_dl *sdl;
 			sdl = (struct sockaddr_dl *)sdl0;
 			/* Only accept an address of the interface's length. */
 			if (sdl->sdl_alen == ifp->if_addrlen)
 				mac = LLADDR(sdl);
 		}
 	}
 	if (tlladdr && mac) {
 		int optlen = sizeof(struct nd_opt_hdr) + ifp->if_addrlen;
 		struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)(nd_na + 1);
 
 		/* roundup to 8 bytes alignment! */
 		optlen = (optlen + 7) & ~7;
 
 		m->m_pkthdr.len += optlen;
 		m->m_len += optlen;
 		icmp6len += optlen;
 		bzero((caddr_t)nd_opt, optlen);
 		nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR;
 		/* The option length field counts 8-byte units. */
 		nd_opt->nd_opt_len = optlen >> 3;
 		bcopy(mac, (caddr_t)(nd_opt + 1), ifp->if_addrlen);
 	} else
 		flags &= ~ND_NA_FLAG_OVERRIDE;
 
 	/* Lengths and flags are final now: finish the header and checksum. */
 	ip6->ip6_plen = htons((u_short)icmp6len);
 	nd_na->nd_na_flags_reserved = flags;
 	nd_na->nd_na_cksum = 0;
 	nd_na->nd_na_cksum =
 	    in6_cksum(m, IPPROTO_ICMPV6, sizeof(struct ip6_hdr), icmp6len);
 
 	/* When the SeND hook is present, tag the packet with its ND type. */
 	if (send_sendso_input_hook != NULL) {
 		mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 		    sizeof(unsigned short), M_NOWAIT);
 		if (mtag == NULL)
 			goto bad;
 		*(unsigned short *)(mtag + 1) = nd_na->nd_na_type;
 		m_tag_prepend(m, mtag);
 	}
 
 	ip6_output(m, NULL, NULL, 0, &im6o, NULL, NULL);
 	icmp6_ifstat_inc(ifp, ifs6_out_msg);
 	icmp6_ifstat_inc(ifp, ifs6_out_neighboradvert);
 	ICMP6STAT_INC(icp6s_outhist[ND_NEIGHBOR_ADVERT]);
 
 	return;
 
 	/* Failure before the packet was handed to ip6_output(): drop it. */
   bad:
 	m_freem(m);
 }
 
 #ifndef BURN_BRIDGES
 /*
  * Legacy entry point: send a neighbor advertisement via the default FIB.
  * Every argument is handed to nd6_na_output_fib() unchanged.
  */
 void
 nd6_na_output(struct ifnet *ifp, const struct in6_addr *daddr6_0,
     const struct in6_addr *taddr6, u_long flags, int tlladdr,
     struct sockaddr *sdl0)
 {
 	const u_int fibnum = RT_DEFAULT_FIB;
 
 	nd6_na_output_fib(ifp, daddr6_0, taddr6, flags, tlladdr, sdl0,
 	    fibnum);
 }
 #endif
 
 /*
  * Return IF_LLADDR(ifp) when ifp is an Ethernet, IEEE 1394, L2 VLAN,
  * InfiniBand or bridge interface; return NULL for any other if_type.
  */
 caddr_t
 nd6_ifptomac(struct ifnet *ifp)
 {
 	const int iftype = ifp->if_type;
 
 	if (iftype == IFT_ETHER || iftype == IFT_IEEE1394 ||
 	    iftype == IFT_L2VLAN || iftype == IFT_INFINIBAND ||
 	    iftype == IFT_BRIDGE)
 		return IF_LLADDR(ifp);
 
 	return NULL;
 }
 
 /*
  * State of one Duplicate Address Detection run for a single interface
  * address.  Entries are linked on the per-VNET DAD queue (V_dadq) and are
  * reference counted through dad_refcnt.
  */
 struct dadq {
 	TAILQ_ENTRY(dadq) dad_list;	/* linkage on V_dadq */
 	struct ifaddr *dad_ifa;	/* address under test; ifa_ref()'d */
 	int dad_count;		/* max NS to send */
 	int dad_ns_tcount;	/* # of trials to send NS */
 	int dad_ns_ocount;	/* NS sent so far */
 	int dad_ns_icount;	/* NS counted by nd6_dad_ns_input() */
 	int dad_na_icount;	/* NA counted by nd6_dad_na_input() */
 	int dad_ns_lcount;	/* looped back NS */
 	int dad_loopbackprobe;	/* probing state for loopback detection */
 	struct callout dad_timer_ch;	/* callout running nd6_dad_timer() */
 	struct vnet *dad_vnet;	/* curvnet at creation (set under VIMAGE) */
 	u_int dad_refcnt;	/* entry is freed when this drops to zero */
 /* Number of 32-bit words needed to hold ND_OPT_NONCE_LEN bytes. */
 #define	ND_OPT_NONCE_LEN32 \
 		((ND_OPT_NONCE_LEN + sizeof(uint32_t) - 1)/sizeof(uint32_t))
 	uint32_t dad_nonce[ND_OPT_NONCE_LEN32];	/* nonce of the latest NS */
 	bool dad_ondadq;	/* on dadq? Protected by DADQ_WLOCK. */
 };
 
 /* Per-VNET queue of in-progress DAD entries and the rwlock guarding it. */
 VNET_DEFINE_STATIC(TAILQ_HEAD(, dadq), dadq);
 VNET_DEFINE_STATIC(struct rwlock, dad_rwlock);
 #define	V_dadq			VNET(dadq)
 #define	V_dad_rwlock		VNET(dad_rwlock)
 
 /* Wrappers for the rwlock operations on the DAD queue lock. */
 #define	DADQ_LOCKPTR()		(&V_dad_rwlock)
 #define	DADQ_LOCK_INIT()	rw_init(DADQ_LOCKPTR(), "nd6 DAD queue")
 #define	DADQ_RLOCK()		rw_rlock(DADQ_LOCKPTR())
 #define	DADQ_RUNLOCK()		rw_runlock(DADQ_LOCKPTR())
 #define	DADQ_WLOCK()		rw_wlock(DADQ_LOCKPTR())
 #define	DADQ_WUNLOCK()		rw_wunlock(DADQ_LOCKPTR())
 
 /*
  * Lock-state assertions (held in any mode, read-held, write-held).
  * NOTE(review): each expansion already ends in ';', so the usual
  * "DADQ_*_ASSERT();" call produces an extra empty statement.  That is
  * harmless at the call sites visible here, but would break an unbraced
  * if/else around the macro.
  */
 #define	DADQ_LOCK_ASSERT()	rw_assert(DADQ_LOCKPTR(), RA_LOCKED);
 #define	DADQ_RLOCK_ASSERT()	rw_assert(DADQ_LOCKPTR(), RA_RLOCKED);
 #define	DADQ_WLOCK_ASSERT()	rw_assert(DADQ_LOCKPTR(), RA_WLOCKED);
 
 /*
  * Append dp to the tail of the DAD queue and flag it as queued.
  * Must be called with the DAD queue write lock held.
  */
 static void
 nd6_dad_add(struct dadq *dp)
 {
 	DADQ_WLOCK_ASSERT();
 
 	dp->dad_ondadq = true;
 	TAILQ_INSERT_TAIL(&V_dadq, dp, dad_list);
 }
 
 /*
  * Take dp off the DAD queue if it is still queued, dropping the reference
  * the queue held on it.  Must be called with the DAD queue write lock held.
  */
 static void
 nd6_dad_del(struct dadq *dp)
 {
 	DADQ_WLOCK_ASSERT();
 
 	/* Already unlinked: nothing to undo. */
 	if (!dp->dad_ondadq)
 		return;
 
 	/* Unlink the entry, then give back the queue's reference. */
 	dp->dad_ondadq = false;
 	TAILQ_REMOVE(&V_dadq, dp, dad_list);
 	nd6_dad_rele(dp);
 }
 
 static struct dadq *
 nd6_dad_find(struct ifaddr *ifa, struct nd_opt_nonce *n)
 {
 	struct dadq *dp;
 
 	DADQ_LOCK_ASSERT();
 
 	TAILQ_FOREACH(dp, &V_dadq, dad_list) {
 		if (dp->dad_ifa != ifa)
 			continue;
 
 		/*
 		 * Skip if the nonce matches the received one.
 		 * +2 in the length is required because of type and
 		 * length fields are included in a header.
 		 */
 		if (n != NULL &&
 		    n->nd_opt_nonce_len == (ND_OPT_NONCE_LEN + 2) / 8 &&
 		    memcmp(&n->nd_opt_nonce[0], &dp->dad_nonce[0],
 		    ND_OPT_NONCE_LEN) == 0) {
 			dp->dad_ns_lcount++;
 			continue;
 		}
 		break;
 	}
 
 	return (dp);
 }
 
 /*
  * (Re)arm dp's DAD callout so that nd6_dad_timer() runs with dp as its
  * argument after the given number of ticks.  Requires the DAD queue write
  * lock.
  */
 static void
 nd6_dad_starttimer(struct dadq *dp, int ticks)
 {
 	struct callout *timer = &dp->dad_timer_ch;
 
 	DADQ_WLOCK_ASSERT();
 
 	callout_reset(timer, ticks, nd6_dad_timer, dp);
 }
 
 /*
  * Drain dp's DAD callout with callout_drain().
  */
 static void
 nd6_dad_stoptimer(struct dadq *dp)
 {
 	struct callout *timer = &dp->dad_timer_ch;
 
 	(void)callout_drain(timer);
 }
 
 /*
  * Drop one reference on dp.  When the last reference goes away the entry
  * must already be off the DAD queue; its address reference is released and
  * the entry is freed.
  */
 static void
 nd6_dad_rele(struct dadq *dp)
 {
 	/* Other holders remain: nothing more to do. */
 	if (!refcount_release(&dp->dad_refcnt))
 		return;
 
 	KASSERT(!dp->dad_ondadq, ("dp %p still on DAD queue", dp));
 	ifa_free(dp->dad_ifa);
 	free(dp, M_IP6NDP);
 }
 
 /*
  * Set up the DAD state of the current vnet: an empty DAD queue and the
  * rwlock that protects it.
  */
 void
 nd6_dad_init(void)
 {
 	TAILQ_INIT(&V_dadq);
 	DADQ_LOCK_INIT();
 }
 
 /*
  * Start Duplicate Address Detection (DAD) for the specified interface
  * address.
  *
  * ifa must be a tentative IPv6 address (asserted).  delay is the number of
  * ticks before the DAD timer fires for the first time.
  *
  * Nothing is queued when DAD is unnecessary (the address is then marked
  * non-tentative), when the interface is not up and running or is
  * IFDISABLED, when DAD is already running for ifa, or when the entry
  * cannot be allocated.  The DAD queue lock is released on every return
  * path.
  */
 void
 nd6_dad_start(struct ifaddr *ifa, int delay)
 {
 	struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
 	struct dadq *dp;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	KASSERT((ia->ia6_flags & IN6_IFF_TENTATIVE) != 0,
 	    ("starting DAD on non-tentative address %p", ifa));
 
 	/*
 	 * If we don't need DAD, don't do it.
 	 * There are several cases:
 	 * - DAD is disabled globally or on the interface
 	 * - the interface address is anycast
 	 */
 	if ((ia->ia6_flags & IN6_IFF_ANYCAST) != 0 ||
 	    V_ip6_dad_count == 0 ||
 	    (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_NO_DAD) != 0) {
 		ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
 		return;
 	}
 	/* The address stays tentative while the interface cannot run DAD. */
 	if ((ifa->ifa_ifp->if_flags & IFF_UP) == 0 ||
 	    (ifa->ifa_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
 	    (ND_IFINFO(ifa->ifa_ifp)->flags & ND6_IFF_IFDISABLED) != 0)
 		return;
 
 	DADQ_WLOCK();
 	if ((dp = nd6_dad_find(ifa, NULL)) != NULL) {
 		/*
 		 * DAD is already in progress.  Let the existing entry
 		 * finish it.
 		 */
 		DADQ_WUNLOCK();
 		return;
 	}
 
 	dp = malloc(sizeof(*dp), M_IP6NDP, M_NOWAIT | M_ZERO);
 	if (dp == NULL) {
 		/* Do not leak the DAD queue lock on the failure path. */
 		DADQ_WUNLOCK();
 		log(LOG_ERR, "nd6_dad_start: memory allocation failed for "
 			"%s(%s)\n",
 			ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 			ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		return;
 	}
 	/*
 	 * The callout is tied to the DAD queue lock and flagged
 	 * CALLOUT_RETURNUNLOCKED, so nd6_dad_timer() is responsible for
 	 * releasing the lock.
 	 */
 	callout_init_rw(&dp->dad_timer_ch, DADQ_LOCKPTR(),
 	    CALLOUT_RETURNUNLOCKED);
 #ifdef VIMAGE
 	dp->dad_vnet = curvnet;
 #endif
 	nd6log((LOG_DEBUG, "%s: starting DAD for %s\n", if_name(ifa->ifa_ifp),
 	    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 
 	/*
 	 * Send NS packet for DAD, ip6_dad_count times.
 	 * Note that we must delay the first transmission, if this is the
 	 * first packet to be sent from the interface after interface
 	 * (re)initialization.
 	 */
 	dp->dad_ifa = ifa;
 	ifa_ref(dp->dad_ifa);
 	dp->dad_count = V_ip6_dad_count;
 	dp->dad_ns_icount = dp->dad_na_icount = 0;
 	dp->dad_ns_ocount = dp->dad_ns_tcount = 0;
 	dp->dad_ns_lcount = dp->dad_loopbackprobe = 0;
 
 	/* Add this to the dadq and add a reference for the dadq. */
 	refcount_init(&dp->dad_refcnt, 1);
 	nd6_dad_add(dp);
 	nd6_dad_starttimer(dp, delay);
 	DADQ_WUNLOCK();
 }
 
 /*
  * Unconditionally terminate DAD for ifa; used when an address is removed.
  * Does nothing if no DAD entry exists for the address.
  */
 void
 nd6_dad_stop(struct ifaddr *ifa)
 {
 	struct dadq *entry;
 
 	DADQ_WLOCK();
 	entry = nd6_dad_find(ifa, NULL);
 	if (entry != NULL) {
 		/*
 		 * Hold our own reference across the unlink so the entry
 		 * stays valid while its callout is drained below.
 		 */
 		(void)refcount_acquire(&entry->dad_refcnt);
 		nd6_dad_del(entry);
 	}
 	DADQ_WUNLOCK();
 
 	/* DAD wasn't started yet */
 	if (entry == NULL)
 		return;
 
 	nd6_dad_stoptimer(entry);
 	nd6_dad_rele(entry);
 }
 
 /*
  * Callout handler that drives one DAD run; arg is the struct dadq.
  *
  * Locking: nd6_dad_start() initialises the callout with the DAD queue lock
  * and CALLOUT_RETURNUNLOCKED, so this handler is entered with that lock
  * held and has to release it on every path.  The "err" path unlocks
  * explicitly; the "done" path is only reached after nd6_dad_ns_output(),
  * which returns with the lock already dropped.
  *
  * Each invocation either re-arms the timer and sends another NS, or ends
  * the run by removing the entry from the DAD queue.
  */
 static void
 nd6_dad_timer(void *arg)
 {
 	struct dadq *dp = arg;
 	struct ifaddr *ifa = dp->dad_ifa;
 	struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
 	struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
 	char ip6buf[INET6_ADDRSTRLEN];
 	struct epoch_tracker et;
 
 	CURVNET_SET(dp->dad_vnet);
 	KASSERT(ia != NULL, ("DAD entry %p with no address", dp));
 
 	NET_EPOCH_ENTER(et);
 	/* Sanity checks: any failure below abandons the run via "err". */
 	if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) {
 		/* Do not need DAD for ifdisabled interface. */
 		log(LOG_ERR, "nd6_dad_timer: cancel DAD on %s because of "
 		    "ND6_IFF_IFDISABLED.\n", ifp->if_xname);
 		goto err;
 	}
 	if (ia->ia6_flags & IN6_IFF_DUPLICATED) {
 		log(LOG_ERR, "nd6_dad_timer: called with duplicated address "
 			"%s(%s)\n",
 			ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 			ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		goto err;
 	}
 	if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0) {
 		log(LOG_ERR, "nd6_dad_timer: called with non-tentative address "
 			"%s(%s)\n",
 			ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 			ifa->ifa_ifp ? if_name(ifa->ifa_ifp) : "???");
 		goto err;
 	}
 
 	/* Stop DAD if the interface is down even after dad_maxtry attempts. */
 	if ((dp->dad_ns_tcount > V_dad_maxtry) &&
 	    (((ifp->if_flags & IFF_UP) == 0) ||
 	     ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0))) {
 		nd6log((LOG_INFO, "%s: could not run DAD "
 		    "because the interface was down or not running.\n",
 		    if_name(ifa->ifa_ifp)));
 		goto err;
 	}
 
 	/* Need more checks? */
 	if (dp->dad_ns_ocount < dp->dad_count) {
 		/*
 		 * We have more NS to go.  Send NS packet for DAD.
 		 * The timer is re-armed first, while the lock is still
 		 * held; nd6_dad_ns_output() then drops the lock.
 		 */
 		nd6_dad_starttimer(dp,
 		    (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
 		nd6_dad_ns_output(dp);
 		goto done;
 	} else {
 		/*
 		 * We have transmitted sufficient number of DAD packets.
 		 * See what we've got.
 		 */
 		if (dp->dad_ns_icount > 0 || dp->dad_na_icount > 0) {
 			/*
 			 * We've seen NS or NA, means DAD has failed.
 			 * Control then falls through to "err" below, which
 			 * removes the entry and unlocks.
 			 */
 			nd6_dad_duplicated(ifa, dp);
 		} else if (V_dad_enhanced != 0 &&
 		    dp->dad_ns_lcount > 0 &&
 		    dp->dad_ns_lcount > dp->dad_loopbackprobe) {
 			/*
 			 * Sec. 4.1 in RFC 7527 requires transmission of
 			 * additional probes until the loopback condition
 			 * becomes clear when a looped back probe is detected.
 			 */
 			log(LOG_ERR, "%s: a looped back NS message is "
 			    "detected during DAD for %s.  "
 			    "Another DAD probes are being sent.\n",
 			    if_name(ifa->ifa_ifp),
 			    ip6_sprintf(ip6buf, IFA_IN6(ifa)));
 			/* Remember how many loopbacks triggered this round. */
 			dp->dad_loopbackprobe = dp->dad_ns_lcount;
 			/*
 			 * Send an NS immediately and increase dad_count by
 			 * V_nd6_mmaxtries - 1.
 			 */
 			dp->dad_count =
 			    dp->dad_ns_ocount + V_nd6_mmaxtries - 1;
 			nd6_dad_starttimer(dp,
 			    (long)ND_IFINFO(ifa->ifa_ifp)->retrans * hz / 1000);
 			nd6_dad_ns_output(dp);
 			goto done;
 		} else {
 			/*
 			 * We are done with DAD.  No NA came, no NS came.
 			 * No duplicate address found.  Check IFDISABLED flag
 			 * again in case that it is changed between the
 			 * beginning of this function and here.
 			 */
 			if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) == 0)
 				ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
 
 			nd6log((LOG_DEBUG,
 			    "%s: DAD complete for %s - no duplicates found\n",
 			    if_name(ifa->ifa_ifp),
 			    ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr)));
 			if (dp->dad_ns_lcount > 0)
 				log(LOG_ERR, "%s: DAD completed while "
 				    "a looped back NS message is detected "
 				    "during DAD for %s.\n",
 				    if_name(ifa->ifa_ifp),
 				    ip6_sprintf(ip6buf, IFA_IN6(ifa)));
 		}
 	}
 err:
 	/* End of the run (success or failure): dequeue and unlock. */
 	nd6_dad_del(dp);
 	DADQ_WUNLOCK();
 done:
 	/* Reached with the DAD queue lock already released. */
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 static void
 nd6_dad_duplicated(struct ifaddr *ifa, struct dadq *dp)
 {
 	struct in6_ifaddr *ia = (struct in6_ifaddr *)ifa;
 	struct ifnet *ifp;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	log(LOG_ERR, "%s: DAD detected duplicate IPv6 address %s: "
 	    "NS in/out/loopback=%d/%d/%d, NA in=%d\n",
 	    if_name(ifa->ifa_ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr),
 	    dp->dad_ns_icount, dp->dad_ns_ocount, dp->dad_ns_lcount,
 	    dp->dad_na_icount);
 
 	ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
 	ia->ia6_flags |= IN6_IFF_DUPLICATED;
 
 	ifp = ifa->ifa_ifp;
 	log(LOG_ERR, "%s: DAD complete for %s - duplicate found\n",
 	    if_name(ifp), ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr));
 	log(LOG_ERR, "%s: manual intervention required\n",
 	    if_name(ifp));
 
 	/*
 	 * If the address is a link-local address formed from an interface
 	 * identifier based on the hardware address which is supposed to be
 	 * uniquely assigned (e.g., EUI-64 for an Ethernet interface), IP
 	 * operation on the interface SHOULD be disabled.
 	 * [RFC 4862, Section 5.4.5]
 	 */
 	if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr)) {
 		struct in6_addr in6;
 
 		/*
 		 * To avoid over-reaction, we only apply this logic when we are
 		 * very sure that hardware addresses are supposed to be unique.
 		 */
 		switch (ifp->if_type) {
 		case IFT_ETHER:
 		case IFT_ATM:
 		case IFT_IEEE1394:
 		case IFT_INFINIBAND:
 			in6 = ia->ia_addr.sin6_addr;
 			if (in6_get_hw_ifid(ifp, &in6) == 0 &&
 			    IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr, &in6)) {
 				ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
 				log(LOG_ERR, "%s: possible hardware address "
 				    "duplication detected, disable IPv6\n",
 				    if_name(ifp));
 			}
 			break;
 		}
 	}
 }
 
 /*
  * Send one neighbour solicitation as a DAD probe for dp's address.
  *
  * Called with the DAD queue write lock held; the lock is released on every
  * path before returning.  The attempt counter is always bumped, but nothing
  * is transmitted while the interface is not both up and running.
  */
 static void
 nd6_dad_ns_output(struct dadq *dp)
 {
 	struct in6_ifaddr *ia = (struct in6_ifaddr *)dp->dad_ifa;
 	struct ifnet *ifp = dp->dad_ifa->ifa_ifp;
 	int word;
 
 	DADQ_WLOCK_ASSERT();
 
 	dp->dad_ns_tcount++;
 	if ((ifp->if_flags & IFF_UP) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 		DADQ_WUNLOCK();
 		return;
 	}
 
 	dp->dad_ns_ocount++;
 	if (V_dad_enhanced != 0) {
 		/*
 		 * XXXHRS: with DupAddrDetectTransmits > 1, several NS
 		 * messages carrying different nonces may be looped back
 		 * out of order.  Only the most recent nonce is remembered
 		 * on the sending side, which should be good enough in
 		 * almost all practical cases.
 		 */
 		for (word = 0; word < ND_OPT_NONCE_LEN32; word++)
 			dp->dad_nonce[word] = arc4random();
 	}
 	DADQ_WUNLOCK();
 	nd6_ns_output(ifp, NULL, NULL, &ia->ia_addr.sin6_addr,
 	    (uint8_t *)&dp->dad_nonce[0]);
 }
 
 /*
  * Account for a received NS that targets ifa while DAD may be running.
  * If nd6_dad_find() returns an entry for the address, its received-NS
  * counter is incremented.  The nonce option is only passed to the lookup
  * when Enhanced DAD is enabled.
  */
 static void
 nd6_dad_ns_input(struct ifaddr *ifa, struct nd_opt_nonce *ndopt_nonce)
 {
 	struct nd_opt_nonce *nonce;
 	struct dadq *match;
 
 	if (ifa == NULL)
 		panic("ifa == NULL in nd6_dad_ns_input");
 
 	/* Ignore Nonce option when Enhanced DAD is disabled. */
 	nonce = (V_dad_enhanced != 0) ? ndopt_nonce : NULL;
 
 	DADQ_RLOCK();
 	match = nd6_dad_find(ifa, nonce);
 	if (match != NULL)
 		match->dad_ns_icount++;
 	DADQ_RUNLOCK();
 }
 
 /*
  * Account for a received NA that targets ifa while DAD may be running.
  * If a DAD entry exists for the address, its received-NA counter is
  * incremented.
  */
 static void
 nd6_dad_na_input(struct ifaddr *ifa)
 {
 	struct dadq *match;
 
 	if (ifa == NULL)
 		panic("ifa == NULL in nd6_dad_na_input");
 
 	DADQ_RLOCK();
 	match = nd6_dad_find(ifa, NULL);
 	if (match != NULL)
 		match->dad_na_icount++;
 	DADQ_RUNLOCK();
 }
diff --git a/sys/netinet6/nd6_rtr.c b/sys/netinet6/nd6_rtr.c
index 793befc47aa5..dbdf12d310c6 100644
--- a/sys/netinet6/nd6_rtr.c
+++ b/sys/netinet6/nd6_rtr.c
@@ -1,2562 +1,2563 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: nd6_rtr.c,v 1.111 2001/04/27 01:37:15 jinmei Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/time.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/errno.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_dl.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/route/route_ctl.h>
 #include <net/radix.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <net/if_llatbl.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/scope6_var.h>
 
 /* Prototypes for static helpers. */
 static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *);
 static int prelist_update(struct nd_prefixctl *, struct nd_defrouter *,
     struct mbuf *, int);
 static int nd6_prefix_onlink(struct nd_prefix *);
 
 /* Per-VNET list of default routers, linked through dr_entry. */
 TAILQ_HEAD(nd6_drhead, nd_defrouter);
 VNET_DEFINE_STATIC(struct nd6_drhead, nd6_defrouter);
 #define	V_nd6_defrouter			VNET(nd6_defrouter)
 
 /* Declared only; nd6_ra_input() copies this value into ndi->recalctm. */
 VNET_DECLARE(int, nd6_recalc_reachtm_interval);
 #define	V_nd6_recalc_reachtm_interval	VNET(nd6_recalc_reachtm_interval)
 
 /*
  * Default interface pointer and index.
  * NOTE(review): meaning inferred from the names; their users are not
  * visible in this part of the file.
  */
 VNET_DEFINE_STATIC(struct ifnet *, nd6_defifp);
 VNET_DEFINE(int, nd6_defifindex);
 #define	V_nd6_defifp			VNET(nd6_defifp)
 
 /*
  * Per-VNET settings related to temporary addresses.
  * NOTE(review): exact semantics are defined by code outside this view.
  */
 VNET_DEFINE(int, ip6_use_tempaddr) = 0;
 
 VNET_DEFINE(int, ip6_desync_factor);
 VNET_DEFINE(u_int32_t, ip6_temp_preferred_lifetime) = DEF_TEMP_PREFERRED_LIFETIME;
 VNET_DEFINE(u_int32_t, ip6_temp_valid_lifetime) = DEF_TEMP_VALID_LIFETIME;
 
 VNET_DEFINE(int, ip6_temp_regen_advance) = TEMPADDR_REGEN_ADVANCE;
 
 #ifdef EXPERIMENTAL
 /* Non-zero (the initial value) makes defrtr_ipv6_only_ifp() a no-op. */
 VNET_DEFINE(int, nd6_ignore_ipv6_only_ra) = 1;
 #endif
 
 SYSCTL_DECL(_net_inet6_icmp6);
 
 /*
  * Router preference levels.
  * RTPREF_MEDIUM has to be 0!
  */
 #define RTPREF_HIGH	1
 #define RTPREF_MEDIUM	0
 #define RTPREF_LOW	(-1)
 #define RTPREF_RESERVED	(-2)
 #define RTPREF_INVALID	(-3)	/* internal */
 
 /*
  * Take an additional reference on the default router entry dr.
  */
 static void
 defrouter_ref(struct nd_defrouter *dr)
 {
 	(void)refcount_acquire(&dr->refcnt);
 }
 
 /*
  * Drop one reference on dr and free the entry once the last reference is
  * gone.
  */
 void
 defrouter_rele(struct nd_defrouter *dr)
 {
 	if (!refcount_release(&dr->refcnt))
 		return;
 
 	free(dr, M_IP6NDP);
 }
 
 /*
  * Detach dr from the default router list and bump the list generation
  * count.  When drq is non-NULL the entry is appended to that queue so the
  * caller can deal with it later.  Requires the ND6 write lock.
  */
 static void
 defrouter_unlink(struct nd_defrouter *dr, struct nd6_drhead *drq)
 {
 	ND6_WLOCK_ASSERT();
 
 	TAILQ_REMOVE(&V_nd6_defrouter, dr, dr_entry);
 	if (drq != NULL)
 		TAILQ_INSERT_TAIL(drq, dr, dr_entry);
 	V_nd6_list_genid++;
 }
 
 /*
  * Receive Router Solicitation Message - just for routers.
  * Router solicitation/advertisement is mostly managed by userland program
  * (rtadvd) so here we have no function like nd6_ra_output().
  *
  * Based on RFC 2461
  *
  * m is the received packet, off the offset of the ICMPv6 header within it
  * and icmp6len the length of the ICMPv6 message.  The mbuf is consumed on
  * every path: "freeit" frees it silently, "bad" also counts the packet in
  * icp6s_badrs.  The only effect of an accepted RS is the neighbor cache
  * update through nd6_cache_lladdr().
  */
 void
 nd6_rs_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp;
 	struct ip6_hdr *ip6;
 	struct nd_router_solicit *nd_rs;
 	struct in6_addr saddr6;
 	union nd_opts ndopts;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	char *lladdr;
 	int lladdrlen;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	/*
 	 * Accept RS only when V_ip6_forwarding=1 and the interface has
 	 * no ND6_IFF_ACCEPT_RTADV.
 	 */
 	if (!V_ip6_forwarding || ND_IFINFO(ifp)->flags & ND6_IFF_ACCEPT_RTADV)
 		goto freeit;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	/* Sanity checks */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (__predict_false(ip6->ip6_hlim != 255)) {
 		ICMP6STAT_INC(icp6s_invlhlim);
 		nd6log((LOG_ERR,
 		    "%s: invalid hlim (%d) from %s to %s on %s\n", __func__,
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	/*
 	 * Don't update the neighbor cache, if src = ::.
 	 * This indicates that the src has no IP address assigned yet.
 	 */
 	saddr6 = ip6->ip6_src;
 	if (IN6_IS_ADDR_UNSPECIFIED(&saddr6))
 		goto freeit;
 
 	/* Make the whole ICMPv6 message contiguous in the first mbuf. */
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			/* No mbuf is left to free on this path. */
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	/* Re-derive the header pointer: m may have changed above. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_rs = (struct nd_router_solicit *)((caddr_t)ip6 + off);
 
 	/* What follows the fixed RS header is the option area. */
 	icmp6len -= sizeof(*nd_rs);
 	nd6_option_init(nd_rs + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "%s: invalid ND option, ignored\n", __func__));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	lladdr = NULL;
 	lladdrlen = 0;
 	if (ndopts.nd_opts_src_lladdr) {
 		/* Option payload follows the header; length is in 8-byte units. */
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	/*
 	 * The option length must equal the interface address length plus
 	 * the 2-byte option header, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO,
 		    "%s: lladdrlen mismatch for %s (if %d, RS packet %d)\n",
 		    __func__, ip6_sprintf(ip6bufs, &saddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	nd6_cache_lladdr(ifp, &saddr6, lladdr, lladdrlen, ND_ROUTER_SOLICIT, 0);
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badrs);
 	m_freem(m);
 }
 
 #ifdef EXPERIMENTAL
 /*
  * An initial update routine for draft-ietf-6man-ipv6only-flag.
  * We need to iterate over all default routers for the given
  * interface to see whether they are all advertising the "S"
  * (IPv6-Only) flag.  If they do set, otherwise unset, the
  * interface flag we later use to filter on.
  *
  * Does nothing while V_nd6_ignore_ipv6_only_ra is non-zero.  With INET
  * compiled in, the flag is not set when the interface has an IPv4 address
  * accepted by in_canforward().
  */
 static void
 defrtr_ipv6_only_ifp(struct ifnet *ifp)
 {
 	struct nd_defrouter *dr;
 	bool ipv6_only, ipv6_only_old;
 #ifdef INET
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	bool has_ipv4_addr;
 #endif
 
 	if (V_nd6_ignore_ipv6_only_ra != 0)
 		return;
 
 	/*
 	 * Start from "IPv6-only" and clear it as soon as one router on
 	 * this interface lacks the flag.  With no router on the interface
 	 * the result stays true.
 	 */
 	ipv6_only = true;
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry)
 		if (dr->ifp == ifp &&
 		    (dr->raflags & ND_RA_FLAG_IPV6_ONLY) == 0)
 			ipv6_only = false;
 	ND6_RUNLOCK();
 
 	/* The masked flag value is converted to bool by the assignment. */
 	IF_AFDATA_WLOCK(ifp);
 	ipv6_only_old = ND_IFINFO(ifp)->flags & ND6_IFF_IPV6_ONLY;
 	IF_AFDATA_WUNLOCK(ifp);
 
 	/* If nothing changed, we have an early exit. */
 	if (ipv6_only == ipv6_only_old)
 		return;
 
 #ifdef INET
 	/*
 	 * Should we want to set the IPV6-ONLY flag, check if the
 	 * interface has a non-0/0 and non-link-local IPv4 address
 	 * configured on it.  If it has we will assume working
 	 * IPv4 operations and will clear the interface flag.
 	 */
 	has_ipv4_addr = false;
 	if (ipv6_only) {
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			if (in_canforward(
 			    satosin(ifa->ifa_addr)->sin_addr)) {
 				has_ipv4_addr = true;
 				break;
 			}
 		}
 		NET_EPOCH_EXIT(et);
 	}
 	if (ipv6_only && has_ipv4_addr) {
 		log(LOG_NOTICE, "%s rcvd RA w/ IPv6-Only flag set but has IPv4 "
 		    "configured, ignoring IPv6-Only flag.\n", ifp->if_xname);
 		ipv6_only = false;
 	}
 #endif
 
 	/* Publish the final decision in the interface's ND flags. */
 	IF_AFDATA_WLOCK(ifp);
 	if (ipv6_only)
 		ND_IFINFO(ifp)->flags |= ND6_IFF_IPV6_ONLY;
 	else
 		ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY;
 	IF_AFDATA_WUNLOCK(ifp);
 
 #ifdef notyet
 	/* Send notification of flag change. */
 #endif
 }
 
 static void
 defrtr_ipv6_only_ipf_down(struct ifnet *ifp)
 {
 
 	IF_AFDATA_WLOCK(ifp);
 	ND_IFINFO(ifp)->flags &= ~ND6_IFF_IPV6_ONLY;
 	IF_AFDATA_WUNLOCK(ifp);
 }
 #endif	/* EXPERIMENTAL */
 
 /*
  * Link state change handler.  Only acts when built with EXPERIMENTAL: a
  * transition to LINK_STATE_DOWN clears the interface's IPv6-only flag.
  */
 void
 nd6_ifnet_link_event(void *arg __unused, struct ifnet *ifp, int linkstate)
 {
 
 	/*
 	 * XXX-BZ re-evaluating default router availability here might be
 	 * worthwhile: on link down the default router may have become
 	 * unreachable while another interface still has connectivity.
 	 */
 
 #ifdef EXPERIMENTAL
 	if (linkstate != LINK_STATE_DOWN)
 		return;
 
 	defrtr_ipv6_only_ipf_down(ifp);
 #endif
 }
 
 /*
  * Receive Router Advertisement Message.
  *
  * Based on RFC 2461
  * TODO: on-link bit on prefix information
  * TODO: ND_RA_FLAG_{OTHER,MANAGED} processing
  *
  * m is the received packet, off the offset of the ICMPv6 header within it
  * and icmp6len the length of the ICMPv6 message.  Processing order: basic
  * validation, per-interface parameters and default router update, prefix
  * information options, MTU option, source link-layer address option.
  * The mbuf is consumed on every path; "bad" also counts the packet in
  * icp6s_badra.
  */
 void
 nd6_ra_input(struct mbuf *m, int off, int icmp6len)
 {
 	struct ifnet *ifp;
 	struct nd_ifinfo *ndi;
 	struct ip6_hdr *ip6;
 	struct nd_router_advert *nd_ra;
 	struct in6_addr saddr6;
 	struct nd_defrouter *dr;
 	union nd_opts ndopts;
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 	int mcast;
 
 	/*
 	 * We accept RAs only when the per-interface flag
 	 * ND6_IFF_ACCEPT_RTADV is on the receiving interface.
 	 */
 	ifp = m->m_pkthdr.rcvif;
 	ndi = ND_IFINFO(ifp);
 	if (!(ndi->flags & ND6_IFF_ACCEPT_RTADV))
 		goto freeit;
 
 	/* RFC 6980: Nodes MUST silently ignore fragments */
 	if(m->m_flags & M_FRAGMENTED)
 		goto freeit;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (__predict_false(ip6->ip6_hlim != 255)) {
 		ICMP6STAT_INC(icp6s_invlhlim);
 		nd6log((LOG_ERR,
 		    "%s: invalid hlim (%d) from %s to %s on %s\n", __func__,
 		    ip6->ip6_hlim, ip6_sprintf(ip6bufs, &ip6->ip6_src),
 		    ip6_sprintf(ip6bufd, &ip6->ip6_dst), if_name(ifp)));
 		goto bad;
 	}
 
 	/* The source of an RA has to be a link-local address. */
 	saddr6 = ip6->ip6_src;
 	if (!IN6_IS_ADDR_LINKLOCAL(&saddr6)) {
 		nd6log((LOG_ERR,
 		    "%s: src %s is not link-local\n", __func__,
 		    ip6_sprintf(ip6bufs, &saddr6)));
 		goto bad;
 	}
 
 	/* Make the whole ICMPv6 message contiguous in the first mbuf. */
 	if (m->m_len < off + icmp6len) {
 		m = m_pullup(m, off + icmp6len);
 		if (m == NULL) {
 			/* No mbuf is left to free on this path. */
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			return;
 		}
 	}
 	/* Re-derive the header pointer: m may have changed above. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	nd_ra = (struct nd_router_advert *)((caddr_t)ip6 + off);
 
 	/* What follows the fixed RA header is the option area. */
 	icmp6len -= sizeof(*nd_ra);
 	nd6_option_init(nd_ra + 1, icmp6len, &ndopts);
 	if (nd6_options(&ndopts) < 0) {
 		nd6log((LOG_INFO,
 		    "%s: invalid ND option, ignored\n", __func__));
 		/* nd6_options have incremented stats */
 		goto freeit;
 	}
 
 	mcast = 0;
 	dr = NULL;
     {
 	/* Template router entry filled from this RA. */
 	struct nd_defrouter dr0;
 	u_int32_t advreachable = nd_ra->nd_ra_reachable;
 
 	/* remember if this is a multicasted advertisement */
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 		mcast = 1;
 
 	bzero(&dr0, sizeof(dr0));
 	dr0.rtaddr = saddr6;
 	dr0.raflags = nd_ra->nd_ra_flags_reserved;
 	/*
 	 * Effectively-disable routes from RA messages when
 	 * ND6_IFF_NO_RADR enabled on the receiving interface or
 	 * (ip6.forwarding == 1 && ip6.rfc6204w3 != 1).
 	 */
 	if (ndi->flags & ND6_IFF_NO_RADR)
 		dr0.rtlifetime = 0;
 	else if (V_ip6_forwarding && !V_ip6_rfc6204w3)
 		dr0.rtlifetime = 0;
 	else
 		dr0.rtlifetime = ntohs(nd_ra->nd_ra_router_lifetime);
 	dr0.expire = time_uptime + dr0.rtlifetime;
 	dr0.ifp = ifp;
 	/* unspecified or not? (RFC 2461 6.3.4) */
 	if (advreachable) {
 		advreachable = ntohl(advreachable);
 		/* Only in-range values that differ from the current one count. */
 		if (advreachable <= MAX_REACHABLE_TIME &&
 		    ndi->basereachable != advreachable) {
 			ndi->basereachable = advreachable;
 			ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
 			ndi->recalctm = V_nd6_recalc_reachtm_interval; /* reset */
 		}
 	}
 	/* A zero retransmit timer in the RA leaves the current value alone. */
 	if (nd_ra->nd_ra_retransmit)
 		ndi->retrans = ntohl(nd_ra->nd_ra_retransmit);
 	/* The hop limit may only be raised; a lower value is logged. */
 	if (nd_ra->nd_ra_curhoplimit) {
 		if (ndi->chlim < nd_ra->nd_ra_curhoplimit)
 			ndi->chlim = nd_ra->nd_ra_curhoplimit;
 		else if (ndi->chlim != nd_ra->nd_ra_curhoplimit) {
 			log(LOG_ERR, "RA with a lower CurHopLimit sent from "
 			    "%s on %s (current = %d, received = %d). "
 			    "Ignored.\n", ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    if_name(ifp), ndi->chlim, nd_ra->nd_ra_curhoplimit);
 		}
 	}
 	/* A non-NULL result is released with defrouter_rele() further down. */
 	dr = defrtrlist_update(&dr0);
 #ifdef EXPERIMENTAL
 	defrtr_ipv6_only_ifp(ifp);
 #endif
     }
 
 	/*
 	 * prefix
 	 */
 	if (ndopts.nd_opts_pi) {
 		struct nd_opt_hdr *pt;
 		struct nd_opt_prefix_info *pi = NULL;
 		struct nd_prefixctl pr;
 
 		/*
 		 * Walk the options from the first to the last prefix
 		 * information option, advancing by each option's own
 		 * length (given in units of 8 bytes).
 		 */
 		for (pt = (struct nd_opt_hdr *)ndopts.nd_opts_pi;
 		     pt <= (struct nd_opt_hdr *)ndopts.nd_opts_pi_end;
 		     pt = (struct nd_opt_hdr *)((caddr_t)pt +
 						(pt->nd_opt_len << 3))) {
 			if (pt->nd_opt_type != ND_OPT_PREFIX_INFORMATION)
 				continue;
 			pi = (struct nd_opt_prefix_info *)pt;
 
 			if (pi->nd_opt_pi_len != 4) {
 				nd6log((LOG_INFO,
 				    "%s: invalid option len %d for prefix "
 				    "information option, ignored\n", __func__,
 				    pi->nd_opt_pi_len));
 				continue;
 			}
 
 			if (128 < pi->nd_opt_pi_prefix_len) {
 				nd6log((LOG_INFO,
 				    "%s: invalid prefix len %d for prefix "
 				    "information option, ignored\n", __func__,
 				    pi->nd_opt_pi_prefix_len));
 				continue;
 			}
 
 			if (IN6_IS_ADDR_MULTICAST(&pi->nd_opt_pi_prefix)
 			 || IN6_IS_ADDR_LINKLOCAL(&pi->nd_opt_pi_prefix)) {
 				nd6log((LOG_INFO,
 				    "%s: invalid prefix %s, ignored\n",
 				    __func__, ip6_sprintf(ip6bufs,
 					&pi->nd_opt_pi_prefix)));
 				continue;
 			}
 
 			/* Translate the option into a prefix control block. */
 			bzero(&pr, sizeof(pr));
 			pr.ndpr_prefix.sin6_family = AF_INET6;
 			pr.ndpr_prefix.sin6_len = sizeof(pr.ndpr_prefix);
 			pr.ndpr_prefix.sin6_addr = pi->nd_opt_pi_prefix;
 			pr.ndpr_ifp = (struct ifnet *)m->m_pkthdr.rcvif;
 
 			pr.ndpr_raf_onlink = (pi->nd_opt_pi_flags_reserved &
 			    ND_OPT_PI_FLAG_ONLINK) ? 1 : 0;
 			pr.ndpr_raf_auto = (pi->nd_opt_pi_flags_reserved &
 			    ND_OPT_PI_FLAG_AUTO) ? 1 : 0;
 			pr.ndpr_raf_ra_derived = 1;
 			pr.ndpr_plen = pi->nd_opt_pi_prefix_len;
 			pr.ndpr_vltime = ntohl(pi->nd_opt_pi_valid_time);
 			pr.ndpr_pltime = ntohl(pi->nd_opt_pi_preferred_time);
 			(void)prelist_update(&pr, dr, m, mcast);
 		}
 	}
 	/* The router entry is not needed past the prefix processing. */
 	if (dr != NULL) {
 		defrouter_rele(dr);
 		dr = NULL;
 	}
 
 	/*
 	 * MTU
 	 */
 	if (ndopts.nd_opts_mtu && ndopts.nd_opts_mtu->nd_opt_mtu_len == 1) {
 		u_long mtu;
 		u_long maxmtu;
 
 		mtu = (u_long)ntohl(ndopts.nd_opts_mtu->nd_opt_mtu_mtu);
 
 		/* lower bound */
 		if (mtu < IPV6_MMTU) {
 			nd6log((LOG_INFO, "%s: bogus mtu option mtu=%lu sent "
 			    "from %s, ignoring\n", __func__,
 			    mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src)));
 			goto skip;
 		}
 
 		/* upper bound: the smaller of ndi->maxmtu (if set) and if_mtu */
 		maxmtu = (ndi->maxmtu && ndi->maxmtu < ifp->if_mtu)
 		    ? ndi->maxmtu : ifp->if_mtu;
 		if (mtu <= maxmtu) {
 			int change = (ndi->linkmtu != mtu);
 
 			ndi->linkmtu = mtu;
 			if (change) {
 				/* in6_maxmtu may change */
 				in6_setmaxmtu();
 				rt_updatemtu(ifp);
 			}
 		} else {
 			nd6log((LOG_INFO, "%s: bogus mtu=%lu sent from %s; "
 			    "exceeds maxmtu %lu, ignoring\n", __func__,
 			    mtu, ip6_sprintf(ip6bufs, &ip6->ip6_src), maxmtu));
 		}
 	}
 
  skip:
 
 	/*
 	 * Source link layer address
 	 */
     {
 	char *lladdr = NULL;
 	int lladdrlen = 0;
 
 	if (ndopts.nd_opts_src_lladdr) {
 		/* Option payload follows the header; length is in 8-byte units. */
 		lladdr = (char *)(ndopts.nd_opts_src_lladdr + 1);
 		lladdrlen = ndopts.nd_opts_src_lladdr->nd_opt_len << 3;
 	}
 
 	/*
 	 * The option length must equal the interface address length plus
 	 * the 2-byte option header, rounded up to a multiple of 8.
 	 */
 	if (lladdr && ((ifp->if_addrlen + 2 + 7) & ~7) != lladdrlen) {
 		nd6log((LOG_INFO,
 		    "%s: lladdrlen mismatch for %s (if %d, RA packet %d)\n",
 		    __func__, ip6_sprintf(ip6bufs, &saddr6),
 		    ifp->if_addrlen, lladdrlen - 2));
 		goto bad;
 	}
 
 	nd6_cache_lladdr(ifp, &saddr6, lladdr,
 	    lladdrlen, ND_ROUTER_ADVERT, 0);
 
 	/*
 	 * Installing a link-layer address might change the state of the
 	 * router's neighbor cache, which might also affect our on-link
 	 * detection of advertised prefixes.
 	 */
 	pfxlist_onlink_check();
     }
 
  freeit:
 	m_freem(m);
 	return;
 
  bad:
 	ICMP6STAT_INC(icp6s_badra);
 	m_freem(m);
 }
 
 /* PFXRTR */
 /*
  * Find the entry on pr's advertising-router list that refers to dr.
  * Returns that entry, or NULL when dr is not on the list.  The ND6 lock
  * must be held.
  */
 static struct nd_pfxrouter *
 pfxrtr_lookup(struct nd_prefix *pr, struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *pfr;
 
 	ND6_LOCK_ASSERT();
 
 	LIST_FOREACH(pfr, &pr->ndpr_advrtrs, pfr_entry) {
 		if (pfr->router == dr)
 			return (pfr);
 	}
 	return (NULL);
 }
 
 /*
  * Record dr as a router advertising prefix pr, unless it already is one.
  *
  * The new list entry is allocated without the ND6 lock held, so the lookup
  * is repeated under the write lock before inserting.  An on-link check of
  * the prefix list is triggered only when an entry was actually added.
  * Must be called without the ND6 lock.
  */
 static void
 pfxrtr_add(struct nd_prefix *pr, struct nd_defrouter *dr)
 {
 	struct nd_pfxrouter *entry;
 	bool present, inserted;
 
 	ND6_UNLOCK_ASSERT();
 
 	/* Cheap check first: bail out when dr is already listed. */
 	ND6_RLOCK();
 	present = (pfxrtr_lookup(pr, dr) != NULL);
 	ND6_RUNLOCK();
 	if (present)
 		return;
 
 	entry = malloc(sizeof(*entry), M_IP6NDP, M_NOWAIT | M_ZERO);
 	if (entry == NULL)
 		return;
 	defrouter_ref(dr);
 	entry->router = dr;
 
 	ND6_WLOCK();
 	inserted = (pfxrtr_lookup(pr, dr) == NULL);
 	if (inserted) {
 		LIST_INSERT_HEAD(&pr->ndpr_advrtrs, entry, pfr_entry);
 	} else {
 		/* We lost a race to add the reference. */
 		defrouter_rele(dr);
 		free(entry, M_IP6NDP);
 	}
 	ND6_WUNLOCK();
 
 	if (inserted)
 		pfxlist_onlink_check();
 }
 
 /*
  * Unlink an advertising-router entry from its prefix, free it and drop the
  * reference it held on the default router.  Asserts the ND6 write lock.
  */
 static void
 pfxrtr_del(struct nd_pfxrouter *pfr)
 {
 	struct nd_defrouter *rtr;
 
 	ND6_WLOCK_ASSERT();
 
 	/* Remember the router so the entry can be freed first. */
 	rtr = pfr->router;
 	LIST_REMOVE(pfr, pfr_entry);
 	free(pfr, M_IP6NDP);
 	defrouter_rele(rtr);
 }
 
 /* Default router list processing sub routines. */
 /*
  * Install a default route through router 'new' in the FIB of its interface.
  * On success an RTM_ADD routing message is emitted and the router is marked
  * as installed; on failure nothing else is done.  Asserts the network epoch.
  */
 static void
 defrouter_addreq(struct nd_defrouter *new)
 {
 	struct rib_cmd_info rc = {};
 	struct sockaddr_in6 gw = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = new->rtaddr,
 	};
 	struct nhop_object *nh;
 	uint32_t fibnum = new->ifp->if_fib;
 
 	NET_EPOCH_ASSERT();
 
 	if (rib_add_default_route(fibnum, AF_INET6, new->ifp,
 	    (struct sockaddr *)&gw, &rc) != 0)
 		return;
 
 	nh = nhop_select_func(rc.rc_nh_new, 0);
 	rt_routemsg(RTM_ADD, rc.rc_rt, nh, fibnum);
 	new->installed = 1;
 }
 
 /*
  * Remove the default route that goes through router 'dr' from the FIB of
  * its interface, emitting an RTM_DELETE routing message when a route was
  * actually deleted.  The router is marked as not installed regardless of
  * the outcome.
  *
  * This is just a subroutine function for defrouter_select_fib(), and
  * should not be called from anywhere else.
  */
 static void
 defrouter_delreq(struct nd_defrouter *dr)
 {
 	struct epoch_tracker et;
 	struct rib_cmd_info rc;
 	struct nhop_object *nh;
 	uint32_t fibnum = dr->ifp->if_fib;
 	/* All-zero destination with prefix length 0: the default route. */
 	struct sockaddr_in6 dst = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 	};
 	struct sockaddr_in6 gw = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = dr->rtaddr,
 	};
 
 	NET_EPOCH_ENTER(et);
 	if (rib_del_route_px(fibnum, (struct sockaddr *)&dst, 0,
 	    rib_match_gw, (struct sockaddr *)&gw, 0, &rc) == 0) {
 		nh = nhop_select_func(rc.rc_nh_old, 0);
 		rt_routemsg(RTM_DELETE, rc.rc_rt, nh, fibnum);
 	}
 	NET_EPOCH_EXIT(et);
 
 	dr->installed = 0;
 }
 
 static void
 defrouter_del(struct nd_defrouter *dr)
 {
 	struct nd_defrouter *deldr = NULL;
 	struct nd_prefix *pr;
 	struct nd_pfxrouter *pfxrtr;
 
 	ND6_UNLOCK_ASSERT();
 
 	/*
 	 * Flush all the routing table entries that use the router
 	 * as a next hop.
 	 */
 	if (ND_IFINFO(dr->ifp)->flags & ND6_IFF_ACCEPT_RTADV)
 		rt6_flush(&dr->rtaddr, dr->ifp);
 
 #ifdef EXPERIMENTAL
 	defrtr_ipv6_only_ifp(dr->ifp);
 #endif
 
 	if (dr->installed) {
 		deldr = dr;
 		defrouter_delreq(dr);
 	}
 
 	/*
 	 * Also delete all the pointers to the router in each prefix lists.
 	 */
 	ND6_WLOCK();
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if ((pfxrtr = pfxrtr_lookup(pr, dr)) != NULL)
 			pfxrtr_del(pfxrtr);
 	}
 	ND6_WUNLOCK();
 
 	pfxlist_onlink_check();
 
 	/*
 	 * If the router is the primary one, choose a new one.
 	 * Note that defrouter_select_fib() will remove the current
          * gateway from the routing table.
 	 */
 	if (deldr)
 		defrouter_select_fib(deldr->ifp->if_fib);
 
 	/*
 	 * Release the list reference.
 	 */
 	defrouter_rele(dr);
 }
 
 struct nd_defrouter *
 defrouter_lookup_locked(const struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *dr;
 
 	ND6_LOCK_ASSERT();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry)
 		if (dr->ifp == ifp && IN6_ARE_ADDR_EQUAL(addr, &dr->rtaddr)) {
 			defrouter_ref(dr);
 			return (dr);
 		}
 	return (NULL);
 }
 
 /*
  * Wrapper around defrouter_lookup_locked() that takes the ND6 read lock for
  * the duration of the lookup.  A non-NULL result carries a reference.
  */
 struct nd_defrouter *
 defrouter_lookup(const struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *found;
 
 	ND6_RLOCK();
 	found = defrouter_lookup_locked(addr, ifp);
 	ND6_RUNLOCK();
 
 	return (found);
 }
 
 /*
  * Remove all default routes from default router list.
  */
 void
 defrouter_reset(void)
 {
 	struct nd_defrouter *dr, **dra;
 	int count, i;
 
 	count = i = 0;
 
 	/*
 	 * We can't delete routes with the ND lock held, so make a copy of the
 	 * current default router list and use that when deleting routes.
 	 */
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry)
 		count++;
 	ND6_RUNLOCK();
 
 	dra = malloc(count * sizeof(*dra), M_TEMP, M_WAITOK | M_ZERO);
 
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		if (i == count)
 			break;
 		defrouter_ref(dr);
 		dra[i++] = dr;
 	}
 	ND6_RUNLOCK();
 
 	for (i = 0; i < count && dra[i] != NULL; i++) {
 		defrouter_delreq(dra[i]);
 		defrouter_rele(dra[i]);
 	}
 	free(dra, M_TEMP);
 
 	/*
 	 * XXX should we also nuke any default routers in the kernel, by
 	 * going through them by rtalloc1()?
 	 */
 }
 
 /*
  * Look up the default router list entry for 'addr' on 'ifp' and remove it.
  * Returns true if a matching entry was found, false otherwise.
  */
 bool
 defrouter_remove(struct in6_addr *addr, struct ifnet *ifp)
 {
 	struct nd_defrouter *dr;
 
 	/* Find and unlink in one write-locked section. */
 	ND6_WLOCK();
 	dr = defrouter_lookup_locked(addr, ifp);
 	if (dr != NULL)
 		defrouter_unlink(dr, NULL);
 	ND6_WUNLOCK();
 
 	if (dr == NULL)
 		return (false);
 
 	defrouter_del(dr);
 	/* Drop the reference obtained from the lookup. */
 	defrouter_rele(dr);
 	return (true);
 }
 
 /*
  * Map the router-preference bits of a router's RA flags to an RTPREF_*
  * value for default router selection.  The field is regarded as a 2-bit
  * signed integer; the reserved encoding is treated as medium.
  */
 static int
 rtpref(struct nd_defrouter *dr)
 {
 	int bits;
 
 	bits = dr->raflags & ND_RA_FLAG_RTPREF_MASK;
 	if (bits == ND_RA_FLAG_RTPREF_HIGH)
 		return (RTPREF_HIGH);
 	if (bits == ND_RA_FLAG_RTPREF_MEDIUM || bits == ND_RA_FLAG_RTPREF_RSV)
 		return (RTPREF_MEDIUM);
 	if (bits == ND_RA_FLAG_RTPREF_LOW)
 		return (RTPREF_LOW);
 
 	/*
 	 * This case should never happen.  If it did, it would mean a
 	 * serious bug of kernel internal.  We thus always bark here.
 	 * Or, can we even panic?
 	 */
 	log(LOG_ERR, "rtpref: impossible RA flag %x\n", dr->raflags);
 	return (RTPREF_INVALID);
 }
 
 /*
  * Report whether router 'dr' has a neighbor cache entry for which
  * ND6_IS_LLINFO_PROBREACH() holds.  Returns false when no entry is found.
  */
 static bool
 is_dr_reachable(const struct nd_defrouter *dr)
 {
 	struct llentry *lle;
 	bool probreach;
 
 	lle = nd6_lookup(&dr->rtaddr, LLE_SF(AF_INET6, 0), dr->ifp);
 	if (lle == NULL)
 		return (false);
 
 	/* Sample the state, then release the lock on the entry. */
 	probreach = ND6_IS_LLINFO_PROBREACH(lle);
 	LLE_RUNLOCK(lle);
 	return (probreach);
 }
 
 /*
  * Default Router Selection according to Section 6.3.6 of RFC 2461 and
  * draft-ietf-ipngwg-router-selection:
  * 1) Routers that are reachable or probably reachable should be preferred.
  *    If we have more than one (probably) reachable router, prefer ones
  *    with the highest router preference.
  * 2) When no routers on the list are known to be reachable or
  *    probably reachable, routers SHOULD be selected in a round-robin
  *    fashion, regardless of router preference values.
  * 3) If the Default Router List is empty, assume that all
  *    destinations are on-link.
  *
  * We assume nd_defrouter is sorted by router preference value.
  * Since the code below covers both with and without router preference cases,
  * we do not need to classify the cases by ifdef.
  *
  * At this moment, we do not try to install more than one default router,
  * even when the multipath routing is available, because we're not sure about
  * the benefits for stub hosts comparing to the risk of making the code
  * complicated and the possibility of introducing bugs.
  *
  * We maintain a single list of routers for multiple FIBs, only considering one
  * at a time based on the receiving interface's FIB. If @fibnum is RT_ALL_FIBS,
  * we do the whole thing multiple times.
  *
  * Reference counting: whenever selected_dr or installed_dr is non-NULL it
  * owns a reference of its own on the router it points to, even when both
  * point to the same router.  Both references are dropped before returning.
  */
 void
 defrouter_select_fib(int fibnum)
 {
 	struct epoch_tracker et;
 	struct nd_defrouter *dr, *selected_dr, *installed_dr;
 
 	if (fibnum == RT_ALL_FIBS) {
 		for (fibnum = 0; fibnum < rt_numfibs; fibnum++) {
 			defrouter_select_fib(fibnum);
 		}
 		return;
 	}
 
 	ND6_RLOCK();
 	/*
 	 * Let's handle easy case (3) first:
 	 * If default router list is empty, there's nothing to be done.
 	 */
 	if (TAILQ_EMPTY(&V_nd6_defrouter)) {
 		ND6_RUNLOCK();
 		return;
 	}
 
 	/*
 	 * Search for a (probably) reachable router from the list.
 	 * We just pick up the first reachable one (if any), assuming that
 	 * the ordering rule of the list described in defrtrlist_update().
 	 */
 	selected_dr = installed_dr = NULL;
 	NET_EPOCH_ENTER(et);
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		if (dr->ifp->if_fib != fibnum)
 			continue;
 
 		if (selected_dr == NULL && is_dr_reachable(dr)) {
 			selected_dr = dr;
 			defrouter_ref(selected_dr);
 		}
 
 		if (dr->installed) {
 			if (installed_dr == NULL) {
 				installed_dr = dr;
 				defrouter_ref(installed_dr);
 			} else {
 				/*
 				 * this should not happen.
 				 * warn for diagnosis.
 				 */
 				log(LOG_ERR, "defrouter_select_fib: more than "
 				             "one router is installed\n");
 			}
 		}
 	}
 
 	/*
 	 * If none of the default routers was found to be reachable,
 	 * round-robin the list regardless of preference.
 	 * Otherwise, if we have an installed router, check if the selected
 	 * (reachable) router should really be preferred to the installed one.
 	 * We only prefer the new router when the old one is not reachable
 	 * or when the new one has a really higher preference value.
 	 */
 	if (selected_dr == NULL) {
 		if (installed_dr == NULL ||
 		    TAILQ_NEXT(installed_dr, dr_entry) == NULL)
 			dr = TAILQ_FIRST(&V_nd6_defrouter);
 		else
 			dr = TAILQ_NEXT(installed_dr, dr_entry);
 
 		/*
 		 * Ensure we select a router for this FIB.  This may pick the
 		 * installed router again, in which case it is referenced
 		 * twice, once through each pointer.
 		 */
 		TAILQ_FOREACH_FROM(dr, &V_nd6_defrouter, dr_entry) {
 			if (dr->ifp->if_fib == fibnum) {
 				selected_dr = dr;
 				defrouter_ref(selected_dr);
 				break;
 			}
 		}
 	} else if (installed_dr != NULL) {
 		if (is_dr_reachable(installed_dr) &&
 		    rtpref(selected_dr) <= rtpref(installed_dr)) {
 			/*
 			 * Keep the installed router.  Take selected_dr's own
 			 * reference on it before dropping the reference on
 			 * the router it replaces, which may be the very same
 			 * router.
 			 */
 			defrouter_ref(installed_dr);
 			defrouter_rele(selected_dr);
 			selected_dr = installed_dr;
 		}
 	}
 	ND6_RUNLOCK();
 
 	/*
 	 * If we selected a router for this FIB and it's different
 	 * than the installed one, remove the installed router and
 	 * install the selected one in its place.
 	 */
 	if (installed_dr != selected_dr) {
 		if (installed_dr != NULL)
 			defrouter_delreq(installed_dr);
 		if (selected_dr != NULL)
 			defrouter_addreq(selected_dr);
 	}
 
 	/*
 	 * Drop both references unconditionally.  Releasing only one of them
 	 * when the two pointers are equal would leak a reference each time
 	 * the installed router is selected again.
 	 */
 	if (installed_dr != NULL)
 		defrouter_rele(installed_dr);
 	if (selected_dr != NULL)
 		defrouter_rele(selected_dr);
 	NET_EPOCH_EXIT(et);
 }
 
 static struct nd_defrouter *
 defrtrlist_update(struct nd_defrouter *new)
 {
 	struct nd_defrouter *dr, *n;
 	uint64_t genid;
 	int oldpref;
 	bool writelocked;
 
 	if (new->rtlifetime == 0) {
 		defrouter_remove(&new->rtaddr, new->ifp);
 		return (NULL);
 	}
 
 	ND6_RLOCK();
 	writelocked = false;
 restart:
 	dr = defrouter_lookup_locked(&new->rtaddr, new->ifp);
 	if (dr != NULL) {
 		oldpref = rtpref(dr);
 
 		/* override */
 		dr->raflags = new->raflags; /* XXX flag check */
 		dr->rtlifetime = new->rtlifetime;
 		dr->expire = new->expire;
 
 		/*
 		 * If the preference does not change, there's no need
 		 * to sort the entries. Also make sure the selected
 		 * router is still installed in the kernel.
 		 */
 		if (dr->installed && rtpref(new) == oldpref) {
 			if (writelocked)
 				ND6_WUNLOCK();
 			else
 				ND6_RUNLOCK();
 			return (dr);
 		}
 	}
 
 	/*
 	 * The router needs to be reinserted into the default router
 	 * list, so upgrade to a write lock. If that fails and the list
 	 * has potentially changed while the lock was dropped, we'll
 	 * redo the lookup with the write lock held.
 	 */
 	if (!writelocked) {
 		writelocked = true;
 		if (!ND6_TRY_UPGRADE()) {
 			genid = V_nd6_list_genid;
 			ND6_RUNLOCK();
 			ND6_WLOCK();
 			if (genid != V_nd6_list_genid)
 				goto restart;
 		}
 	}
 
 	if (dr != NULL) {
 		/*
 		 * The preferred router may have changed, so relocate this
 		 * router.
 		 */
 		TAILQ_REMOVE(&V_nd6_defrouter, dr, dr_entry);
 		n = dr;
 	} else {
 		n = malloc(sizeof(*n), M_IP6NDP, M_NOWAIT | M_ZERO);
 		if (n == NULL) {
 			ND6_WUNLOCK();
 			return (NULL);
 		}
 		memcpy(n, new, sizeof(*n));
 		/* Initialize with an extra reference for the caller. */
 		refcount_init(&n->refcnt, 2);
 	}
 
 	/*
 	 * Insert the new router in the Default Router List;
 	 * The Default Router List should be in the descending order
 	 * of router-preferece.  Routers with the same preference are
 	 * sorted in the arriving time order.
 	 */
 
 	/* insert at the end of the group */
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		if (rtpref(n) > rtpref(dr))
 			break;
 	}
 	if (dr != NULL)
 		TAILQ_INSERT_BEFORE(dr, n, dr_entry);
 	else
 		TAILQ_INSERT_TAIL(&V_nd6_defrouter, n, dr_entry);
 	V_nd6_list_genid++;
 	ND6_WUNLOCK();
 
 	defrouter_select_fib(new->ifp->if_fib);
 
 	return (n);
 }
 
 /*
  * Derive the absolute preferred and expiry times of a prefix from its
  * preferred and valid lifetimes, relative to the current uptime.  An
  * infinite lifetime is stored as 0.  Always returns 0.
  */
 static int
 in6_init_prefix_ltimes(struct nd_prefix *ndpr)
 {
 	ndpr->ndpr_preferred =
 	    (ndpr->ndpr_pltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + ndpr->ndpr_pltime;
 	ndpr->ndpr_expire =
 	    (ndpr->ndpr_vltime == ND6_INFINITE_LIFETIME) ?
 	    0 : time_uptime + ndpr->ndpr_vltime;
 
 	return (0);
 }
 
 /*
  * Fill in the absolute expiry and preferred times of address lifetime
  * 'lt6' from its valid and preferred lifetimes, relative to the current
  * uptime.  An infinite lifetime is stored as 0.  'new' is not used.
  */
 static void
 in6_init_address_ltimes(struct nd_prefix *new, struct in6_addrlifetime *lt6)
 {
 	/* init ia6t_expire */
 	lt6->ia6t_expire = 0;
 	if (lt6->ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		lt6->ia6t_expire = time_uptime;
 		lt6->ia6t_expire += lt6->ia6t_vltime;
 	}
 
 	/* init ia6t_preferred */
 	lt6->ia6t_preferred = 0;
 	if (lt6->ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		lt6->ia6t_preferred = time_uptime;
 		lt6->ia6t_preferred += lt6->ia6t_pltime;
 	}
 }
 
 static struct in6_ifaddr *
 in6_ifadd(struct nd_prefixctl *pr, int mcast)
 {
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct ifaddr *ifa;
 	struct in6_aliasreq ifra;
 	struct in6_ifaddr *ia, *ib;
 	int error, plen0;
 	struct in6_addr mask;
 	int prefixlen = pr->ndpr_plen;
 	int updateflags;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	in6_prefixlen2mask(&mask, prefixlen);
 
 	/*
 	 * find a link-local address (will be interface ID).
 	 * Is it really mandatory? Theoretically, a global or a site-local
 	 * address can be configured without a link-local address, if we
 	 * have a unique interface identifier...
 	 *
 	 * it is not mandatory to have a link-local address, we can generate
 	 * interface identifier on the fly.  we do this because:
 	 * (1) it should be the easiest way to find interface identifier.
 	 * (2) RFC2462 5.4 suggesting the use of the same interface identifier
 	 * for multiple addresses on a single interface, and possible shortcut
 	 * of DAD.  we omitted DAD for this reason in the past.
 	 * (3) a user can prevent autoconfiguration of global address
 	 * by removing link-local address by hand (this is partly because we
 	 * don't have other way to control the use of IPv6 on an interface.
 	 * this has been our design choice - cf. NRL's "ifconfig auto").
 	 * (4) it is easier to manage when an interface has addresses
 	 * with the same interface identifier, than to have multiple addresses
 	 * with different interface identifiers.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp, 0); /* 0 is OK? */
 	if (ifa)
 		ib = (struct in6_ifaddr *)ifa;
 	else
 		return NULL;
 
 	/* prefixlen + ifidlen must be equal to 128 */
 	plen0 = in6_mask2len(&ib->ia_prefixmask.sin6_addr, NULL);
 	if (prefixlen != plen0) {
 		ifa_free(ifa);
 		nd6log((LOG_INFO,
 		    "%s: wrong prefixlen for %s (prefix=%d ifid=%d)\n",
 		    __func__, if_name(ifp), prefixlen, 128 - plen0));
 		return NULL;
 	}
 
 	/* make ifaddr */
 	in6_prepare_ifra(&ifra, &pr->ndpr_prefix.sin6_addr, &mask);
 
 	IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr, &mask);
 	/* interface ID */
 	ifra.ifra_addr.sin6_addr.s6_addr32[0] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[0] & ~mask.s6_addr32[0]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[1] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[1] & ~mask.s6_addr32[1]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[2] & ~mask.s6_addr32[2]);
 	ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
 	    (ib->ia_addr.sin6_addr.s6_addr32[3] & ~mask.s6_addr32[3]);
 	ifa_free(ifa);
 
 	/* lifetimes. */
 	ifra.ifra_lifetime.ia6t_vltime = pr->ndpr_vltime;
 	ifra.ifra_lifetime.ia6t_pltime = pr->ndpr_pltime;
 
 	/* XXX: scope zone ID? */
 
 	ifra.ifra_flags |= IN6_IFF_AUTOCONF; /* obey autoconf */
 
 	/*
 	 * Make sure that we do not have this address already.  This should
 	 * usually not happen, but we can still see this case, e.g., if we
 	 * have manually configured the exact address to be configured.
 	 */
 	ifa = (struct ifaddr *)in6ifa_ifpwithaddr(ifp,
 	    &ifra.ifra_addr.sin6_addr);
 	if (ifa != NULL) {
 		ifa_free(ifa);
 		/* this should be rare enough to make an explicit log */
 		log(LOG_INFO, "in6_ifadd: %s is already configured\n",
 		    ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr));
 		return (NULL);
 	}
 
 	/*
 	 * Allocate ifaddr structure, link into chain, etc.
 	 * If we are going to create a new address upon receiving a multicasted
 	 * RA, we need to impose a random delay before starting DAD.
 	 * [draft-ietf-ipv6-rfc2462bis-02.txt, Section 5.4.2]
 	 */
 	updateflags = 0;
 	if (mcast)
 		updateflags |= IN6_IFAUPDATE_DADDELAY;
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0) {
 		nd6log((LOG_ERR,
 		    "%s: failed to make ifaddr %s on %s (errno=%d)\n", __func__,
 		    ip6_sprintf(ip6buf, &ifra.ifra_addr.sin6_addr),
 		    if_name(ifp), error));
 		return (NULL);	/* ifaddr must not have been allocated. */
 	}
 
 	ia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
 	/*
 	 * XXXRW: Assumption of non-NULLness here might not be true with
 	 * fine-grained locking -- should we validate it?  Or just return
 	 * earlier ifa rather than looking it up again?
 	 */
 	return (ia);		/* this is always non-NULL  and referenced. */
 }
 
 /*
  * Find the prefix list entry that has the same interface, prefix length
  * and prefix bits as 'key'.  On a match a reference is taken on the entry
  * and it is returned; otherwise NULL is returned.  Asserts that the ND6
  * lock is held.
  */
 static struct nd_prefix *
 nd6_prefix_lookup_locked(struct nd_prefixctl *key)
 {
 	struct nd_prefix *pr;
 
 	ND6_LOCK_ASSERT();
 
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (key->ndpr_ifp != pr->ndpr_ifp)
 			continue;
 		if (key->ndpr_plen != pr->ndpr_plen)
 			continue;
 		if (!in6_are_prefix_equal(&key->ndpr_prefix.sin6_addr,
 		    &pr->ndpr_prefix.sin6_addr, key->ndpr_plen))
 			continue;
 		nd6_prefix_ref(pr);
 		return (pr);
 	}
 	return (NULL);
 }
 
 /*
  * Wrapper around nd6_prefix_lookup_locked() that takes the ND6 read lock
  * for the duration of the lookup.  A non-NULL result carries a reference.
  */
 struct nd_prefix *
 nd6_prefix_lookup(struct nd_prefixctl *key)
 {
 	struct nd_prefix *found;
 
 	ND6_RLOCK();
 	found = nd6_prefix_lookup_locked(key);
 	ND6_RUNLOCK();
 
 	return (found);
 }
 
 /*
  * Take an additional reference on prefix 'pr'.
  */
 void
 nd6_prefix_ref(struct nd_prefix *pr)
 {
 	refcount_acquire(&pr->ndpr_refcnt);
 }
 
 /*
  * Drop a reference on prefix 'pr' and free it when that was the last one.
  * By then the prefix must no longer have any advertising routers.
  */
 void
 nd6_prefix_rele(struct nd_prefix *pr)
 {
 	if (!refcount_release(&pr->ndpr_refcnt))
 		return;
 
 	KASSERT(LIST_EMPTY(&pr->ndpr_advrtrs),
 	    ("prefix %p has advertising routers", pr));
 	free(pr, M_IP6NDP);
 }
 
 int
 nd6_prelist_add(struct nd_prefixctl *pr, struct nd_defrouter *dr,
     struct nd_prefix **newp)
 {
 	struct nd_prefix *new;
 	char ip6buf[INET6_ADDRSTRLEN];
 	int error;
 
 	new = malloc(sizeof(*new), M_IP6NDP, M_NOWAIT | M_ZERO);
 	if (new == NULL)
 		return (ENOMEM);
 	refcount_init(&new->ndpr_refcnt, newp != NULL ? 2 : 1);
 	new->ndpr_ifp = pr->ndpr_ifp;
 	new->ndpr_prefix = pr->ndpr_prefix;
 	new->ndpr_plen = pr->ndpr_plen;
 	new->ndpr_vltime = pr->ndpr_vltime;
 	new->ndpr_pltime = pr->ndpr_pltime;
 	new->ndpr_flags = pr->ndpr_flags;
 	if ((error = in6_init_prefix_ltimes(new)) != 0) {
 		free(new, M_IP6NDP);
 		return (error);
 	}
 	new->ndpr_lastupdate = time_uptime;
 
 	/* initialization */
 	LIST_INIT(&new->ndpr_advrtrs);
 	in6_prefixlen2mask(&new->ndpr_mask, new->ndpr_plen);
 	/* make prefix in the canonical form */
 	IN6_MASK_ADDR(&new->ndpr_prefix.sin6_addr, &new->ndpr_mask);
 
 	ND6_WLOCK();
 	LIST_INSERT_HEAD(&V_nd_prefix, new, ndpr_entry);
 	V_nd6_list_genid++;
 	ND6_WUNLOCK();
 
 	/* ND_OPT_PI_FLAG_ONLINK processing */
 	if (new->ndpr_raf_onlink) {
 		struct epoch_tracker et;
 
 		ND6_ONLINK_LOCK();
 		NET_EPOCH_ENTER(et);
 		if ((error = nd6_prefix_onlink(new)) != 0) {
 			nd6log((LOG_ERR, "%s: failed to make the prefix %s/%d "
 			    "on-link on %s (errno=%d)\n", __func__,
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp), error));
 			/* proceed anyway. XXX: is it correct? */
 		}
 		NET_EPOCH_EXIT(et);
 		ND6_ONLINK_UNLOCK();
 	}
 
 	if (dr != NULL)
 		pfxrtr_add(new, dr);
 	if (newp != NULL)
 		*newp = new;
 	return (0);
 }
 
 /*
  * Take prefix 'pr' off the prefix list and bump the list generation
  * count.  If 'list' is non-NULL the prefix is then put at the head of that
  * caller-provided list.
  *
  * The ND6 write lock must be held.
  */
 void
 nd6_prefix_unlink(struct nd_prefix *pr, struct nd_prhead *list)
 {
 	ND6_WLOCK_ASSERT();
 
 	LIST_REMOVE(pr, ndpr_entry);
 	V_nd6_list_genid++;
 
 	if (list == NULL)
 		return;
 	LIST_INSERT_HEAD(list, pr, ndpr_entry);
 }
 
 /*
  * Free an unlinked prefix, first marking it off-link if necessary.
  */
 void
 nd6_prefix_del(struct nd_prefix *pr)
 {
 	struct nd_pfxrouter *pfr, *next;
 	int e;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	KASSERT(pr->ndpr_addrcnt == 0,
 	    ("prefix %p has referencing addresses", pr));
 	ND6_UNLOCK_ASSERT();
 
 	/*
 	 * Though these flags are now meaningless, we'd rather keep the value
 	 * of pr->ndpr_raf_onlink and pr->ndpr_raf_auto not to confuse users
 	 * when executing "ndp -p".
 	 */
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0) {
 		ND6_ONLINK_LOCK();
 		if ((e = nd6_prefix_offlink(pr)) != 0) {
 			nd6log((LOG_ERR,
 			    "%s: failed to make the prefix %s/%d offlink on %s "
 			    "(errno=%d)\n", __func__,
 			    ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 			    pr->ndpr_plen, if_name(pr->ndpr_ifp), e));
 			/* what should we do? */
 		}
 		ND6_ONLINK_UNLOCK();
 	}
 
 	/* Release references to routers that have advertised this prefix. */
 	ND6_WLOCK();
 	LIST_FOREACH_SAFE(pfr, &pr->ndpr_advrtrs, pfr_entry, next)
 		pfxrtr_del(pfr);
 	ND6_WUNLOCK();
 
 	nd6_prefix_rele(pr);
 
 	pfxlist_onlink_check();
 }
 
 /*
  * Process one advertised prefix ('new') for interface new->ndpr_ifp.
  *
  * First the prefix list is brought up to date: an existing entry has its
  * L/A flags and lifetimes refreshed and is put on-link if needed, otherwise
  * a new entry is created through nd6_prelist_add().  Then, when the A bit
  * is set, the lifetimes of the autoconfigured addresses that reference the
  * prefix are updated, or a new address is formed via in6_ifadd() if no
  * address references it yet.
  *
  * 'dr' (may be NULL) is recorded as an advertising router of the prefix,
  * 'm' (may be NULL) is only consulted for the authentication mbuf flags,
  * and 'mcast' is passed through to in6_ifadd().  Asserts the network epoch.
  * Returns 0 or an errno value.
  */
 static int
 prelist_update(struct nd_prefixctl *new, struct nd_defrouter *dr,
     struct mbuf *m, int mcast)
 {
 	struct in6_ifaddr *ia6 = NULL, *ia6_match = NULL;
 	struct ifaddr *ifa;
 	struct ifnet *ifp = new->ndpr_ifp;
 	struct nd_prefix *pr;
 	int error = 0;
 	int auth;
 	struct in6_addrlifetime lt6_tmp;
 	char ip6buf[INET6_ADDRSTRLEN];
 
 	NET_EPOCH_ASSERT();
 
 	/*
 	 * auth stays 0 unless both M_AUTHIPHDR and M_AUTHIPDGM are defined
 	 * and set on the mbuf.
 	 */
 	auth = 0;
 	if (m) {
 		/*
 		 * Authenticity for NA consists authentication for
 		 * both IP header and IP datagrams, doesn't it ?
 		 */
 #if defined(M_AUTHIPHDR) && defined(M_AUTHIPDGM)
 		auth = ((m->m_flags & M_AUTHIPHDR) &&
 		    (m->m_flags & M_AUTHIPDGM));
 #endif
 	}
 
 	/* A successful lookup returns pr with a reference; see 'end'. */
 	if ((pr = nd6_prefix_lookup(new)) != NULL) {
 		/*
 		 * nd6_prefix_lookup() ensures that pr and new have the same
 		 * prefix on a same interface.
 		 */
 
 		/*
 		 * Update prefix information.  Note that the on-link (L) bit
 		 * and the autonomous (A) bit should NOT be changed from 1
 		 * to 0.
 		 */
 		if (new->ndpr_raf_onlink == 1)
 			pr->ndpr_raf_onlink = 1;
 		if (new->ndpr_raf_auto == 1)
 			pr->ndpr_raf_auto = 1;
 		/* Lifetimes are only refreshed by an on-link advertisement. */
 		if (new->ndpr_raf_onlink) {
 			pr->ndpr_vltime = new->ndpr_vltime;
 			pr->ndpr_pltime = new->ndpr_pltime;
 			(void)in6_init_prefix_ltimes(pr); /* XXX error case? */
 			pr->ndpr_lastupdate = time_uptime;
 		}
 
 		if (new->ndpr_raf_onlink &&
 		    (pr->ndpr_stateflags & NDPRF_ONLINK) == 0) {
 			ND6_ONLINK_LOCK();
 			if ((error = nd6_prefix_onlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "%s: failed to make the prefix %s/%d "
 				    "on-link on %s (errno=%d)\n", __func__,
 				    ip6_sprintf(ip6buf,
 				        &pr->ndpr_prefix.sin6_addr),
 				    pr->ndpr_plen, if_name(pr->ndpr_ifp),
 				    error));
 				/* proceed anyway. XXX: is it correct? */
 			}
 			ND6_ONLINK_UNLOCK();
 		}
 
 		if (dr != NULL)
 			pfxrtr_add(pr, dr);
 	} else {
 		/*
 		 * Unknown prefix: do not create an entry for a prefix that
 		 * is already invalid or that carries neither the L nor the
 		 * A bit.  pr is NULL here, so 'end' releases nothing.
 		 */
 		if (new->ndpr_vltime == 0)
 			goto end;
 		if (new->ndpr_raf_onlink == 0 && new->ndpr_raf_auto == 0)
 			goto end;
 
 		/* On success pr is returned with a reference for us. */
 		error = nd6_prelist_add(new, dr, &pr);
 		if (error != 0) {
 			nd6log((LOG_NOTICE, "%s: nd6_prelist_add() failed for "
 			    "the prefix %s/%d on %s (errno=%d)\n", __func__,
 			    ip6_sprintf(ip6buf, &new->ndpr_prefix.sin6_addr),
 			    new->ndpr_plen, if_name(new->ndpr_ifp), error));
 			goto end; /* we should just give up in this case. */
 		}
 
 		/*
 		 * XXX: from the ND point of view, we can ignore a prefix
 		 * with the on-link bit being zero.  However, we need a
 		 * prefix structure for references from autoconfigured
 		 * addresses.  Thus, we explicitly make sure that the prefix
 		 * itself expires now.
 		 */
 		if (pr->ndpr_raf_onlink == 0) {
 			pr->ndpr_vltime = 0;
 			pr->ndpr_pltime = 0;
 			in6_init_prefix_ltimes(pr);
 		}
 	}
 
 	/*
 	 * Address autoconfiguration based on Section 5.5.3 of RFC 2462.
 	 * Note that pr must be non NULL at this point.
 	 */
 
 	/* 5.5.3 (a). Ignore the prefix without the A bit set. */
 	if (!new->ndpr_raf_auto)
 		goto end;
 
 	/*
 	 * 5.5.3 (b). the link-local prefix should have been ignored in
 	 * nd6_ra_input.
 	 */
 
 	/* 5.5.3 (c). Consistency check on lifetimes: pltime <= vltime. */
 	if (new->ndpr_pltime > new->ndpr_vltime) {
 		error = EINVAL;	/* XXX: won't be used */
 		goto end;
 	}
 
 	/*
 	 * 5.5.3 (d).  If the prefix advertised is not equal to the prefix of
 	 * an address configured by stateless autoconfiguration already in the
 	 * list of addresses associated with the interface, and the Valid
 	 * Lifetime is not 0, form an address.  We first check if we have
 	 * a matching prefix.
 	 * Note: we apply a clarification in rfc2462bis-02 here.  We only
 	 * consider autoconfigured addresses while RFC2462 simply said
 	 * "address".
 	 */
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		struct in6_ifaddr *ifa6;
 		u_int32_t remaininglifetime;
 
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 
 		ifa6 = (struct in6_ifaddr *)ifa;
 
 		/*
 		 * We only consider autoconfigured addresses as per rfc2462bis.
 		 */
 		if (!(ifa6->ia6_flags & IN6_IFF_AUTOCONF))
 			continue;
 
 		/*
 		 * Spec is not clear here, but I believe we should concentrate
 		 * on unicast (i.e. not anycast) addresses.
 		 * XXX: other ia6_flags? detached or duplicated?
 		 */
 		if ((ifa6->ia6_flags & IN6_IFF_ANYCAST) != 0)
 			continue;
 
 		/*
 		 * Ignore the address if it is not associated with a prefix
 		 * or is associated with a prefix that is different from this
 		 * one.  (pr is never NULL here)
 		 */
 		if (ifa6->ia6_ndpr != pr)
 			continue;
 
 		if (ia6_match == NULL) /* remember the first one */
 			ia6_match = ifa6;
 
 		/*
 		 * An already autoconfigured address matched.  Now that we
 		 * are sure there is at least one matched address, we can
 		 * proceed to 5.5.3. (e): update the lifetimes according to the
 		 * "two hours" rule and the privacy extension.
 		 * We apply some clarifications in rfc2462bis:
 		 * - use remaininglifetime instead of storedlifetime as a
 		 *   variable name
 		 * - remove the dead code in the "two-hour" rule
 		 */
 #define TWOHOUR		(120*60)
 		lt6_tmp = ifa6->ia6_lifetime;
 
 		/*
 		 * Remaining valid lifetime of the address: the stored valid
 		 * lifetime minus the time elapsed since the last update,
 		 * floored at 0.
 		 */
 		if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME)
 			remaininglifetime = ND6_INFINITE_LIFETIME;
 		else if (time_uptime - ifa6->ia6_updatetime >
 			 lt6_tmp.ia6t_vltime) {
 			/*
 			 * The case of "invalid" address.  We should usually
 			 * not see this case.
 			 */
 			remaininglifetime = 0;
 		} else
 			remaininglifetime = lt6_tmp.ia6t_vltime -
 			    (time_uptime - ifa6->ia6_updatetime);
 
 		/* when not updating, keep the current stored lifetime. */
 		lt6_tmp.ia6t_vltime = remaininglifetime;
 
 		/*
 		 * "Two hours" rule: accept the advertised valid lifetime if
 		 * it exceeds two hours or the remaining lifetime; otherwise
 		 * accept it only from an authenticated source when at most
 		 * two hours remain, and else clamp to two hours.
 		 */
 		if (TWOHOUR < new->ndpr_vltime ||
 		    remaininglifetime < new->ndpr_vltime) {
 			lt6_tmp.ia6t_vltime = new->ndpr_vltime;
 		} else if (remaininglifetime <= TWOHOUR) {
 			if (auth) {
 				lt6_tmp.ia6t_vltime = new->ndpr_vltime;
 			}
 		} else {
 			/*
 			 * new->ndpr_vltime <= TWOHOUR &&
 			 * TWOHOUR < remaininglifetime
 			 */
 			lt6_tmp.ia6t_vltime = TWOHOUR;
 		}
 
 		/* The 2 hour rule is not imposed for preferred lifetime. */
 		lt6_tmp.ia6t_pltime = new->ndpr_pltime;
 
 		in6_init_address_ltimes(pr, &lt6_tmp);
 
 		/*
 		 * We need to treat lifetimes for temporary addresses
 		 * differently, according to
 		 * draft-ietf-ipv6-privacy-addrs-v2-01.txt 3.3 (1);
 		 * we only update the lifetimes when they are in the maximum
 		 * intervals.
 		 */
 		if ((ifa6->ia6_flags & IN6_IFF_TEMPORARY) != 0) {
 			u_int32_t maxvltime, maxpltime;
 
 			/*
 			 * Upper bounds: the configured temporary lifetime
 			 * minus the age of the address and the desync
 			 * factor, floored at 0.
 			 */
 			if (V_ip6_temp_valid_lifetime >
 			    (u_int32_t)((time_uptime - ifa6->ia6_createtime) +
 			    V_ip6_desync_factor)) {
 				maxvltime = V_ip6_temp_valid_lifetime -
 				    (time_uptime - ifa6->ia6_createtime) -
 				    V_ip6_desync_factor;
 			} else
 				maxvltime = 0;
 			if (V_ip6_temp_preferred_lifetime >
 			    (u_int32_t)((time_uptime - ifa6->ia6_createtime) +
 			    V_ip6_desync_factor)) {
 				maxpltime = V_ip6_temp_preferred_lifetime -
 				    (time_uptime - ifa6->ia6_createtime) -
 				    V_ip6_desync_factor;
 			} else
 				maxpltime = 0;
 
 			if (lt6_tmp.ia6t_vltime == ND6_INFINITE_LIFETIME ||
 			    lt6_tmp.ia6t_vltime > maxvltime) {
 				lt6_tmp.ia6t_vltime = maxvltime;
 			}
 			if (lt6_tmp.ia6t_pltime == ND6_INFINITE_LIFETIME ||
 			    lt6_tmp.ia6t_pltime > maxpltime) {
 				lt6_tmp.ia6t_pltime = maxpltime;
 			}
 		}
 		/* Commit the new lifetimes and restart the elapsed-time clock. */
 		ifa6->ia6_lifetime = lt6_tmp;
 		ifa6->ia6_updatetime = time_uptime;
 	}
 	if (ia6_match == NULL && new->ndpr_vltime) {
 		int ifidlen;
 
 		/*
 		 * 5.5.3 (d) (continued)
 		 * No address matched and the valid lifetime is non-zero.
 		 * Create a new address.
 		 */
 
 		/*
 		 * Prefix Length check:
 		 * If the sum of the prefix length and interface identifier
 		 * length does not equal 128 bits, the Prefix Information
 		 * option MUST be ignored.  The length of the interface
 		 * identifier is defined in a separate link-type specific
 		 * document.
 		 */
 		ifidlen = in6_if2idlen(ifp);
 		if (ifidlen < 0) {
 			/* this should not happen, so we always log it. */
 			log(LOG_ERR, "prelist_update: IFID undefined (%s)\n",
 			    if_name(ifp));
 			goto end;
 		}
 		if (ifidlen + pr->ndpr_plen != 128) {
 			nd6log((LOG_INFO,
 			    "%s: invalid prefixlen %d for %s, ignored\n",
 			    __func__, pr->ndpr_plen, if_name(ifp)));
 			goto end;
 		}
 
 		if ((ia6 = in6_ifadd(new, mcast)) != NULL) {
 			/*
 			 * note that we should use pr (not new) for reference.
 			 */
 			pr->ndpr_addrcnt++;
 			ia6->ia6_ndpr = pr;
 
 			/*
 			 * RFC 3041 3.3 (2).
 			 * When a new public address is created as described
 			 * in RFC2462, also create a new temporary address.
 			 *
 			 * RFC 3041 3.5.
 			 * When an interface connects to a new link, a new
 			 * randomized interface identifier should be generated
 			 * immediately together with a new set of temporary
 			 * addresses.  Thus, we specify 1 as the 2nd arg of
 			 * in6_tmpifadd().
 			 */
 			if (V_ip6_use_tempaddr) {
 				int e;
 				if ((e = in6_tmpifadd(ia6, 1, 1)) != 0) {
 					nd6log((LOG_NOTICE, "%s: failed to "
 					    "create a temporary address "
 					    "(errno=%d)\n", __func__, e));
 				}
 			}
 			/* Done with the address returned by in6_ifadd(). */
 			ifa_free(&ia6->ia_ifa);
 
 			/*
 			 * A newly added address might affect the status
 			 * of other addresses, so we check and update it.
 			 * XXX: what if address duplication happens?
 			 */
 			pfxlist_onlink_check();
 		} else {
 			/* just set an error. do not bark here. */
 			error = EADDRNOTAVAIL; /* XXX: might be unused. */
 		}
 	}
 
 end:
 	/*
 	 * Drop the reference obtained from nd6_prefix_lookup() or
 	 * nd6_prelist_add(), if any.
 	 */
 	if (pr != NULL)
 		nd6_prefix_rele(pr);
 	return (error);
 }
 
 /*
  * A supplement function used in the on-link detection below;
  * detect if a given prefix has a (probably) reachable advertising router.
  * XXX: lengthy function name...
  */
 static struct nd_pfxrouter *
 find_pfxlist_reachable_router(struct nd_prefix *pr)
 {
 	struct epoch_tracker et;
 	struct nd_pfxrouter *pfxrtr;
 
 	ND6_LOCK_ASSERT();
 
 	NET_EPOCH_ENTER(et);
 	LIST_FOREACH(pfxrtr, &pr->ndpr_advrtrs, pfr_entry) {
 		if (is_dr_reachable(pfxrtr->router))
 			break;
 	}
 	NET_EPOCH_EXIT(et);
 	return (pfxrtr);
 }
 
 /*
  * Check if each prefix in the prefix list has at least one available router
  * that advertised the prefix (a router is "available" if its neighbor cache
  * entry is reachable or probably reachable).
  * If the check fails, the prefix may be off-link, because, for example,
  * we have moved from the network but the lifetime of the prefix has not
  * expired yet.  So we should not use the prefix if there is another prefix
  * that has an available router.
  * But, if there is no prefix that has an available router, we still regard
  * all the prefixes as on-link.  This is because we can't tell if all the
  * routers are simply dead or if we really moved from the network and there
  * is no router around us.
  *
  * The function works in three phases:
  *   1. recompute NDPRF_DETACHED for every eligible prefix;
  *   2. add or remove the interface route of each prefix whose
  *      NDPRF_DETACHED / NDPRF_ONLINK combination is inconsistent;
  *   3. propagate the result to IN6_IFF_DETACHED on autoconfigured addresses.
  *
  * Locking: ND6_ONLINK_LOCK and the ND6 read lock are taken on entry and
  * released on exit; the read lock is dropped temporarily around the calls
  * to nd6_prefix_offlink() and nd6_prefix_onlink().
  */
 void
 pfxlist_onlink_check(void)
 {
 	struct nd_prefix *pr;
 	struct in6_ifaddr *ifa;
 	struct nd_defrouter *dr;
 	struct nd_pfxrouter *pfxrtr = NULL;
 	struct rm_priotracker in6_ifa_tracker;
 	uint64_t genid;
 	uint32_t flags;
 
 	ND6_ONLINK_LOCK();
 	ND6_RLOCK();
 
 	/*
 	 * Check if there is a prefix that has a reachable advertising
 	 * router.
 	 */
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		if (pr->ndpr_raf_onlink && find_pfxlist_reachable_router(pr))
 			break;
 	}
 
 	/*
 	 * If we have no such prefix, check whether we still have a router
 	 * that does not advertise any prefixes.
 	 */
 	if (pr == NULL) {
 		TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 			struct nd_prefix *pr0;
 
 			LIST_FOREACH(pr0, &V_nd_prefix, ndpr_entry) {
 				if ((pfxrtr = pfxrtr_lookup(pr0, dr)) != NULL)
 					break;
 			}
 			if (pfxrtr != NULL)
 				break;
 		}
 	}
 	/*
 	 * pfxrtr is still NULL here either because the search above was
 	 * skipped (pr != NULL) or because no (prefix, default router) pair
 	 * was found by pfxrtr_lookup().
 	 */
 	if (pr != NULL || (!TAILQ_EMPTY(&V_nd6_defrouter) && pfxrtr == NULL)) {
 		/*
 		 * There is at least one prefix that has a reachable router,
 		 * or at least a router which probably does not advertise
 		 * any prefixes.  The latter would be the case when we move
 		 * to a new link where we have a router that does not provide
 		 * prefixes and we configure an address by hand.
 		 * Detach prefixes which have no reachable advertising
 		 * router, and attach other prefixes.
 		 */
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			/* XXX: a link-local prefix should never be detached */
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) ||
 			    pr->ndpr_raf_onlink == 0 ||
 			    pr->ndpr_raf_auto == 0)
 				continue;
 
 			if ((pr->ndpr_stateflags & NDPRF_DETACHED) == 0 &&
 			    find_pfxlist_reachable_router(pr) == NULL)
 				pr->ndpr_stateflags |= NDPRF_DETACHED;
 			else if ((pr->ndpr_stateflags & NDPRF_DETACHED) != 0 &&
 			    find_pfxlist_reachable_router(pr) != NULL)
 				pr->ndpr_stateflags &= ~NDPRF_DETACHED;
 		}
 	} else {
 		/* there is no prefix that has a reachable router */
 		LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 			if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) ||
 			    pr->ndpr_raf_onlink == 0 ||
 			    pr->ndpr_raf_auto == 0)
 				continue;
 			pr->ndpr_stateflags &= ~NDPRF_DETACHED;
 		}
 	}
 
 	/*
 	 * Remove each interface route associated with a (just) detached
 	 * prefix, and reinstall the interface route for a (just) attached
 	 * prefix.  Note that not every reinstallation attempt necessarily
 	 * succeeds, e.g. when the same prefix is shared among multiple
 	 * interfaces.  Such cases will be handled in nd6_prefix_onlink,
 	 * so we don't have to care about them.
 	 */
 restart:
 	LIST_FOREACH(pr, &V_nd_prefix, ndpr_entry) {
 		char ip6buf[INET6_ADDRSTRLEN];
 		int e;
 
 		if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr) ||
 		    pr->ndpr_raf_onlink == 0 ||
 		    pr->ndpr_raf_auto == 0)
 			continue;
 
 		/*
 		 * Only two combinations need work: neither flag set
 		 * (attached, but no route yet) and both flags set
 		 * (detached, but the route is still installed).
 		 */
 		flags = pr->ndpr_stateflags & (NDPRF_DETACHED | NDPRF_ONLINK);
 		if (flags == 0 || flags == (NDPRF_DETACHED | NDPRF_ONLINK)) {
 			/*
 			 * Remember the list generation, then drop the read
 			 * lock for the on/offlink call.
 			 */
 			genid = V_nd6_list_genid;
 			ND6_RUNLOCK();
 			if ((flags & NDPRF_ONLINK) != 0 &&
 			    (e = nd6_prefix_offlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "%s: failed to make %s/%d offlink "
 				    "(errno=%d)\n", __func__, 
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 					    pr->ndpr_plen, e));
 			} else if ((flags & NDPRF_ONLINK) == 0 &&
 			    (e = nd6_prefix_onlink(pr)) != 0) {
 				nd6log((LOG_ERR,
 				    "%s: failed to make %s/%d onlink "
 				    "(errno=%d)\n", __func__,
 				    ip6_sprintf(ip6buf,
 					    &pr->ndpr_prefix.sin6_addr),
 					    pr->ndpr_plen, e));
 			}
 			ND6_RLOCK();
 			/*
 			 * If the generation changed while the lock was
 			 * dropped, start the walk over from the list head.
 			 */
 			if (genid != V_nd6_list_genid)
 				goto restart;
 		}
 	}
 
 	/*
 	 * Changes on the prefix status might affect address status as well.
 	 * Make sure that all addresses derived from an attached prefix are
 	 * attached, and that all addresses derived from a detached prefix are
 	 * detached.  Note, however, that a manually configured address should
 	 * always be attached.
 	 * The precise detection logic is same as the one for prefixes.
 	 */
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	/*
 	 * First pass: look for any autoconfigured address whose prefix has a
 	 * reachable router; ifa is non-NULL after the loop iff one exists.
 	 */
 	CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 		if (!(ifa->ia6_flags & IN6_IFF_AUTOCONF))
 			continue;
 
 		if (ifa->ia6_ndpr == NULL) {
 			/*
 			 * This can happen when we first configure the address
 			 * (i.e. the address exists, but the prefix does not).
 			 * XXX: complicated relationships...
 			 */
 			continue;
 		}
 
 		if (find_pfxlist_reachable_router(ifa->ia6_ndpr))
 			break;
 	}
 	if (ifa) {
 		/*
 		 * At least one such address exists: re-attach (and restart
 		 * DAD on) addresses with a reachable router, detach the rest.
 		 */
 		CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 			if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 				continue;
 
 			if (ifa->ia6_ndpr == NULL) /* XXX: see above. */
 				continue;
 
 			if (find_pfxlist_reachable_router(ifa->ia6_ndpr)) {
 				if (ifa->ia6_flags & IN6_IFF_DETACHED) {
 					ifa->ia6_flags &= ~IN6_IFF_DETACHED;
 					ifa->ia6_flags |= IN6_IFF_TENTATIVE;
 					nd6_dad_start((struct ifaddr *)ifa, 0);
 				}
 			} else {
 				ifa->ia6_flags |= IN6_IFF_DETACHED;
 			}
 		}
 	} else {
 		/*
 		 * No address has a reachable router: treat every
 		 * autoconfigured address as attached again.
 		 */
 		CK_STAILQ_FOREACH(ifa, &V_in6_ifaddrhead, ia_link) {
 			if ((ifa->ia6_flags & IN6_IFF_AUTOCONF) == 0)
 				continue;
 
 			if (ifa->ia6_flags & IN6_IFF_DETACHED) {
 				ifa->ia6_flags &= ~IN6_IFF_DETACHED;
 				ifa->ia6_flags |= IN6_IFF_TENTATIVE;
 				/* Do we need a delay in this case? */
 				nd6_dad_start((struct ifaddr *)ifa, 0);
 			}
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	ND6_RUNLOCK();
 	ND6_ONLINK_UNLOCK();
 }
 
 /*
  * Install or withdraw (per @cmd) the interface route for @dst/@netmask on
  * @ifp in routing table @fibnum.  A NULL @netmask requests a host route.
  * @ifa may be NULL.
  * Returns 0 on success, otherwise the error from rib_handle_ifaddr_info().
  */
 static int
 nd6_prefix_rtrequest(uint32_t fibnum, int cmd, struct sockaddr_in6 *dst,
     struct sockaddr_in6 *netmask, struct ifnet *ifp, struct ifaddr *ifa)
 {
 	struct epoch_tracker et;
 	int rtflags, rc;
 
 	/* The route is always pinned; it is a host route without a mask. */
 	rtflags = RTF_PINNED;
 	if (netmask == NULL)
 		rtflags |= RTF_HOST;
 
 	/* Link-level "gateway" describing the outgoing interface. */
 	struct sockaddr_dl_short gw = {
 		.sdl_len = sizeof(struct sockaddr_dl_short),
 		.sdl_family = AF_LINK,
 		.sdl_index = ifp->if_index,
 		.sdl_type = ifp->if_type,
 	};
 
 	struct rt_addrinfo rtinfo = {
 		.rti_flags = rtflags,
 		.rti_ifp = ifp,
 		.rti_ifa = ifa,
 		.rti_info = {
 			[RTAX_DST] = (struct sockaddr *)dst,
 			[RTAX_GATEWAY] = (struct sockaddr *)&gw,
 			[RTAX_NETMASK] = (struct sockaddr *)netmask,
 		},
 	};
 	/* Don't set additional per-gw filters on removal */
 
 	NET_EPOCH_ENTER(et);
 	rc = rib_handle_ifaddr_info(fibnum, cmd, &rtinfo);
 	NET_EPOCH_EXIT(et);
 
 	return (rc);
 }
 
 static int
 nd6_prefix_onlink_rtrequest(struct nd_prefix *pr, struct ifaddr *ifa)
 {
 	int error;
 
 	struct sockaddr_in6 mask6 = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = pr->ndpr_mask,
 	};
 	struct sockaddr_in6 *pmask6 = (pr->ndpr_plen != 128) ? &mask6 : NULL;
 
 	error = nd6_prefix_rtrequest(pr->ndpr_ifp->if_fib, RTM_ADD,
 	    &pr->ndpr_prefix, pmask6, pr->ndpr_ifp, ifa);
 	if (error == 0)
 		pr->ndpr_stateflags |= NDPRF_ONLINK;
 
 	return (error);
 }
 
 /*
  * Make prefix @pr on-link by installing its interface route.
  *
  * Must be called with the on-link lock held and the ND6 lock not held.
  * Returns EEXIST when the prefix is already on-link, 0 when an equal prefix
  * on another interface already owns the route or when no IPv6 address is
  * available to attach the route to, and otherwise the result of
  * nd6_prefix_onlink_rtrequest().
  */
 static int
 nd6_prefix_onlink(struct nd_prefix *pr)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct nd_prefix *other;
 	struct ifaddr *src;
 	char addrbuf[INET6_ADDRSTRLEN];
 	int rc;
 
 	ND6_ONLINK_LOCK_ASSERT();
 	ND6_UNLOCK_ASSERT();
 
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) != 0)
 		return (EEXIST);
 
 	/*
 	 * The same prefix may be configured on another interface that has
 	 * already installed the interface route.  That setup is rare but
 	 * explicitly allowed: nothing more to do in that case.
 	 */
 	ND6_RLOCK();
 	LIST_FOREACH(other, &V_nd_prefix, ndpr_entry) {
 		if (other == pr ||
 		    (other->ndpr_stateflags & NDPRF_ONLINK) == 0)
 			continue;
 
 		if (!V_rt_add_addr_allfibs &&
 		    other->ndpr_ifp->if_fib != pr->ndpr_ifp->if_fib)
 			continue;
 
 		if (other->ndpr_plen != pr->ndpr_plen ||
 		    !in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
 		    &other->ndpr_prefix.sin6_addr, pr->ndpr_plen))
 			continue;
 
 		ND6_RUNLOCK();
 		return (0);
 	}
 	ND6_RUNLOCK();
 
 	rc = 0;
 	NET_EPOCH_ENTER(et);
 	/* A link-local address is the preferred address for the route. */
 	src = (struct ifaddr *)in6ifa_ifpforlinklocal(ifp,
 	    IN6_IFF_NOTREADY | IN6_IFF_ANYCAST);
 	if (src == NULL) {
 		/*
 		 * Fall back to the first IPv6 address on the interface
 		 * (XXX: freebsd does not have ifa_ifwithaf), taking our own
 		 * reference on it.  Should we care about ia6_flags?
 		 */
 		CK_STAILQ_FOREACH(src, &ifp->if_addrhead, ifa_link) {
 			if (src->ifa_addr->sa_family != AF_INET6)
 				continue;
 			ifa_ref(src);
 			break;
 		}
 	}
 	if (src != NULL) {
 		rc = nd6_prefix_onlink_rtrequest(pr, src);
 		ifa_free(src);
 	} else {
 		/*
 		 * Rare, but possible: e.g. an RA carrying a prefix with the
 		 * L bit set and the A bit clear arrives after every IPv6
 		 * address was removed from the receiving interface.
 		 */
 		nd6log((LOG_NOTICE,
 		    "%s: failed to find any ifaddr to add route for a "
 		    "prefix(%s/%d) on %s\n", __func__,
 		    ip6_sprintf(addrbuf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen, if_name(ifp)));
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (rc);
 }
 
 /*
  * Take prefix @pr off-link by deleting its interface route.
  *
  * Must be called with the on-link lock held and the ND6 lock not held.
  * Returns EEXIST when the prefix is not currently on-link, otherwise the
  * result of the route deletion.  After a successful deletion, equal
  * prefixes on the list that are neither on-link nor detached are given a
  * chance to install the route instead.
  */
 int
 nd6_prefix_offlink(struct nd_prefix *pr)
 {
 	int error = 0;
 	struct ifnet *ifp = pr->ndpr_ifp;
 	struct nd_prefix *opr;
 	char ip6buf[INET6_ADDRSTRLEN];
 	uint64_t genid;
 	int a_failure;
 
 	ND6_ONLINK_LOCK_ASSERT();
 	ND6_UNLOCK_ASSERT();
 
 	if ((pr->ndpr_stateflags & NDPRF_ONLINK) == 0)
 		return (EEXIST);
 
 	/* A /128 prefix is deleted as a host route, i.e. without a mask. */
 	struct sockaddr_in6 mask6 = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = pr->ndpr_mask,
 	};
 	struct sockaddr_in6 *pmask6 = (pr->ndpr_plen != 128) ? &mask6 : NULL;
 
 	error = nd6_prefix_rtrequest(ifp->if_fib, RTM_DELETE,
 	    &pr->ndpr_prefix, pmask6, ifp, NULL);
 
 	/*
 	 * a_failure stays set unless another equal prefix successfully takes
 	 * over the route below; it is also left set when the deletion failed.
 	 */
 	a_failure = 1;
 	if (error == 0) {
 		pr->ndpr_stateflags &= ~NDPRF_ONLINK;
 
 		/*
 		 * There might be the same prefix on another interface,
 		 * the prefix which could not be on-link just because we have
 		 * the interface route (see comments in nd6_prefix_onlink).
 		 * If there's one, try to make the prefix on-link on the
 		 * interface.
 		 */
 		ND6_RLOCK();
 restart:
 		LIST_FOREACH(opr, &V_nd_prefix, ndpr_entry) {
 			/*
 			 * KAME specific: detached prefixes should not be
 			 * on-link.
 			 */
 			if (opr == pr || (opr->ndpr_stateflags &
 			    (NDPRF_ONLINK | NDPRF_DETACHED)) != 0)
 				continue;
 
 			if (opr->ndpr_plen == pr->ndpr_plen &&
 			    in6_are_prefix_equal(&pr->ndpr_prefix.sin6_addr,
 			    &opr->ndpr_prefix.sin6_addr, pr->ndpr_plen)) {
 				int e;
 
 				/*
 				 * nd6_prefix_onlink() asserts that the ND6
 				 * lock is not held, so drop the read lock
 				 * around the call and restart the walk if the
 				 * list generation changed in the meantime.
 				 */
 				genid = V_nd6_list_genid;
 				ND6_RUNLOCK();
 				if ((e = nd6_prefix_onlink(opr)) != 0) {
 					nd6log((LOG_ERR,
 					    "%s: failed to recover a prefix "
 					    "%s/%d from %s to %s (errno=%d)\n",
 					    __func__, ip6_sprintf(ip6buf,
 						&opr->ndpr_prefix.sin6_addr),
 					    opr->ndpr_plen, if_name(ifp),
 					    if_name(opr->ndpr_ifp), e));
 				} else
 					a_failure = 0;
 				ND6_RLOCK();
 				if (genid != V_nd6_list_genid)
 					goto restart;
 			}
 		}
 		ND6_RUNLOCK();
 	} else {
 		/* XXX: can we still set the NDPRF_ONLINK flag? */
 		nd6log((LOG_ERR,
 		    "%s: failed to delete route: %s/%d on %s (errno=%d)\n",
 		    __func__, ip6_sprintf(ip6buf, &pr->ndpr_prefix.sin6_addr),
 		    pr->ndpr_plen, if_name(ifp), error));
 	}
 
 	/*
 	 * No other prefix took over the route: free the link-layer table
 	 * entries covered by the prefix.
 	 * NOTE(review): mask6 is passed here even for a /128 prefix, where
 	 * the route request above used a NULL mask -- confirm this is
 	 * intended.
 	 */
 	if (a_failure)
 		lltable_prefix_free(AF_INET6,
 		    (struct sockaddr *)&pr->ndpr_prefix,
 		    (struct sockaddr *)&mask6, LLE_STATIC);
 
 	return (error);
 }
 
 /*
  * Create a temporary address derived from a public address.
  *
  * ia0      - corresponding public address
  * forcegen - passed through to in6_get_tmpifid(); forced to 1 when the
  *            generated address collides with a local one and we retry
  * delay    - when non-zero, IN6_IFAUPDATE_DADDELAY is passed to
  *            in6_update_ifa()
  *
  * Returns 0 on success, and also when no address is created because the
  * computed preferred lifetime does not exceed V_ip6_temp_regen_advance.
  * Returns EINVAL when no random IFID could be generated or the new ifaddr
  * cannot be found after the update, EEXIST when no unique IFID was found
  * within the retry limit, or the error from in6_update_ifa().
  */
 int
 in6_tmpifadd(const struct in6_ifaddr *ia0, int forcegen, int delay)
 {
 	struct ifnet *ifp = ia0->ia_ifa.ifa_ifp;
 	struct in6_ifaddr *newia;
 	struct in6_aliasreq ifra;
 	int error;
 	int trylimit = 3;	/* XXX: adhoc value */
 	int updateflags;
 	u_int32_t randid[2];
 	time_t vltime0, pltime0;
 
 	in6_prepare_ifra(&ifra, &ia0->ia_addr.sin6_addr,
 	    &ia0->ia_prefixmask.sin6_addr);
 
 	ifra.ifra_addr = ia0->ia_addr;	/* XXX: do we need this ? */
 	/* clear the old IFID */
 	IN6_MASK_ADDR(&ifra.ifra_addr.sin6_addr,
 	    &ifra.ifra_prefixmask.sin6_addr);
 
   again:
 	if (in6_get_tmpifid(ifp, (u_int8_t *)randid,
 	    (const u_int8_t *)&ia0->ia_addr.sin6_addr.s6_addr[8], forcegen)) {
 		nd6log((LOG_NOTICE, "%s: failed to find a good random IFID\n",
 		    __func__));
 		return (EINVAL);
 	}
 	/* Merge the random IFID into the bits not covered by the prefix mask. */
 	ifra.ifra_addr.sin6_addr.s6_addr32[2] |=
 	    (randid[0] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[2]));
 	ifra.ifra_addr.sin6_addr.s6_addr32[3] |=
 	    (randid[1] & ~(ifra.ifra_prefixmask.sin6_addr.s6_addr32[3]));
 
 	/*
 	 * in6_get_tmpifid() quite likely provided a unique interface ID.
 	 * However, we may still have a chance to see collision, because
 	 * there may be a time lag between generation of the ID and generation
 	 * of the address.  So, we'll do one more sanity check.
 	 */
 
 	if (in6_localip(&ifra.ifra_addr.sin6_addr) != 0) {
 		/* Collision with a local address: retry with a forced new ID. */
 		if (trylimit-- > 0) {
 			forcegen = 1;
 			goto again;
 		}
 
 		/* Give up.  Something strange should have happened.  */
 		nd6log((LOG_NOTICE, "%s: failed to find a unique random IFID\n",
 		    __func__));
 		return (EEXIST);
 	}
 
 	/*
 	 * The Valid Lifetime is the lower of the Valid Lifetime of the
 	 * public address or TEMP_VALID_LIFETIME.
 	 * The Preferred Lifetime is the lower of the Preferred Lifetime
 	 * of the public address or TEMP_PREFERRED_LIFETIME -
 	 * DESYNC_FACTOR.
 	 * For a finite lifetime, the time already elapsed since
 	 * ia6_updatetime is subtracted first.
 	 */
 	if (ia0->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
 		vltime0 = IFA6_IS_INVALID(ia0) ? 0 :
 		    (ia0->ia6_lifetime.ia6t_vltime -
 		    (time_uptime - ia0->ia6_updatetime));
 		if (vltime0 > V_ip6_temp_valid_lifetime)
 			vltime0 = V_ip6_temp_valid_lifetime;
 	} else
 		vltime0 = V_ip6_temp_valid_lifetime;
 	if (ia0->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
 		pltime0 = IFA6_IS_DEPRECATED(ia0) ? 0 :
 		    (ia0->ia6_lifetime.ia6t_pltime -
 		    (time_uptime - ia0->ia6_updatetime));
 		if (pltime0 > V_ip6_temp_preferred_lifetime - V_ip6_desync_factor){
 			pltime0 = V_ip6_temp_preferred_lifetime -
 			    V_ip6_desync_factor;
 		}
 	} else
 		pltime0 = V_ip6_temp_preferred_lifetime - V_ip6_desync_factor;
 	ifra.ifra_lifetime.ia6t_vltime = vltime0;
 	ifra.ifra_lifetime.ia6t_pltime = pltime0;
 
 	/*
 	 * A temporary address is created only if this calculated Preferred
 	 * Lifetime is greater than REGEN_ADVANCE time units.
 	 */
 	if (ifra.ifra_lifetime.ia6t_pltime <= V_ip6_temp_regen_advance)
 		return (0);
 
 	/* XXX: scope zone ID? */
 
 	ifra.ifra_flags |= (IN6_IFF_AUTOCONF|IN6_IFF_TEMPORARY);
 
 	/* allocate ifaddr structure, link into chain, etc. */
 	updateflags = 0;
 	if (delay)
 		updateflags |= IN6_IFAUPDATE_DADDELAY;
 	if ((error = in6_update_ifa(ifp, &ifra, NULL, updateflags)) != 0)
 		return (error);
 
 	newia = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr);
 	if (newia == NULL) {	/* XXX: can it happen? */
 		nd6log((LOG_ERR,
 		    "%s: ifa update succeeded, but we got no ifaddr\n",
 		    __func__));
 		return (EINVAL); /* XXX */
 	}
 	/*
 	 * The temporary address shares the prefix of the public address;
 	 * account for it in the prefix's address count, then release newia
 	 * with ifa_free().
 	 */
 	newia->ia6_ndpr = ia0->ia6_ndpr;
 	newia->ia6_ndpr->ndpr_addrcnt++;
 	ifa_free(&newia->ia_ifa);
 
 	/*
 	 * A newly added address might affect the status of other addresses.
 	 * XXX: when the temporary address is generated with a new public
 	 * address, the onlink check is redundant.  However, it would be safe
 	 * to do the check explicitly everywhere a new address is generated,
 	 * and, in fact, we surely need the check when we create a new
 	 * temporary address due to deprecation of an old temporary address.
 	 */
 	pfxlist_onlink_check();
 
 	return (0);
 }
 
 /*
  * Route-walk filter used by rt6_flush().  @arg points to the in6_addr of
  * the gateway being flushed.  Returns 1 for routes that should be deleted
  * and 0 for routes that must be kept.
  */
 static int
 rt6_deleteroute(const struct rtentry *rt, const struct nhop_object *nh,
     void *arg)
 {
 	struct in6_addr *target = (struct in6_addr *)arg;
 	int rtflags;
 
 	/* Only routes through the requested IPv6 gateway are candidates. */
 	if (nh->gw_sa.sa_family != AF_INET6 ||
 	    !IN6_ARE_ADDR_EQUAL(target, &nh->gw6_sa.sin6_addr))
 		return (0);
 
 	rtflags = nhop_get_rtflags(nh);
 
 	/*
 	 * Static routes are never deleted (XXX: a bit ad-hoc; should the
 	 * 'cloned' bit be considered instead?), and only host routes are
 	 * deleted, which in particular spares the default route.
 	 */
 	return ((rtflags & (RTF_STATIC | RTF_HOST)) == RTF_HOST);
 	/* NOTE(review): no matching #define of SIN6 is visible in this function. */
 #undef SIN6
 }
 
 /*
  * Remove every routing table entry that goes through the given gateway.
  * Only link-local gateways are handled; anything else is ignored.
  * XXX: this walks all entries of the routing tables, so it shouldn't be
  * called when acting as a router.
  */
 void
 rt6_flush(struct in6_addr *gateway, struct ifnet *ifp)
 {
 
 	if (IN6_IS_ADDR_LINKLOCAL(gateway)) {
 		/* XXX Do we really need to walk any but the default FIB? */
 		rib_foreach_table_walk_del(AF_INET6, rt6_deleteroute,
 		    (void *)gateway);
 	}
 }
 
 /*
  * Select the default interface by index; an index of 0 clears the default.
  *
  * The new index is validated before any state is modified, so a failed
  * request leaves V_nd6_defifindex and V_nd6_defifp untouched and mutually
  * consistent.
  *
  * Returns 0 on success (including when the index is already the current
  * default) and EINVAL when no interface has the given index.
  */
 int
 nd6_setdefaultiface(int ifindex)
 {
 	struct ifnet *ifp = NULL;
 
 	if (V_nd6_defifindex == ifindex)
 		return (0);
 
 	if (ifindex != 0) {
 		struct epoch_tracker et;
 
 		/*
 		 * XXXGL: this function should use ifnet_byindex_ref!
 		 */
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(ifindex);
 		NET_EPOCH_EXIT(et);
 		if (ifp == NULL)
 			return (EINVAL);
 	}
 
 	/* Commit index and pointer together, only after validation. */
 	V_nd6_defifindex = ifindex;
 	V_nd6_defifp = ifp;
 
 	/*
 	 * Our current implementation assumes one-to-one mapping between
 	 * interfaces and links, so it would be natural to use the
 	 * default interface as the default link.
 	 */
 	scope6_setdefault(V_nd6_defifp);
 
 	return (0);
 }
 
 bool
 nd6_defrouter_list_empty(void)
 {
 
 	return (TAILQ_EMPTY(&V_nd6_defrouter));
 }
 
 void
 nd6_defrouter_timer(void)
 {
 	struct nd_defrouter *dr, *ndr;
 	struct nd6_drhead drq;
 
 	TAILQ_INIT(&drq);
 
 	ND6_WLOCK();
 	TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr)
 		if (dr->expire && dr->expire < time_uptime)
 			defrouter_unlink(dr, &drq);
 	ND6_WUNLOCK();
 
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 }
 
 /*
  * Nuke default router list entries toward ifp.
  * We defer removal of default router list entries that is installed in the
  * routing table, in order to keep additional side effects as small as possible.
  */
 void
 nd6_defrouter_purge(struct ifnet *ifp)
 {
 	struct nd_defrouter *dr, *ndr;
 	struct nd6_drhead drq;
 
 	TAILQ_INIT(&drq);
 
 	ND6_WLOCK();
 	TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) {
 		if (dr->installed)
 			continue;
 		if (dr->ifp == ifp)
 			defrouter_unlink(dr, &drq);
 	}
 	TAILQ_FOREACH_SAFE(dr, &V_nd6_defrouter, dr_entry, ndr) {
 		if (!dr->installed)
 			continue;
 		if (dr->ifp == ifp)
 			defrouter_unlink(dr, &drq);
 	}
 	ND6_WUNLOCK();
 
 	/* Delete the unlinked router objects. */
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 }
 
 void
 nd6_defrouter_flush_all(void)
 {
 	struct nd_defrouter *dr;
 	struct nd6_drhead drq;
 
 	TAILQ_INIT(&drq);
 
 	ND6_WLOCK();
 	while ((dr = TAILQ_FIRST(&V_nd6_defrouter)) != NULL)
 		defrouter_unlink(dr, &drq);
 	ND6_WUNLOCK();
 
 	while ((dr = TAILQ_FIRST(&drq)) != NULL) {
 		TAILQ_REMOVE(&drq, dr, dr_entry);
 		defrouter_del(dr);
 	}
 }
 
 /*
  * Initialize the default router list (V_nd6_defrouter) as an empty tail
  * queue.
  */
 void
 nd6_defrouter_init(void)
 {
 
 	TAILQ_INIT(&V_nd6_defrouter);
 }
 
 static int
 nd6_sysctl_drlist(SYSCTL_HANDLER_ARGS)
 {
 	struct in6_defrouter d;
 	struct nd_defrouter *dr;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	bzero(&d, sizeof(d));
 	d.rtaddr.sin6_family = AF_INET6;
 	d.rtaddr.sin6_len = sizeof(d.rtaddr);
 
 	ND6_RLOCK();
 	TAILQ_FOREACH(dr, &V_nd6_defrouter, dr_entry) {
 		d.rtaddr.sin6_addr = dr->rtaddr;
 		error = sa6_recoverscope(&d.rtaddr);
 		if (error != 0)
 			break;
 		d.flags = dr->raflags;
 		d.rtlifetime = dr->rtlifetime;
 		d.expire = dr->expire + (time_second - time_uptime);
 		d.if_index = dr->ifp->if_index;
 		error = SYSCTL_OUT(req, &d, sizeof(d));
 		if (error != 0)
 			break;
 	}
 	ND6_RUNLOCK();
 	return (error);
 }
 SYSCTL_PROC(_net_inet6_icmp6, ICMPV6CTL_ND6_DRLIST, nd6_drlist,
 	CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	NULL, 0, nd6_sysctl_drlist, "S,in6_defrouter",
 	"NDP default router list");
diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c
index 0ae5bbcc9218..9cfbbaedf51c 100644
--- a/sys/netinet6/raw_ip6.c
+++ b/sys/netinet6/raw_ip6.c
@@ -1,863 +1,864 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipsec.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/errno.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/raw_ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/send.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/stdarg.h>
 
 #define	satosin6(sa)	((struct sockaddr_in6 *)(sa))
 #define	ifatoia6(ifa)	((struct in6_ifaddr *)(ifa))
 
 /*
  * Raw interface to IP6 protocol.
  */
 
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo			VNET(ripcbinfo)
 
 extern u_long	rip_sendspace;
 extern u_long	rip_recvspace;
 
 VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat);
 VNET_PCPUSTAT_SYSINIT(rip6stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(rip6stat);
 #endif /* VIMAGE */
 
 /*
  * Hooks for multicast routing. They all default to NULL, so leave them not
  * initialized and rely on BSS being set to 0.
  */
 
 /*
  * The socket used to communicate with the multicast routing daemon.
  */
 VNET_DEFINE(struct socket *, ip6_mrouter);
 
 /*
  * The various mrouter functions.
  *
  * These are function-pointer hooks; none of them is given an initializer
  * here, so they stay NULL until something assigns them.
  * NOTE(review): presumably installed by the IPv6 multicast routing code --
  * confirm where they are set and cleared.
  */
 int (*ip6_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip6_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip6_mrouter_done)(void);
 int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
 int (*mrt6_ioctl)(u_long, caddr_t);
 
 /*
  * Match context that rip6_input() hands to rip6_inp_match() through the
  * inpcb iterator.
  */
 struct rip6_inp_match_ctx {
 	struct ip6_hdr *ip6;	/* header whose src/dst addresses are compared */
 	int proto;		/* protocol number compared against inp_ip_p */
 };
 
 /*
  * Iterator predicate: decide whether a raw IPv6 pcb should see the packet
  * described by the rip6_inp_match_ctx passed in v.  A pcb matches when it
  * is an IPv6 pcb, its protocol is unset or equal to the packet's, and each
  * of its local/foreign addresses is either unspecified or equal to the
  * packet's destination/source respectively.
  */
 static bool
 rip6_inp_match(const struct inpcb *inp, void *v)
 {
 	struct rip6_inp_match_ctx *ctx = v;
 	struct ip6_hdr *hdr = ctx->ip6;
 
 	/* XXX inp locking */
 	if ((inp->inp_vflag & INP_IPV6) == 0)
 		return (false);
 
 	if (inp->inp_ip_p != 0 && inp->inp_ip_p != ctx->proto)
 		return (false);
 
 	if (!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
 	    IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &hdr->ip6_dst)))
 		return (false);
 
 	if (!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
 	    IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &hdr->ip6_src)))
 		return (false);
 
 	return (true);
 }
 
 /*
  * Setup generic address and protocol structures for raw_input routine, then
  * pass them along with mbuf chain.
  *
  * mp    - in: pointer to the received mbuf chain
  * offp  - in: offset of the payload; the headers before it are stripped
  *         from each delivered copy
  * proto - protocol number used to select matching raw sockets
  *
  * A copy of the packet is appended to the receive buffer of every matching
  * raw pcb that passes the IPsec, jail, checksum and multicast filters.  If
  * no socket took the packet it is either freed or answered with an ICMPv6
  * parameter problem; otherwise the original mbuf is freed.
  * Always returns IPPROTO_DONE.
  */
 int
 rip6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct ifnet *ifp;
 	struct mbuf *n, *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct inpcb *inp;
 	struct mbuf *opts = NULL;
 	struct sockaddr_in6 fromsa;
 	struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto };
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
 	    INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx);
 	int delivered = 0;
 
 	NET_EPOCH_ASSERT();
 
 	RIP6STAT_INC(rip6s_ipackets);
 
 	init_sin6(&fromsa, m, 0); /* general init */
 
 	ifp = m->m_pkthdr.rcvif;
 
 	/*
 	 * Each pcb handed out by the iterator is read-locked (asserted
 	 * below).  NOTE(review): presumably inp_next() drops the lock of the
 	 * previous pcb when advancing, which is why only the early "break"
 	 * path unlocks explicitly -- confirm against inp_next().
 	 */
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		/*
 		 * Check AH/ESP integrity.
 		 */
 		if (IPSEC_ENABLED(ipv6) &&
 		    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
 			/* Do not inject data into pcb. */
 			continue;
 		}
 #endif /* IPSEC */
 		if (jailed_without_vnet(inp->inp_cred) &&
 		    !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 		    prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0)
 			/*
 			 * Allow raw socket in jail to receive multicast;
 			 * assume process had PRIV_NETINET_RAW at attach,
 			 * and fall through into normal filter path if so.
 			 */
 			continue;
 		/*
 		 * in6p_cksum is used as the offset of the checksum field
 		 * within the payload; -1 skips verification.  The packet
 		 * must have at least two bytes at that offset.
 		 */
 		if (inp->in6p_cksum != -1) {
 			RIP6STAT_INC(rip6s_isum);
 			if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 ||
 			    in6_cksum(m, proto, *offp,
 			    m->m_pkthdr.len - *offp)) {
 				RIP6STAT_INC(rip6s_badsum);
 				/*
 				 * Drop the received message, don't send an
 				 * ICMP6 message. Set proto to IPPROTO_NONE
 				 * to achieve that.
 				 */
 				INP_RUNLOCK(inp);
 				proto = IPPROTO_NONE;
 				break;
 			}
 		}
 		/*
 		 * If this raw socket has multicast state, and we
 		 * have received a multicast, check if this socket
 		 * should receive it, as multicast filtering is now
 		 * the responsibility of the transport layer.
 		 */
 		if (inp->in6p_moptions &&
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 			/*
 			 * If the incoming datagram is for MLD, allow it
 			 * through unconditionally to the raw socket.
 			 *
 			 * Use the M_RTALERT_MLD flag to check for MLD
 			 * traffic without having to inspect the mbuf chain
 			 * more deeply, as all MLDv1/v2 host messages MUST
 			 * contain the Router Alert option.
 			 *
 			 * In the case of MLDv1, we may not have explicitly
 			 * joined the group, and may have set IFF_ALLMULTI
 			 * on the interface. im6o_mc_filter() may discard
 			 * control traffic we actually need to see.
 			 *
 			 * Userland multicast routing daemons should continue
 			 * to filter the control traffic appropriately.
 			 */
 			int blocked;
 
 			blocked = MCAST_PASS;
 			if ((m->m_flags & M_RTALERT_MLD) == 0) {
 				struct sockaddr_in6 mcaddr;
 
 				bzero(&mcaddr, sizeof(struct sockaddr_in6));
 				mcaddr.sin6_len = sizeof(struct sockaddr_in6);
 				mcaddr.sin6_family = AF_INET6;
 				mcaddr.sin6_addr = ip6->ip6_dst;
 
 				blocked = im6o_mc_filter(inp->in6p_moptions,
 				    ifp,
 				    (struct sockaddr *)&mcaddr,
 				    (struct sockaddr *)&fromsa);
 			}
 			if (blocked != MCAST_PASS) {
 				IP6STAT_INC(ip6s_notmember);
 				continue;
 			}
 		}
 		/* Each receiver gets its own copy; skip it if we cannot copy. */
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
 			continue;
 		if (inp->inp_flags & INP_CONTROLOPTS ||
 		    inp->inp_socket->so_options & SO_TIMESTAMP)
 			ip6_savecontrol(inp, n, &opts);
 		/* strip intermediate headers */
 		m_adj(n, *offp);
 		if (sbappendaddr(&inp->inp_socket->so_rcv,
 		    (struct sockaddr *)&fromsa, n, opts) == 0) {
 			/* Append failed: free the copy and any options here. */
 			soroverflow(inp->inp_socket);
 			m_freem(n);
 			if (opts)
 				m_freem(opts);
 			RIP6STAT_INC(rip6s_fullsock);
 		} else {
 			sorwakeup(inp->inp_socket);
 			delivered++;
 		}
 		/* Reset for the next pcb in either case. */
 		opts = NULL;
 	}
 	if (delivered == 0) {
 		RIP6STAT_INC(rip6s_nosock);
 		if (m->m_flags & M_MCAST)
 			RIP6STAT_INC(rip6s_nosockmcast);
 		/* IPPROTO_NONE (set on a checksum failure above) drops silently. */
 		if (proto == IPPROTO_NONE)
 			m_freem(m);
 		else
 			icmp6_error(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_NEXTHEADER,
 			    ip6_get_prevhdr(m, *offp));
 		IP6STAT_DEC(ip6s_delivered);
 	} else
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Control-input handler for raw IPv6 sockets.  The ICMPv6 message is mapped
  * to an error code with icmp6_errmap(); a non-zero result is passed on to
  * the matching raw pcbs through in6_pcbnotify().
  */
 void
 rip6_ctlinput(struct ip6ctlparam *ip6cp)
 {
 	int err;
 
 	err = icmp6_errmap(ip6cp->ip6c_icmp6);
 	if (err == 0)
 		return;
 
 	in6_pcbnotify(&V_ripcbinfo, ip6cp->ip6c_finaldst, 0,
 	    ip6cp->ip6c_src, 0, err, ip6cp->ip6c_cmdarg,
 	    in6_rtchange);
 }
 
 /*
  * Generate IPv6 header and pass packet to ip6_output.  Tack on options user
  * may have setup with control call.
  *
  * so      - sending socket; its inpcb must exist.
  * flags   - not referenced in this function.
  * m       - payload mbuf chain.  It is either handed to ip6_output() or
  *           freed here on every error path.
  * nam     - destination (struct sockaddr_in6), or NULL when the socket is
  *           connected.  Supplying it on a connected socket yields EISCONN.
  * control - optional ancillary data parsed with ip6_setpktopts(); always
  *           freed before returning.
  * td      - not referenced in this function.
  *
  * Returns 0 on success or an errno value.  Every path that drops the packet
  * reports a non-zero error so the caller never sees success for a packet
  * that was not passed to ip6_output().
  */
 static int
 rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct sockaddr_in6 tmp, *dstsock;
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;
 	u_int	plen = m->m_pkthdr.len;
 	struct ip6_pktopts opt, *optp;
 	struct ifnet *oifp = NULL;
 	int error;
 	int type = 0, code = 0;		/* for ICMPv6 output statistics only */
 	int scope_ambiguous = 0;
 	int use_defzone = 0;
 	int hlim = 0;
 	struct in6_addr in6a;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_send: inp == NULL"));
 
 	/* Always copy sockaddr to avoid overwrites. */
 	/* Unlocked read. */
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			error = EISCONN;
 			goto release;
 		}
 		tmp = (struct sockaddr_in6 ){
 			.sin6_family = AF_INET6,
 			.sin6_len = sizeof(struct sockaddr_in6),
 		};
 		INP_RLOCK(inp);
 		bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
 		    sizeof(struct in6_addr));
 		INP_RUNLOCK(inp);
 		dstsock = &tmp;
 	} else {
 		if (nam == NULL)
 			error = ENOTCONN;
 		else if (nam->sa_family != AF_INET6)
 			error = EAFNOSUPPORT;
 		else if (nam->sa_len != sizeof(struct sockaddr_in6))
 			error = EINVAL;
 		else
 			error = 0;
 		if (error != 0)
 			goto release;
 		dstsock = (struct sockaddr_in6 *)nam;
 		if (dstsock->sin6_family != AF_INET6) {
 			error = EAFNOSUPPORT;
 			goto release;
 		}
 	}
 
 	/* From here on, errors leave through "bad" so the inpcb is unlocked. */
 	INP_WLOCK(inp);
 
 	if (control != NULL) {
 		NET_EPOCH_ENTER(et);
 		error = ip6_setpktopts(control, &opt, inp->in6p_outputopts,
 		    so->so_cred, inp->inp_ip_p);
 		NET_EPOCH_EXIT(et);
 
 		if (error != 0) {
 			goto bad;
 		}
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
 	/*
 	 * Check and convert scope zone ID into internal form.
 	 *
 	 * XXX: we may still need to determine the zone later.
 	 */
 	if (!(so->so_state & SS_ISCONNECTED)) {
 		if (!optp || !optp->ip6po_pktinfo ||
 		    !optp->ip6po_pktinfo->ipi6_ifindex)
 			use_defzone = V_ip6_use_defzone;
 		if (dstsock->sin6_scope_id == 0 && !use_defzone)
 			scope_ambiguous = 1;
 		if ((error = sa6_embedscope(dstsock, use_defzone)) != 0)
 			goto bad;
 	}
 
 	/*
 	 * For an ICMPv6 packet, we should know its type and code to update
 	 * statistics.
 	 */
 	if (inp->inp_ip_p == IPPROTO_ICMPV6) {
 		struct icmp6_hdr *icmp6;
 		if (m->m_len < sizeof(struct icmp6_hdr) &&
 		    (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
 			error = ENOBUFS;
 			goto bad;
 		}
 		icmp6 = mtod(m, struct icmp6_hdr *);
 		type = icmp6->icmp6_type;
 		code = icmp6->icmp6_code;
 	}
 
 	M_PREPEND(m, sizeof(*ip6), M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto bad;
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 
 #ifdef ROUTE_MPATH
 	if (CALC_FLOWID_OUTBOUND) {
 		uint32_t hash_type, hash_val;
 
 		hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
 		    &dstsock->sin6_addr, 0, 0, inp->inp_ip_p, &hash_type);
 		inp->inp_flowid = hash_val;
 		inp->inp_flowtype = hash_type;
 	}
 #endif
 	/*
 	 * Source address selection.
 	 */
 	NET_EPOCH_ENTER(et);
 	error = in6_selectsrc_socket(dstsock, optp, inp, so->so_cred,
 	    scope_ambiguous, &in6a, &hlim);
 	NET_EPOCH_EXIT(et);
 
 	if (error)
 		goto bad;
 	error = prison_check_ip6(inp->inp_cred, &in6a);
 	if (error != 0)
 		goto bad;
 	ip6->ip6_src = in6a;
 
 	ip6->ip6_dst = dstsock->sin6_addr;
 
 	/*
 	 * Fill in the rest of the IPv6 header fields.
 	 */
 	ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 	    (inp->inp_flow & IPV6_FLOWINFO_MASK);
 	ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 	    (IPV6_VERSION & IPV6_VERSION_MASK);
 
 	/*
 	 * ip6_plen will be filled in ip6_output, so not fill it here.
 	 */
 	ip6->ip6_nxt = inp->inp_ip_p;
 	ip6->ip6_hlim = hlim;
 
 	if (inp->inp_ip_p == IPPROTO_ICMPV6 || inp->in6p_cksum != -1) {
 		struct mbuf *n;
 		int off;
 		u_int16_t *p;
 
 		/* Compute checksum. */
 		if (inp->inp_ip_p == IPPROTO_ICMPV6)
 			off = offsetof(struct icmp6_hdr, icmp6_cksum);
 		else
 			off = inp->in6p_cksum;
 		if (plen < off + 2) {
 			error = EINVAL;
 			goto bad;
 		}
 		off += sizeof(struct ip6_hdr);
 
 		/* Walk the chain to the mbuf that holds the checksum field. */
 		n = m;
 		while (n && n->m_len <= off) {
 			off -= n->m_len;
 			n = n->m_next;
 		}
 		if (!n) {
 			/*
 			 * The chain is shorter than the checksum offset.
 			 * error is still 0 here, so set it explicitly rather
 			 * than dropping the packet and reporting success.
 			 */
 			error = EINVAL;
 			goto bad;
 		}
 		p = (u_int16_t *)(mtod(n, caddr_t) + off);
 		*p = 0;
 		*p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
 	}
 
 	/*
 	 * Send RA/RS messages to user land for protection, before sending
 	 * them to rtadvd/rtsol.
 	 */
 	if ((send_sendso_input_hook != NULL) &&
 	    inp->inp_ip_p == IPPROTO_ICMPV6) {
 		switch (type) {
 		case ND_ROUTER_ADVERT:
 		case ND_ROUTER_SOLICIT:
 			mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 				sizeof(unsigned short), M_NOWAIT);
 			if (mtag == NULL) {
 				/*
 				 * Tag allocation failed; report it instead
 				 * of returning the stale 0 in error.
 				 */
 				error = ENOBUFS;
 				goto bad;
 			}
 			m_tag_prepend(m, mtag);
 		}
 	}
 
 	NET_EPOCH_ENTER(et);
 	error = ip6_output(m, optp, NULL, 0, inp->in6p_moptions, &oifp, inp);
 	NET_EPOCH_EXIT(et);
 	if (inp->inp_ip_p == IPPROTO_ICMPV6) {
 		if (oifp)
 			icmp6_ifoutstat_inc(oifp, type, code);
 		ICMP6STAT_INC(icp6s_outhist[type]);
 	} else
 		RIP6STAT_INC(rip6s_opackets);
 
 	goto freectl;
 
  bad:
 	/* m may already be NULL after a failed m_pullup()/M_PREPEND(). */
 	if (m)
 		m_freem(m);
 
  freectl:
 	if (control != NULL) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
 	INP_WUNLOCK(inp);
 	return (error);
 
 release:
 	/* Early failures: the inpcb write lock has not been taken yet. */
 	if (control != NULL)
 		m_freem(control);
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Raw IPv6 socket option processing.
  */
 int
 rip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int error;
 
 	if (sopt->sopt_level == IPPROTO_ICMPV6)
 		/*
 		 * XXX: is it better to call icmp6_ctloutput() directly
 		 * from protosw?
 		 */
 		return (icmp6_ctloutput(so, sopt));
 	else if (sopt->sopt_level != IPPROTO_IPV6) {
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_name == SO_SETFIB) {
 			INP_WLOCK(inp);
 			inp->inp_inc.inc_fibnum = so->so_fibnum;
 			INP_WUNLOCK(inp);
 			return (0);
 		}
 		return (EINVAL);
 	}
 
 	error = 0;
 
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case MRT6_INIT:
 		case MRT6_DONE:
 		case MRT6_ADD_MIF:
 		case MRT6_DEL_MIF:
 		case MRT6_ADD_MFC:
 		case MRT6_DEL_MFC:
 		case MRT6_PIM:
 			if (inp->inp_ip_p != IPPROTO_ICMPV6)
 				return (EOPNOTSUPP);
 			error = ip6_mrouter_get ?  ip6_mrouter_get(so, sopt) :
 			    EOPNOTSUPP;
 			break;
 		case IPV6_CHECKSUM:
 			error = ip6_raw_ctloutput(so, sopt);
 			break;
 		default:
 			error = ip6_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case MRT6_INIT:
 		case MRT6_DONE:
 		case MRT6_ADD_MIF:
 		case MRT6_DEL_MIF:
 		case MRT6_ADD_MFC:
 		case MRT6_DEL_MFC:
 		case MRT6_PIM:
 			if (inp->inp_ip_p != IPPROTO_ICMPV6)
 				return (EOPNOTSUPP);
 			error = ip6_mrouter_set ?  ip6_mrouter_set(so, sopt) :
 			    EOPNOTSUPP;
 			break;
 		case IPV6_CHECKSUM:
 			error = ip6_raw_ctloutput(so, sopt);
 			break;
 		default:
 			error = ip6_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 static int
 rip6_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct icmp6_filter *filter;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("rip6_attach: inp != NULL"));
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error)
 		return (error);
 	if (proto >= IPPROTO_MAX || proto < 0)
 		return (EPROTONOSUPPORT);
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error)
 		return (error);
 	filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
 	if (filter == NULL)
 		return (ENOMEM);
 	error = in_pcballoc(so, &V_ripcbinfo);
 	if (error) {
 		free(filter, M_PCB);
 		return (error);
 	}
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = proto;
 	inp->in6p_cksum = -1;
 	inp->in6p_icmp6filt = filter;
 	ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * Detach handler for raw IPv6 sockets: shuts down multicast routing if this
  * socket is the registered IPv6 multicast router socket, then releases the
  * ICMPv6 filter and the inpcb.
  */
 static void
 rip6_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_detach: inp == NULL"));
 
 	/* ip6_mrouter_done is an optional hook; call it only when it is set. */
 	if (so == V_ip6_mrouter && ip6_mrouter_done)
 		ip6_mrouter_done();
 	/* xxx: RSVP */
 	INP_WLOCK(inp);
 	free(inp->in6p_icmp6filt, M_PCB);
 	in_pcbdetach(inp);
 	/*
 	 * NOTE(review): no INP_WUNLOCK() follows, so in_pcbfree() presumably
 	 * consumes the write lock -- confirm.
 	 */
 	in_pcbfree(inp);
 }
 
 /*
  * Abort handler: asserts that the socket has an inpcb and marks the socket
  * disconnected.
  *
  * XXXRW: This can't ever be called.
  * NOTE(review): rip6_disconnect() in this file does call it directly.
  */
 static void
 rip6_abort(struct socket *so)
 {
 	struct inpcb *inp __diagused = sotoinpcb(so);
 
 	KASSERT(inp != NULL, ("rip6_abort: inp == NULL"));
 	soisdisconnected(so);
 }
 
 /*
  * Close handler: asserts that the socket has an inpcb and marks the socket
  * disconnected.
  */
 static void
 rip6_close(struct socket *so)
 {
 	struct inpcb *inp __diagused = sotoinpcb(so);
 
 	KASSERT(inp != NULL, ("rip6_close: inp == NULL"));
 	soisdisconnected(so);
 }
 
 /*
  * Disconnect handler.  Returns ENOTCONN when the socket is not connected;
  * otherwise resets the foreign address to the unspecified address, runs
  * rip6_abort() on the socket and returns 0.
  */
 static int
 rip6_disconnect(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 
 	KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL"));
 
 	if (so->so_state & SS_ISCONNECTED) {
 		inp->in6p_faddr = in6addr_any;
 		rip6_abort(so);
 		return (0);
 	}
 	return (ENOTCONN);
 }
 
 static int
 rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
 	struct ifaddr *ifa = NULL;
 	int error = 0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_bind: inp == NULL"));
 
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 	if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0)
 		return (error);
 	if (CK_STAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6)
 		return (EADDRNOTAVAIL);
 	if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
 	    (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EADDRNOTAVAIL);
 	}
 	if (ifa != NULL &&
 	    ((struct in6_ifaddr *)ifa)->ia6_flags &
 	    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
 	     IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
 		NET_EPOCH_EXIT(et);
 		return (EADDRNOTAVAIL);
 	}
 	NET_EPOCH_EXIT(et);
 	INP_WLOCK(inp);
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	inp->in6p_laddr = addr->sin6_addr;
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
 	struct in6_addr in6a;
 	struct epoch_tracker et;
 	int error = 0, scope_ambiguous = 0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_connect: inp == NULL"));
 
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 	if (CK_STAILQ_EMPTY(&V_ifnet))
 		return (EADDRNOTAVAIL);
 	if (addr->sin6_family != AF_INET6)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Application should provide a proper zone ID or the use of default
 	 * zone IDs should be enabled.  Unfortunately, some applications do
 	 * not behave as it should, so we need a workaround.  Even if an
 	 * appropriate ID is not determined, we'll see if we can determine
 	 * the outgoing interface.  If we can, determine the zone ID based on
 	 * the interface below.
 	 */
 	if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone)
 		scope_ambiguous = 1;
 	if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
 		return (error);
 
 	INP_WLOCK(inp);
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	/* Source address selection. XXX: need pcblookup? */
 	NET_EPOCH_ENTER(et);
 	error = in6_selectsrc_socket(addr, inp->in6p_outputopts,
 	    inp, so->so_cred, scope_ambiguous, &in6a, NULL);
 	NET_EPOCH_EXIT(et);
 	if (error) {
 		INP_INFO_WUNLOCK(&V_ripcbinfo);
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 
 	inp->in6p_faddr = addr->sin6_addr;
 	inp->in6p_laddr = in6a;
 	soisconnected(so);
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 /*
  * Shutdown handler: with the inpcb write-locked, marks the socket as unable
  * to send any more data.  Always returns 0.
  */
 static int
 rip6_shutdown(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 
 	KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL"));
 
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 
 /*
  * Protocol switch entry for raw (SOCK_RAW) IPv6 sockets.  It wires up the
  * rip6_* handlers defined in this file together with the in6_* handlers
  * for ioctl and socket/peer address retrieval.
  */
 struct protosw rip6_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_ctloutput =		rip6_ctloutput,
 	.pr_abort =		rip6_abort,
 	.pr_attach =		rip6_attach,
 	.pr_bind =		rip6_bind,
 	.pr_connect =		rip6_connect,
 	.pr_control =		in6_control,
 	.pr_detach =		rip6_detach,
 	.pr_disconnect =	rip6_disconnect,
 	.pr_peeraddr =		in6_getpeeraddr,
 	.pr_send =		rip6_send,
 	.pr_shutdown =		rip6_shutdown,
 	.pr_sockaddr =		in6_getsockaddr,
 	.pr_close =		rip6_close
 };
diff --git a/sys/netinet6/scope6.c b/sys/netinet6/scope6.c
index 7957cec44f79..c4eb9a4ab829 100644
--- a/sys/netinet6/scope6.c
+++ b/sys/netinet6/scope6.c
@@ -1,610 +1,611 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 2000 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: scope6.c,v 1.10 2000/07/24 13:29:31 itojun Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/systm.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #include <netinet/ip6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 
 /*
  * Per-vnet switch controlling whether a default zone ID may be substituted
  * when the caller supplied none.  The initial value is 1 only when built
  * with ENABLE_DEFAULT_SCOPE.
  */
 #ifdef ENABLE_DEFAULT_SCOPE
 VNET_DEFINE(int, ip6_use_defzone) = 1;
 #else
 VNET_DEFINE(int, ip6_use_defzone) = 0;
 #endif
 SYSCTL_DECL(_net_inet6_ip6);
 
 /*
  * The scope6_lock protects the global sid default stored in
  * sid_default below.
  */
 static struct mtx scope6_lock;
 #define	SCOPE6_LOCK_INIT()	mtx_init(&scope6_lock, "scope6_lock", NULL, MTX_DEF)
 #define	SCOPE6_LOCK()		mtx_lock(&scope6_lock)
 #define	SCOPE6_UNLOCK()		mtx_unlock(&scope6_lock)
 #define	SCOPE6_LOCK_ASSERT()	mtx_assert(&scope6_lock, MA_OWNED)
 
 /* Per-vnet table of default zone IDs, indexed by scope. */
 VNET_DEFINE_STATIC(struct scope6_id, sid_default);
 #define	V_sid_default			VNET(sid_default)
 
 /*
  * Scope ID table of an interface, reached through its AF_INET6 afdata
  * (struct in6_ifextra).  The afdata pointer itself is not checked here.
  */
 #define SID(ifp) \
 	(((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id)
 
 static int	scope6_get(struct ifnet *, struct scope6_id *);
 static int	scope6_set(struct ifnet *, struct scope6_id *);
 
 /*
  * Per-vnet initialisation: clears this vnet's default zone ID table.  The
  * scope6 mutex is initialised only when running in the default vnet.
  */
 void
 scope6_init(void)
 {
 
 	bzero(&V_sid_default, sizeof(V_sid_default));
 
 	if (IS_DEFAULT_VNET(curvnet))
 		SCOPE6_LOCK_INIT();
 }
 
 /*
  * Allocate (M_WAITOK, zero-filled) the scope ID table for an interface and
  * seed the interface-local and link-local entries with the interface index.
  * Returns the new table; ownership passes to the caller.
  */
 struct scope6_id *
 scope6_ifattach(struct ifnet *ifp)
 {
 	struct scope6_id *table;
 
 	table = malloc(sizeof(struct scope6_id), M_IFADDR, M_WAITOK | M_ZERO);
 
 	/*
 	 * XXX: IPV6_ADDR_SCOPE_xxx macros are not standard.
 	 * Should we rather hardcode here?
 	 */
 	table->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index;
 	table->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index;
 
 	return (table);
 }
 
 /*
  * Free a scope ID table.  The table is returned to M_IFADDR, the malloc
  * type scope6_ifattach() allocates from.
  */
 void
 scope6_ifdetach(struct scope6_id *sid)
 {
 
 	free(sid, M_IFADDR);
 }
 
 /*
  * ioctl entry point for scope zone IDs.  data is interpreted as a
  * struct in6_ifreq whose ifru_scope_id member carries the ID list.
  *
  * Returns EPFNOSUPPORT when the interface has no AF_INET6 afdata,
  * EOPNOTSUPP for commands other than SIOCSSCOPE6, SIOCGSCOPE6 and
  * SIOCGSCOPE6DEF, and otherwise the result of the matching helper.
  */
 int
 scope6_ioctl(u_long cmd, caddr_t data, struct ifnet *ifp)
 {
 	struct in6_ifreq *ifr;
 	struct scope6_id *idlist;
 
 	if (ifp->if_afdata[AF_INET6] == NULL)
 		return (EPFNOSUPPORT);
 
 	ifr = (struct in6_ifreq *)data;
 	idlist = (struct scope6_id *)ifr->ifr_ifru.ifru_scope_id;
 
 	if (cmd == SIOCSSCOPE6)
 		return (scope6_set(ifp, idlist));
 	if (cmd == SIOCGSCOPE6)
 		return (scope6_get(ifp, idlist));
 	if (cmd == SIOCGSCOPE6DEF)
 		return (scope6_get_default(idlist));
 	return (EOPNOTSUPP);
 }
 
 /*
  * Update an interface's scope ID table from idlist, under the afdata write
  * lock.  Entries of idlist that are zero or already equal to the current
  * value are ignored.
  *
  * Returns EINVAL when the interface has no table, when the interface-local
  * entry differs from the interface index, or when the link-local entry does
  * not name an existing interface.  Entries processed before a failing one
  * remain applied.
  */
 static int
 scope6_set(struct ifnet *ifp, struct scope6_id *idlist)
 {
 	struct scope6_id *sid;
 	int error, i;
 
 	error = 0;
 
 	IF_AFDATA_WLOCK(ifp);
 	sid = SID(ifp);
 	if (sid == NULL) {	/* paranoid? */
 		IF_AFDATA_WUNLOCK(ifp);
 		return (EINVAL);
 	}
 
 	/*
 	 * XXX: We need more consistency checks of the relationship among
 	 * scopes (e.g. an organization should be larger than a site).
 	 */
 
 	/*
 	 * TODO(XXX): after setting, we should reflect the changes to
 	 * interface addresses, routing table entries, PCB entries...
 	 */
 
 	for (i = 0; i < 16; i++) {
 		/* Nothing requested, or nothing to change. */
 		if (idlist->s6id_list[i] == 0 ||
 		    idlist->s6id_list[i] == sid->s6id_list[i])
 			continue;
 
 		/*
 		 * An interface zone ID must be the corresponding
 		 * interface index by definition.
 		 */
 		if (i == IPV6_ADDR_SCOPE_INTFACELOCAL &&
 		    idlist->s6id_list[i] != ifp->if_index) {
 			error = EINVAL;
 			break;
 		}
 
 		if (i == IPV6_ADDR_SCOPE_LINKLOCAL) {
 			struct epoch_tracker et;
 			int known;
 
 			/*
 			 * XXX: theoretically, there should be
 			 * no relationship between link IDs and
 			 * interface IDs, but we check the
 			 * consistency for safety in later use.
 			 */
 			NET_EPOCH_ENTER(et);
 			known = (ifnet_byindex(idlist->s6id_list[i]) != NULL);
 			NET_EPOCH_EXIT(et);
 			if (!known) {
 				error = EINVAL;
 				break;
 			}
 		}
 
 		/*
 		 * XXX: we must need lots of work in this case,
 		 * but we simply set the new value in this initial
 		 * implementation.
 		 */
 		sid->s6id_list[i] = idlist->s6id_list[i];
 	}
 	IF_AFDATA_WUNLOCK(ifp);
 
 	return (error);
 }
 
 static int
 scope6_get(struct ifnet *ifp, struct scope6_id *idlist)
 {
 	struct epoch_tracker et;
 	struct scope6_id *sid;
 
 	/* We only need to lock the interface's afdata for SID() to work. */
 	NET_EPOCH_ENTER(et);
 	sid = SID(ifp);
 	if (sid == NULL) {	/* paranoid? */
 		NET_EPOCH_EXIT(et);
 		return (EINVAL);
 	}
 
 	*idlist = *sid;
 
 	NET_EPOCH_EXIT(et);
 	return (0);
 }
 
 /*
  * Classify the scope of an address.
  *
  * Multicast addresses yield the scope value carried in the address, except
  * that the reserved value 0xf is treated as global.  Link-local and
  * loopback unicast addresses are link-local scope, site-local addresses
  * are site-local scope, everything else is global.
  */
 int
 in6_addrscope(const struct in6_addr *addr)
 {
 	int mcscope;
 
 	if (!IN6_IS_ADDR_MULTICAST(addr)) {
 		if (IN6_IS_ADDR_LINKLOCAL(addr) || IN6_IS_ADDR_LOOPBACK(addr))
 			return (IPV6_ADDR_SCOPE_LINKLOCAL);
 		if (IN6_IS_ADDR_SITELOCAL(addr))
 			return (IPV6_ADDR_SCOPE_SITELOCAL);
 		return (IPV6_ADDR_SCOPE_GLOBAL);
 	}
 
 	/*
 	 * Addresses with reserved value F must be treated as
 	 * global multicast addresses.
 	 */
 	mcscope = IPV6_ADDR_MC_SCOPE(addr);
 	if (mcscope == 0x0f)
 		return (IPV6_ADDR_SCOPE_GLOBAL);
 	return (mcscope);
 }
 
 /*
  * ifp - note that this might be NULL
  */
 
 void
 scope6_setdefault(struct ifnet *ifp)
 {
 
 	/*
 	 * Currently, this function just sets the default "interfaces"
 	 * and "links" according to the given interface.
 	 * We might eventually have to separate the notion of "link" from
 	 * "interface" and provide a user interface to set the default.
 	 */
 	SCOPE6_LOCK();
 	if (ifp) {
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] =
 			ifp->if_index;
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] =
 			ifp->if_index;
 	} else {
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = 0;
 		V_sid_default.s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = 0;
 	}
 	SCOPE6_UNLOCK();
 }
 
 /*
  * Copy the current vnet's default zone ID table into *idlist while holding
  * the scope6 lock.  Always returns 0.
  */
 int
 scope6_get_default(struct scope6_id *idlist)
 {
 
 	SCOPE6_LOCK();
 	*idlist = V_sid_default;
 	SCOPE6_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Return the default zone ID for the scope of addr, read from the default
  * table under the scope6 lock.  The loopback address always yields 0.
  */
 u_int32_t
 scope6_addr2default(struct in6_addr *addr)
 {
 	u_int32_t id;
 
 	id = 0;
 
 	/*
 	 * special case: The loopback address should be considered as
 	 * link-local, but there's no ambiguity in the syntax.
 	 */
 	if (!IN6_IS_ADDR_LOOPBACK(addr)) {
 		/*
 		 * XXX: 32-bit read is atomic on all our platforms, is it OK
 		 * not to lock here?
 		 */
 		SCOPE6_LOCK();
 		id = V_sid_default.s6id_list[in6_addrscope(addr)];
 		SCOPE6_UNLOCK();
 	}
 	return (id);
 }
 
 /*
  * Validate the scope zone ID given in sin6_scope_id and convert the address
  * to the kernel-internal form.
  *
  * When the ID is unspecified (0) and defaultok is set, the default zone ID
  * for the address is used instead.  If the resulting ID is non-zero and the
  * address is link-local scoped or interface-local multicast, the ID must
  * name an existing interface; its low 16 bits are then embedded in the
  * address and sin6_scope_id is cleared.
  *
  * Returns 0, or ENXIO when the zone ID does not match any interface.
  */
 int
 sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok)
 {
 	struct epoch_tracker et;
 	u_int32_t zoneid;
 	int known;
 
 	zoneid = sin6->sin6_scope_id;
 	if (zoneid == 0 && defaultok)
 		zoneid = scope6_addr2default(&sin6->sin6_addr);
 
 	/* No zone to embed. */
 	if (zoneid == 0)
 		return 0;
 	/* Only these address classes carry an embedded zone. */
 	if (!IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) &&
 	    !IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))
 		return 0;
 
 	/*
 	 * At this moment, we only check interface-local and
 	 * link-local scope IDs, and use interface indices as the
 	 * zone IDs assuming a one-to-one mapping between interfaces
 	 * and links.
 	 */
 	NET_EPOCH_ENTER(et);
 	known = (ifnet_byindex(zoneid) != NULL);
 	NET_EPOCH_EXIT(et);
 	if (!known)
 		return (ENXIO);
 
 	/* XXX assignment to 16bit from 32bit variable */
 	sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
 	sin6->sin6_scope_id = 0;
 
 	return 0;
 }
 
 /*
  * Generate the standard sockaddr_in6 form from the embedded form.
  *
  * For link-local scoped and interface-local multicast addresses carrying a
  * non-zero embedded zone, the zone is moved out of the address into
  * sin6_scope_id.  A pre-existing, different sin6_scope_id is overridden and
  * the mismatch is logged.
  *
  * Returns 0, or ENXIO when the embedded zone does not name an existing
  * interface.
  */
 int
 sa6_recoverscope(struct sockaddr_in6 *sin6)
 {
 	char ip6buf[INET6_ADDRSTRLEN];
 	u_int32_t zoneid;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
 	    IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) {
 		/*
 		 * KAME assumption: link id == interface id
 		 */
 		zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]);
 		if (zoneid) {
 			struct epoch_tracker et;
 
 			NET_EPOCH_ENTER(et);
 			/* sanity check */
 			if (!ifnet_byindex(zoneid)) {
 				NET_EPOCH_EXIT(et);
 				return (ENXIO);
 			}
 			NET_EPOCH_EXIT(et);
 			if (sin6->sin6_scope_id != 0 &&
 			    zoneid != sin6->sin6_scope_id) {
 				/*
 				 * sin6_scope_id is unsigned, so print it
 				 * with %u; %d would show large values as
 				 * negative numbers.
 				 */
 				log(LOG_NOTICE,
 				    "%s: embedded scope mismatch: %s%%%u. "
 				    "sin6_scope_id was overridden\n", __func__,
 				    ip6_sprintf(ip6buf, &sin6->sin6_addr),
 				    sin6->sin6_scope_id);
 			}
 			sin6->sin6_addr.s6_addr16[1] = 0;
 			sin6->sin6_scope_id = zoneid;
 		}
 	}
 
 	return 0;
 }
 
 /*
  * Determine the appropriate scope zone ID for in6 and ifp.  If ret_id is
  * non NULL, it is set to the zone ID.  If the zone ID needs to be embedded
  * in the in6_addr structure, in6 will be modified.
  *
  * Returns 0 on success, EINVAL when the loopback address is paired with a
  * non-loopback interface, or ENETDOWN when a scope other than
  * interface-local, link-local or global is needed and the interface has no
  * AF_INET6 afdata.
  *
  * ret_id - unnecessary?
  */
 int
 in6_setscope(struct in6_addr *in6, struct ifnet *ifp, u_int32_t *ret_id)
 {
 	u_int32_t zoneid;
 	int scope;
 
 	zoneid = 0;
 
 	if (IN6_IS_ADDR_LOOPBACK(in6)) {
 		/*
 		 * special case: the loopback address can only belong to a
 		 * loopback interface.
 		 */
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0)
 			return (EINVAL);
 	} else {
 		scope = in6_addrscope(in6);
 		if (scope == IPV6_ADDR_SCOPE_INTFACELOCAL ||
 		    scope == IPV6_ADDR_SCOPE_LINKLOCAL) {
 			/*
 			 * Currently we use interface indices as the
 			 * zone IDs for interface-local and link-local
 			 * scopes.
 			 */
 			zoneid = ifp->if_index;
 			in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */
 		} else if (scope != IPV6_ADDR_SCOPE_GLOBAL) {
 			struct epoch_tracker et;
 			int have_afdata;
 
 			/* Look the zone up in the interface's own table. */
 			NET_EPOCH_ENTER(et);
 			have_afdata = (ifp->if_afdata[AF_INET6] != NULL);
 			if (have_afdata)
 				zoneid = SID(ifp)->s6id_list[scope];
 			NET_EPOCH_EXIT(et);
 			if (!have_afdata)
 				return (ENETDOWN);
 		}
 	}
 
 	if (ret_id != NULL)
 		*ret_id = zoneid;
 
 	return (0);
 }
 
 /*
  * Just clear the embedded scope identifier.  Return 0 if the original
  * address is intact; return non 0 if the address is modified.  Only
  * link-local scoped and interface-local multicast addresses are touched.
  */
 int
 in6_clearscope(struct in6_addr *in6)
 {
 	int modified;
 
 	if (!IN6_IS_SCOPE_LINKLOCAL(in6) && !IN6_IS_ADDR_MC_INTFACELOCAL(in6))
 		return (0);
 
 	modified = (in6->s6_addr16[1] != 0) ? 1 : 0;
 	in6->s6_addr16[1] = 0;
 
 	return (modified);
 }
 
 /*
  * Return the scope identifier embedded in a link-local scoped or
  * interface-local multicast address, or zero for any other address.  The
  * 16-bit word is returned exactly as stored in the address; no byte-order
  * conversion is applied here.
  */
 uint16_t
 in6_getscope(const struct in6_addr *in6)
 {
 
 	if (!IN6_IS_SCOPE_LINKLOCAL(in6) && !IN6_IS_ADDR_MC_INTFACELOCAL(in6))
 		return (0);
 
 	return (in6->s6_addr16[1]);
 }
 
 /*
  * Returns scope zone id for the unicast address @in6.
  *
  * Link-local scoped addresses yield the index of @ifp; every other address
  * (global unicast, loopback, ...) yields 0.
  */
 uint32_t
 in6_get_unicast_scopeid(const struct in6_addr *in6, const struct ifnet *ifp)
 {
 
 	if (!IN6_IS_SCOPE_LINKLOCAL(in6))
 		return (0);
 	return (ifp->if_index);
 }
 
 /*
  * Embed the low 16 bits of scopeid, converted with htons(), into the second
  * 16-bit word of in6.  The address type is not checked here.
  */
 void
 in6_set_unicast_scopeid(struct in6_addr *in6, uint32_t scopeid)
 {
 
 	in6->s6_addr16[1] = htons(scopeid & 0xffff);
 }
 
 /*
  * Return pointer to ifnet structure, corresponding to the zone id of
  * link-local scope.  The result is whatever ifnet_byindex() returns for
  * the zone id.
  */
 struct ifnet*
 in6_getlinkifnet(uint32_t zoneid)
 {
 
 	/* Note: zoneid is narrowed to u_short before the lookup. */
 	return (ifnet_byindex((u_short)zoneid));
 }
 
 /*
  * Return zone id for the specified scope.
  *
  * Interface-local and link-local scopes map to the interface index.  Any
  * other scope in [0, IPV6_ADDR_SCOPES_COUNT) is looked up in the
  * interface's scope ID table; out-of-range scopes yield 0.
  */
 uint32_t
 in6_getscopezone(const struct ifnet *ifp, int scope)
 {
 
 	if (scope == IPV6_ADDR_SCOPE_INTFACELOCAL ||
 	    scope == IPV6_ADDR_SCOPE_LINKLOCAL)
 		return (ifp->if_index);
 	if (scope < 0 || scope >= IPV6_ADDR_SCOPES_COUNT)
 		return (0);
 	return (SID(ifp)->s6id_list[scope]);
 }
 
 /*
  * Extracts the embedded scope from address @src, stores the address with
  * the scope cleared inside @dst and the zone (converted with ntohs())
  * inside @scopeid.  @src itself is not modified.
  */
 void
 in6_splitscope(const struct in6_addr *src, struct in6_addr *dst,
     uint32_t *scopeid)
 {
 	uint32_t zoneid;
 
 	*dst = *src;
 	/* Read the zone before it is wiped from the copy. */
 	zoneid = ntohs(in6_getscope(dst));
 	in6_clearscope(dst);
 	*scopeid = zoneid;
 }
 
 /*
  * This function is for checking sockaddr_in6 structure passed
  * from the application level (usually).
  *
  * sin6_scope_id should be set for link-local unicast, link-local and
  * interface-local  multicast addresses.
  *
  * If it is zero, then look into default zone ids. If default zone id is
  * not set or disabled, then return error.
  *
  * Returns 0 when the zone is acceptable (sin6_scope_id may have been filled
  * in), EINVAL when a global-scope address carries a non-zero zone id, and
  * EADDRNOTAVAIL when the zone cannot be determined or, for the loopback
  * address, does not match the loopback interface.
  */
 int
 sa6_checkzone(struct sockaddr_in6 *sa6)
 {
 	int scope;
 
 	scope = in6_addrscope(&sa6->sin6_addr);
 	/* Global-scope addresses must not carry a zone id. */
 	if (scope == IPV6_ADDR_SCOPE_GLOBAL)
 		return (sa6->sin6_scope_id ? EINVAL: 0);
 	/*
 	 * Multicast with a scope other than link-local or interface-local:
 	 * a zone id is optional, so this always succeeds; the default is
 	 * filled in only when defaults are enabled.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr) &&
 	    scope != IPV6_ADDR_SCOPE_LINKLOCAL &&
 	    scope != IPV6_ADDR_SCOPE_INTFACELOCAL) {
 		if (sa6->sin6_scope_id == 0 && V_ip6_use_defzone != 0)
 			sa6->sin6_scope_id = V_sid_default.s6id_list[scope];
 		return (0);
 	}
 	/*
 	 * Since ::1 address always configured on the lo0, we can
 	 * automatically set its zone id, when it is not specified.
 	 * Return error, when specified zone id doesn't match with
 	 * actual value.
 	 */
 	if (IN6_IS_ADDR_LOOPBACK(&sa6->sin6_addr)) {
 		if (sa6->sin6_scope_id == 0)
 			sa6->sin6_scope_id = in6_getscopezone(V_loif, scope);
 		else if (sa6->sin6_scope_id != in6_getscopezone(V_loif, scope))
 			return (EADDRNOTAVAIL);
 	}
 	/* XXX: we can validate sin6_scope_id here */
 	if (sa6->sin6_scope_id != 0)
 		return (0);
 	/* No zone id given: fall back to the default table if permitted. */
 	if (V_ip6_use_defzone != 0)
 		sa6->sin6_scope_id = V_sid_default.s6id_list[scope];
 	/* Return error if we can't determine zone id */
 	return (sa6->sin6_scope_id ? 0: EADDRNOTAVAIL);
 }
 
 /*
  * This function is similar to sa6_checkzone, but it uses given ifp
  * to initialize sin6_scope_id.
  *
  * For link-local and interface-local scopes, a zero sin6_scope_id is filled
  * in from ifp and 0 is returned; a non-zero one must match ifp's zone,
  * otherwise EADDRNOTAVAIL is returned.  All remaining cases are delegated
  * to sa6_checkzone().
  */
 int
 sa6_checkzone_ifp(struct ifnet *ifp, struct sockaddr_in6 *sa6)
 {
 	int scope;
 
 	scope = in6_addrscope(&sa6->sin6_addr);
 	if (scope != IPV6_ADDR_SCOPE_LINKLOCAL &&
 	    scope != IPV6_ADDR_SCOPE_INTFACELOCAL)
 		return (sa6_checkzone(sa6));
 
 	if (sa6->sin6_scope_id == 0) {
 		sa6->sin6_scope_id = in6_getscopezone(ifp, scope);
 		return (0);
 	}
 	if (sa6->sin6_scope_id != in6_getscopezone(ifp, scope))
 		return (EADDRNOTAVAIL);
 
 	return (sa6_checkzone(sa6));
 }
diff --git a/sys/netinet6/send.c b/sys/netinet6/send.c
index 0684e2eea22c..6d3496f5b1af 100644
--- a/sys/netinet6/send.c
+++ b/sys/netinet6/send.c
@@ -1,389 +1,390 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2010 Ana Kukec <anchie@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockbuf.h>
 #include <sys/socketvar.h>
 #include <sys/types.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/send.h>
 
 /* Kernel malloc type "send" (M_SEND). */
 static MALLOC_DEFINE(M_SEND, "send", "Secure Neighbour Discovery");
 
 /*
  * The socket used to communicate with the SeND daemon.
  * It is per-VNET; reads and writes in this file happen under send_mtx.
  */
 VNET_DEFINE_STATIC(struct socket *, send_so);
 #define	V_send_so	VNET(send_so)
 
 /* Buffer sizes handed to soreserve() when the SeND socket is attached. */
 u_long	send_sendspace	= 8 * (1024 + sizeof(struct sockaddr_send));
 u_long	send_recvspace	= 9216;
 
 /* Mutex guarding V_send_so, plus its init/lock/unlock/destroy wrappers. */
 struct mtx	send_mtx;
 #define SEND_LOCK_INIT()	mtx_init(&send_mtx, "send_mtx", NULL, MTX_DEF)
 #define SEND_LOCK()		mtx_lock(&send_mtx)
 #define SEND_UNLOCK()		mtx_unlock(&send_mtx)
 #define SEND_LOCK_DESTROY()     mtx_destroy(&send_mtx)
 
 /*
  * pr_attach handler: make `so' the SeND socket of the current VNET.
  *
  * Fails with EEXIST if a SeND socket is already attached, with the
  * priv_check() error if the thread lacks PRIV_NETINET_RAW, with
  * EPROTONOSUPPORT for any protocol other than IPPROTO_SEND, and with the
  * soreserve() error if buffer space cannot be reserved.  All checks run
  * with send_mtx held; it is dropped on the single exit path.
  */
 static int
 send_attach(struct socket *so, int proto, struct thread *td)
 {
 	int error;
 
 	SEND_LOCK();
 
 	if (V_send_so != NULL) {
 		error = EEXIST;
 		goto out;
 	}
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error != 0)
 		goto out;
 
 	if (proto != IPPROTO_SEND) {
 		error = EPROTONOSUPPORT;
 		goto out;
 	}
 
 	error = soreserve(so, send_sendspace, send_recvspace);
 	if (error != 0)
 		goto out;
 
 	V_send_so = so;
 out:
 	SEND_UNLOCK();
 	return (error);
 }
 
 /*
  * Re-inject a packet that the user space SeND daemon handed back.
  *
  * SND_IN:  the packet is fed to the neighbor discovery / redirect input
  *          routine selected by its ICMPv6 type, with rcvif set to ifp.
  *          Unknown types are freed and reported as ENOSYS.
  * SND_OUT: the packet is transmitted through ifp->if_output(); any
  *          non-zero result is reported as ENOENT.
  *
  * Returns ENOBUFS when m_pullup() fails.  Panics if the mbuf lacks a packet
  * header (SND_IN) or if direction is neither SND_IN nor SND_OUT.
  */
 static int
 send_output(struct mbuf *m, struct ifnet *ifp, int direction)
 {
 	struct ip6_hdr *ip6;
 	struct sockaddr_in6 dst;
 	struct icmp6_hdr *icmp6;
 	struct epoch_tracker et;
 	int icmp6len;
 	int error;
 
 	/*
 	 * Receive incoming (SeND-protected) or outgoing traffic
 	 * (SeND-validated) from the SeND user space application.
 	 */
 
 	switch (direction) {
 	case SND_IN:
 		/* Make the IPv6 and ICMPv6 headers contiguous for mtod(). */
 		if (m->m_len < (sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_hdr))) {
 			m = m_pullup(m, sizeof(struct ip6_hdr) +
 			    sizeof(struct icmp6_hdr));
 			if (!m)
 				return (ENOBUFS);
 		}
 
 		/* Before passing off the mbuf record the proper interface. */
 		m->m_pkthdr.rcvif = ifp;
 
 		/* ICMPv6 length is the packet length minus the IPv6 header. */
 		if (m->m_flags & M_PKTHDR)
 			icmp6len = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 		else
 			panic("Doh! not the first mbuf.");
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 		error = 0;
 
 		/*
 		 * Output the packet as icmp6.c:icmp6_input() would do.
 		 * The mbuf is always consumed, so we do not have to
 		 * care about that.
 		 */
 		NET_EPOCH_ENTER(et);
 		switch (icmp6->icmp6_type) {
 		case ND_NEIGHBOR_SOLICIT:
 			nd6_ns_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_NEIGHBOR_ADVERT:
 			nd6_na_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_REDIRECT:
 			icmp6_redirect_input(m, sizeof(struct ip6_hdr));
 			break;
 		case ND_ROUTER_SOLICIT:
 			nd6_rs_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_ROUTER_ADVERT:
 			nd6_ra_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		default:
 			/* Not a message type handled here: drop it. */
 			m_freem(m);
 			error = ENOSYS;
 		}
 		NET_EPOCH_EXIT(et);
 
 		return (error);
 
 	case SND_OUT:
 		/* Only the IPv6 header is inspected on the way out. */
 		if (m->m_len < sizeof(struct ip6_hdr)) {
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 			if (!m)
 				return (ENOBUFS);
 		}
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 			m->m_flags |= M_MCAST;
 
 		/* Build the destination sockaddr from the IPv6 header. */
 		bzero(&dst, sizeof(dst));
 		dst.sin6_family = AF_INET6;
 		dst.sin6_len = sizeof(dst);
 		dst.sin6_addr = ip6->ip6_dst;
 
 		m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 
 		IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6);
 
 		/*
 		 * Output the packet as nd6.c:nd6_output_lle() would do.
 		 * The mbuf is always consumed, so we do not have to care
 		 * about that.
 		 * XXX-BZ as we added data, what about fragmenting,
 		 * if now needed?
 		 */
 		error = ((*ifp->if_output)(ifp, m, (struct sockaddr *)&dst,
 		    NULL));
 		/* The specific if_output error is collapsed into ENOENT. */
 		if (error)
 			error = ENOENT;
 		return (error);
 
 	default:
 		panic("%s: direction %d neither SND_IN nor SND_OUT.",
 		     __func__, direction);
 	}
 }
 
 /*
  * Receive a SeND message from user space to be either sent out by the kernel
  * or, with SeND ICMPv6 options removed, to be further processed by the icmp6
  * input path.
  *
  * `nam' must be a struct sockaddr_send with family AF_INET6 (else
  * EAFNOSUPPORT) and the exact structure length (else EINVAL).  Its
  * send_ifidx selects the interface (ENETUNREACH if the lookup fails) and
  * its send_direction is passed to send_output().  `control' is always
  * freed; `m' is freed here only on the early error paths, otherwise it is
  * handed to send_output().
  */
 static int
 send_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct sockaddr_send *sendsrc;
 	struct ifnet *ifp;
 	int error;
 
 	KASSERT(V_send_so == so, ("%s: socket %p not send socket %p",
 		__func__, so, V_send_so));
 
 	/* NOTE(review): nam is dereferenced without a NULL check - confirm. */
 	sendsrc = (struct sockaddr_send *)nam;
 	if (sendsrc->send_family != AF_INET6) {
 		error = EAFNOSUPPORT;
 		goto err;
 	}
 	if (sendsrc->send_len != sizeof(*sendsrc)) {
 		error = EINVAL;
 		goto err;
 	}
 	/* Take a reference on the interface for the duration of the call. */
 	ifp = ifnet_byindex_ref(sendsrc->send_ifidx);
 	if (ifp == NULL) {
 		error = ENETUNREACH;
 		goto err;
 	}
 
 	error = send_output(m, ifp, sendsrc->send_direction);
 	if_rele(ifp);
 	/* The mbuf was given to send_output(); do not free it below. */
 	m = NULL;
 
 err:
 	if (control != NULL)
 		m_freem(control);
 	if (m != NULL)
 		m_freem(m);
 
 	return (error);
 }
 
 /*
  * pr_detach handler: forget the SeND socket of the current VNET.
  * The `so' argument is not consulted; whatever socket is recorded in
  * V_send_so is cleared under send_mtx.
  */
 static void
 send_close(struct socket *so)
 {
 
 	SEND_LOCK();
 	if (V_send_so != NULL) {
 		V_send_so = NULL;
 	}
 	SEND_UNLOCK();
 }
 
 /*
  * Send a SeND message to user space, that was either received and has to be
  * validated or was about to be sent out and has to be handled by the SEND
  * daemon adding SeND ICMPv6 options.
  *
  * Returns -1 (leaving `m' untouched) when no SeND socket is attached.
  * Otherwise returns 0: the packet is either appended to the socket's
  * receive buffer or, if the append fails, freed.  `msglen' is unused.
  */
 static int
 send_input(struct mbuf *m, struct ifnet *ifp, int direction, int msglen __unused)
 {
 	struct ip6_hdr *ip6;
 	struct sockaddr_send sendsrc;
 
 	SEND_LOCK();
 	if (V_send_so == NULL) {
 		SEND_UNLOCK();
 		return (-1);
 	}
 
 	/*
 	 * Make sure to clear any possible internally embedded scope before
 	 * passing the packet to user space for SeND cryptographic signature
 	 * validation to succeed.
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	/* Describe direction and interface to the daemon via the address. */
 	bzero(&sendsrc, sizeof(sendsrc));
 	sendsrc.send_len = sizeof(sendsrc);
 	sendsrc.send_family = AF_INET6;
 	sendsrc.send_direction = direction;
 	sendsrc.send_ifidx = ifp->if_index;
 
 	/*
 	 * Send incoming or outgoing traffic to user space either to be
 	 * protected (outgoing) or validated (incoming) according to rfc3971.
 	 *
 	 * NOTE(review): no explicit SOCKBUF_UNLOCK() follows; presumably
 	 * soroverflow_locked()/sorwakeup_locked() release the lock - confirm.
 	 */
 	SOCKBUF_LOCK(&V_send_so->so_rcv);
 	if (sbappendaddr_locked(&V_send_so->so_rcv,
 	    (struct sockaddr *)&sendsrc, m, NULL) == 0) {
 		soroverflow_locked(V_send_so);
 		/* XXX stats. */
 		m_freem(m);
 	} else {
 		sorwakeup_locked(V_send_so);
 	}
 
 	SEND_UNLOCK();
 	return (0);
 }
 
 /*
  * Protocol switch entry for the SeND raw socket (IPPROTO_SEND).
  * Messages are atomic and carry an address (PR_ATOMIC|PR_ADDR); attach,
  * send and detach are served by the handlers above.
  */
 static struct protosw send_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_protocol =		IPPROTO_SEND,
 	.pr_attach =		send_attach,
 	.pr_send =		send_send,
 	.pr_detach =		send_close
 };
 
 /*
  * Module event handler for the SeND kernel module.
  *
  * MOD_LOAD initializes send_mtx, registers send_protosw with the inet6
  * domain and, on success, installs send_input() as send_sendso_input_hook.
  * If registration fails the mutex is destroyed again and the error is
  * returned.  MOD_UNLOAD is always refused with EBUSY; the teardown code
  * after it is compiled only under __notyet__.  Any other event returns 0.
  */
 static int
 send_modevent(module_t mod, int type, void *unused)
 {
 #ifdef __notyet__
 	VNET_ITERATOR_DECL(vnet_iter);
 #endif
 	int error;
 
 	switch (type) {
 	case MOD_LOAD:
 		SEND_LOCK_INIT();
 
 		error = protosw_register(&inet6domain, &send_protosw);
 		if (error != 0) {
 			/* Report the function that actually failed. */
 			printf("%s:%d: MOD_LOAD protosw_register(): %d\n",
 			   __func__, __LINE__, error);
 			SEND_LOCK_DESTROY();
 			break;
 		}
 		send_sendso_input_hook = send_input;
 		break;
 	case MOD_UNLOAD:
 		/* Do not allow unloading w/o locking. */
 		return (EBUSY);
 #ifdef __notyet__
 		/* Refuse to unload while any VNET still has a SeND socket. */
 		VNET_LIST_RLOCK_NOSLEEP();
 		SEND_LOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 			if (V_send_so != NULL) {
 				CURVNET_RESTORE();
 				SEND_UNLOCK();
 				VNET_LIST_RUNLOCK_NOSLEEP();
 				return (EBUSY);
 			}
 			CURVNET_RESTORE();
 		}
 		SEND_UNLOCK();
 		VNET_LIST_RUNLOCK_NOSLEEP();
 		error = protosw_unregister(&send_protosw);
 		if (error == 0)
 			SEND_LOCK_DESTROY();
 		send_sendso_input_hook = NULL;
 		break;
 #endif
 	default:
 		error = 0;
 		break;
 	}
 
 	return (error);
 }
 
 /* Module description: name "send", event handler, no private data. */
 static moduledata_t sendmod = {
 	"send",
 	send_modevent,
 	0
 };
 
 /* Register the module at SI_SUB_PROTO_DOMAIN, in any order. */
 DECLARE_MODULE(send, sendmod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
diff --git a/sys/netipsec/ipsec_input.c b/sys/netipsec/ipsec_input.c
index 9feb0df8bd15..7abf74426853 100644
--- a/sys/netipsec/ipsec_input.c
+++ b/sys/netipsec/ipsec_input.c
@@ -1,776 +1,777 @@
 /*	$OpenBSD: ipsec_input.c,v 1.63 2003/02/20 18:35:43 deraadt Exp $	*/
 /*-
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and
  * Niels Provos (provos@physnet.uni-hamburg.de).
  *
  * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
  * in November 1995.
  *
  * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
  * by Angelos D. Keromytis.
  *
  * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
  * and Niels Provos.
  *
  * Additional features in 1999 by Angelos D. Keromytis.
  *
  * Copyright (C) 1995, 1996, 1997, 1998, 1999 by John Ioannidis,
  * Angelos D. Keromytis and Niels Provos.
  * Copyright (c) 2001, Angelos D. Keromytis.
  * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all copies of any software which is or includes a copy or
  * modification of this software.
  * You may use this code under the GNU public license if you so wish. Please
  * contribute changes back to the authors under this freer than GPL license
  * so that we may further the use of strong encryption without limitations to
  * all.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  */
 
 /*
  * IPsec input processing.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/errno.h>
 #include <sys/hhook.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_enc.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/in_var.h>
 #include <netinet/tcp_var.h>
 
 #include <netinet/ip6.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/in_pcb.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #endif
 
 #include <netipsec/ipsec.h>
 #ifdef INET6
 #include <netipsec/ipsec6.h>
 #endif
 #include <netipsec/ipsec_support.h>
 #include <netipsec/ah_var.h>
 #include <netipsec/esp.h>
 #include <netipsec/esp_var.h>
 #include <netipsec/ipcomp_var.h>
 
 #include <netipsec/key.h>
 #include <netipsec/keydb.h>
 #include <netipsec/key_debug.h>
 
 #include <netipsec/xform.h>
 
 #include <machine/in_cksum.h>
 #include <machine/stdarg.h>
 
 /*
  * Bump the per-protocol input statistic `name': the ESP counter for
  * IPPROTO_ESP, the AH counter for IPPROTO_AH, and the IPCOMP counter for
  * any other value of `proto'.
  */
 #define	IPSEC_ISTAT(proto, name)	do {	\
 	if ((proto) == IPPROTO_ESP)		\
 		ESPSTAT_INC(esps_##name);	\
 	else if ((proto) == IPPROTO_AH)		\
 		AHSTAT_INC(ahs_##name);		\
 	else					\
 		IPCOMPSTAT_INC(ipcomps_##name);	\
 } while (0)
 
 /*
  * ipsec_common_input gets called when an IPsec-protected packet
  * is received by IPv4 or IPv6.  Its job is to find the right SA
  * and call the appropriate transform.  The transform callback
  * takes care of further processing (like ingress filtering).
  *
  * `skip' is the offset of the IPsec header in the packet, `protoff' is
  * forwarded untouched to the transform, `af' is the outer address family
  * and `sproto' one of IPPROTO_ESP, IPPROTO_AH or IPPROTO_IPCOMP.
  *
  * Every error path here frees the mbuf.  Returns EOPNOTSUPP when the
  * protocol is disabled, EINVAL for a too-short packet, EPFNOSUPPORT for an
  * unsupported address family, ENOENT when no SA matches, ENXIO when the SA
  * has no transform, and otherwise whatever the transform's xf_input
  * returns.
  */
 static int
 ipsec_common_input(struct mbuf *m, int skip, int protoff, int af, int sproto)
 {
 	IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
 	union sockaddr_union dst_address;
 	struct secasvar *sav;
 	uint32_t spi;
 	int error;
 
 	IPSEC_ISTAT(sproto, input);
 
 	IPSEC_ASSERT(m != NULL, ("null packet"));
 
 	IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
 		sproto == IPPROTO_IPCOMP,
 		("unexpected security protocol %u", sproto));
 
 	/* Drop packets for protocols that are administratively disabled. */
 	if ((sproto == IPPROTO_ESP && !V_esp_enable) ||
 	    (sproto == IPPROTO_AH && !V_ah_enable) ||
 	    (sproto == IPPROTO_IPCOMP && !V_ipcomp_enable)) {
 		m_freem(m);
 		IPSEC_ISTAT(sproto, pdrops);
 		return EOPNOTSUPP;
 	}
 
 	/* At least 8 bytes must follow `skip' so the SPI/CPI can be read. */
 	if (m->m_pkthdr.len - skip < 2 * sizeof (u_int32_t)) {
 		m_freem(m);
 		IPSEC_ISTAT(sproto, hdrops);
 		DPRINTF(("%s: packet too small\n", __func__));
 		return EINVAL;
 	}
 
 	/* Retrieve the SPI from the relevant IPsec header */
 	if (sproto == IPPROTO_ESP)
 		m_copydata(m, skip, sizeof(u_int32_t), (caddr_t) &spi);
 	else if (sproto == IPPROTO_AH)
 		m_copydata(m, skip + sizeof(u_int32_t), sizeof(u_int32_t),
 		    (caddr_t) &spi);
 	else if (sproto == IPPROTO_IPCOMP) {
 		u_int16_t cpi;
 		m_copydata(m, skip + sizeof(u_int16_t), sizeof(u_int16_t),
 		    (caddr_t) &cpi);
 		/* Widen the 16-bit CPI into the 32-bit SPI lookup key. */
 		spi = ntohl(htons(cpi));
 	}
 
 	/*
 	 * Find the SA and (indirectly) call the appropriate
 	 * kernel crypto routine. The resulting mbuf chain is a valid
 	 * IP packet ready to go through input processing.
 	 */
 	bzero(&dst_address, sizeof (dst_address));
 	dst_address.sa.sa_family = af;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		dst_address.sin.sin_len = sizeof(struct sockaddr_in);
 		m_copydata(m, offsetof(struct ip, ip_dst),
 		    sizeof(struct in_addr),
 		    (caddr_t) &dst_address.sin.sin_addr);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		dst_address.sin6.sin6_len = sizeof(struct sockaddr_in6);
 		m_copydata(m, offsetof(struct ip6_hdr, ip6_dst),
 		    sizeof(struct in6_addr),
 		    (caddr_t) &dst_address.sin6.sin6_addr);
 		/* We keep addresses in SADB without embedded scope id */
 		if (IN6_IS_SCOPE_LINKLOCAL(&dst_address.sin6.sin6_addr)) {
 			/* XXX: sa6_recoverscope() */
 			dst_address.sin6.sin6_scope_id =
 			    ntohs(dst_address.sin6.sin6_addr.s6_addr16[1]);
 			dst_address.sin6.sin6_addr.s6_addr16[1] = 0;
 		}
 		break;
 #endif /* INET6 */
 	default:
 		DPRINTF(("%s: unsupported protocol family %u\n", __func__, af));
 		m_freem(m);
 		IPSEC_ISTAT(sproto, nopf);
 		return EPFNOSUPPORT;
 	}
 
 	/* NB: only pass dst since key_allocsa follows RFC2401 */
 	sav = key_allocsa(&dst_address, sproto, spi);
 	if (sav == NULL) {
 		DPRINTF(("%s: no key association found for SA %s/%08lx/%u\n",
 		    __func__, ipsec_address(&dst_address, buf, sizeof(buf)),
 		    (u_long) ntohl(spi), sproto));
 		IPSEC_ISTAT(sproto, notdb);
 		m_freem(m);
 		return ENOENT;
 	}
 
 	/* An SA without a transform cannot process the packet. */
 	if (sav->tdb_xform == NULL) {
 		DPRINTF(("%s: attempted to use uninitialized SA %s/%08lx/%u\n",
 		    __func__, ipsec_address(&dst_address, buf, sizeof(buf)),
 		    (u_long) ntohl(spi), sproto));
 		IPSEC_ISTAT(sproto, noxform);
 		key_freesav(&sav);
 		m_freem(m);
 		return ENXIO;
 	}
 
 	/*
 	 * Call appropriate transform and return -- callback takes care of
 	 * everything else.
 	 */
 	error = (*sav->tdb_xform->xf_input)(m, sav, skip, protoff);
 	return (error);
 }
 
 #ifdef INET
 /*
  * IPSEC_INPUT() method implementation for IPv4.
  *
  * Return values:
  *  0 - Permitted by inbound security policy for further processing.
  *  EACCES - Forbidden by inbound security policy (the mbuf is freed).
  *  EINPROGRESS - consumed by IPsec.
  */
 int
 ipsec4_input(struct mbuf *m, int offset, int proto)
 {
 
 	/* AH/ESP/IPCOMP go through inbound IPsec processing right away. */
 	if (proto == IPPROTO_AH || proto == IPPROTO_ESP ||
 	    proto == IPPROTO_IPCOMP) {
 		ipsec_common_input(m, offset,
 		    offsetof(struct ip, ip_p), AF_INET, proto);
 		return (EINPROGRESS); /* mbuf consumed by IPsec */
 	}
 
 	/*
 	 * Only the protocols listed here are policy-checked at this point.
 	 * Protocols with further headers get their IPsec treatment within
 	 * the protocol specific processing.
 	 */
 	switch (proto) {
 	case IPPROTO_ICMP:
 	case IPPROTO_IGMP:
 	case IPPROTO_IPV4:
 	case IPPROTO_IPV6:
 	case IPPROTO_RSVP:
 	case IPPROTO_GRE:
 	case IPPROTO_MOBILE:
 	case IPPROTO_ETHERIP:
 	case IPPROTO_PIM:
 	case IPPROTO_SCTP:
 		break;
 	default:
 		return (0);
 	}
 
 	/* Enforce IPsec policy checking, as we are seeing the last header. */
 	if (ipsec4_in_reject(m, NULL) != 0) {
 		/* Forbidden by inbound security policy */
 		m_freem(m);
 		return (EACCES);
 	}
 	return (0);
 }
 
 /*
  * Handle an ICMP message that quotes an IPsec (ESP/AH/IPCOMP) packet and
  * carries a next-hop MTU.
  *
  * Returns EINVAL if the advertised MTU is below V_ip4_ipsec_min_pmtu or the
  * quoted packet is not ESP/AH/IPCOMP, and ENOENT if no SA matches the
  * quoted destination, protocol and SPI.  Otherwise the TCP host cache MTU
  * for the quoted destination is lowered when the advertised value is
  * smaller than the current one, and 0 is returned.
  */
 int
 ipsec4_ctlinput(ipsec_ctlinput_param_t param)
 {
 	struct icmp *icp = param.icmp;
 	struct ip *ip = &icp->icmp_ip;
 	struct sockaddr_in icmpsrc = {
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_family = AF_INET,
 		.sin_addr = ip->ip_dst,
 	};
 	struct in_conninfo inc;
 	struct secasvar *sav;
 	uint32_t pmtu, spi;
 	uint32_t max_pmtu;
 	uint8_t proto;
 
 	pmtu = ntohs(icp->icmp_nextmtu);
 
 	if (pmtu < V_ip4_ipsec_min_pmtu)
 		return (EINVAL);
 
 	proto = ip->ip_p;
 	if (proto != IPPROTO_ESP && proto != IPPROTO_AH &&
 	    proto != IPPROTO_IPCOMP)
 		return (EINVAL);
 
 	/*
 	 * The SPI is read from the first 4 bytes after the quoted IP header.
 	 * NOTE(review): ipsec_common_input() reads the AH SPI at offset +4
 	 * and the IPCOMP CPI at +2; confirm this offset is intended for
 	 * those protocols too.
 	 */
 	memcpy(&spi, (caddr_t)ip + (ip->ip_hl << 2), sizeof(spi));
 	sav = key_allocsa((union sockaddr_union *)&icmpsrc, proto, spi);
 	if (sav == NULL)
 		return (ENOENT);
 
 	/* The lookup only proves the SA exists; drop the reference again. */
 	key_freesav(&sav);
 
 	memset(&inc, 0, sizeof(inc));
 	inc.inc_faddr = ip->ip_dst;
 
 	/* Update pmtu only if its smaller than the current one. */
 	max_pmtu = tcp_hc_getmtu(&inc);
 	if (max_pmtu == 0)
 		max_pmtu = tcp_maxmtu(&inc, NULL);
 
 	if (pmtu < max_pmtu)
 		tcp_hc_updatemtu(&inc, pmtu);
 
 	return (0);
 }
 
 /*
  * IPsec input callback for INET protocols.
  * This routine is called as the transform callback.
  * Takes care of filtering and other sanity checks on
  * the processed packet.
  *
  * The SA reference in `sav' is released on every path.  On error the mbuf
  * is freed (when still held) and the error returned; on success the packet
  * has been queued to the IPv4 or IPv6 netisr and the queueing result is
  * returned.
  */
 int
 ipsec4_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip,
     int protoff)
 {
 	IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
 	struct epoch_tracker et;
 	struct ipsec_ctx_data ctx;
 	struct xform_history *xh;
 	struct secasindex *saidx;
 	struct m_tag *mtag;
 	struct ip *ip;
 	int error, prot, af, sproto, isr_prot;
 
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
 	saidx = &sav->sah->saidx;
 	af = saidx->dst.sa.sa_family;
 	IPSEC_ASSERT(af == AF_INET, ("unexpected af %u", af));
 	sproto = saidx->proto;
 	IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
 		sproto == IPPROTO_IPCOMP,
 		("unexpected security protocol %u", sproto));
 
 	if (skip != 0) {
 		/*
 		 * Fix IPv4 header
 		 */
 		if (m->m_len < skip && (m = m_pullup(m, skip)) == NULL) {
 			DPRINTF(("%s: processing failed for SA %s/%08lx\n",
 			    __func__, ipsec_address(&sav->sah->saidx.dst,
 			    buf, sizeof(buf)), (u_long) ntohl(sav->spi)));
 			IPSEC_ISTAT(sproto, hdrops);
 			error = ENOBUFS;
 			/* The net epoch has not been entered yet. */
 			goto bad_noepoch;
 		}
 
 		/* Recompute length and checksum for the new packet size. */
 		ip = mtod(m, struct ip *);
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
 	} else {
 		ip = mtod(m, struct ip *);
 	}
 	prot = ip->ip_p;
 	/*
 	 * Check that we have NAT-T enabled and apply transport mode
 	 * decapsulation NAT procedure (RFC3948).
 	 * Do this before invoking into the PFIL.
 	 */
 	if (sav->natt != NULL &&
 	    (prot == IPPROTO_UDP || prot == IPPROTO_TCP))
 		udp_ipsec_adjust_cksum(m, sav, prot, skip);
 
 	/*
 	 * Needed for ipsec_run_hooks and netisr_queue_src
 	 */
 	NET_EPOCH_ENTER(et);
 
 	/* First hook pass: the packet still has its outer header. */
 	IPSEC_INIT_CTX(&ctx, &m, NULL, sav, AF_INET, IPSEC_ENC_BEFORE);
 	if ((error = ipsec_run_hhooks(&ctx, HHOOK_TYPE_IPSEC_IN)) != 0)
 		goto bad;
 	ip = mtod(m, struct ip *);	/* update pointer */
 
 	/* IP-in-IP encapsulation */
 	if (prot == IPPROTO_IPIP &&
 	    saidx->mode != IPSEC_MODE_TRANSPORT) {
 		if (m->m_pkthdr.len - skip < sizeof(struct ip)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* enc0: strip outer IPv4 header */
 		m_striphdr(m, 0, ip->ip_hl << 2);
 	}
 #ifdef INET6
 	/* IPv6-in-IP encapsulation. */
 	else if (prot == IPPROTO_IPV6 &&
 	    saidx->mode != IPSEC_MODE_TRANSPORT) {
 		if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* enc0: strip IPv4 header, keep IPv6 header only */
 		m_striphdr(m, 0, ip->ip_hl << 2);
 	}
 #endif /* INET6 */
 	else if (prot != IPPROTO_IPV6 && saidx->mode == IPSEC_MODE_ANY) {
 		/*
 		 * When mode is wildcard, inner protocol is IPv6 and
 		 * we have no INET6 support - drop this packet a bit later.
 		 * In other cases we assume transport mode. Set prot to
 		 * correctly choose netisr.
 		 */
 		prot = IPPROTO_IPIP;
 	}
 
 	/*
 	 * Record what we've done to the packet (under what SA it was
 	 * processed).
 	 */
 	if (sproto != IPPROTO_IPCOMP) {
 		mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE,
 		    sizeof(struct xform_history), M_NOWAIT);
 		if (mtag == NULL) {
 			DPRINTF(("%s: failed to get tag\n", __func__));
 			IPSEC_ISTAT(sproto, hdrops);
 			error = ENOMEM;
 			goto bad;
 		}
 
 		/* The history record lives right after the tag header. */
 		xh = (struct xform_history *)(mtag + 1);
 		bcopy(&saidx->dst, &xh->dst, saidx->dst.sa.sa_len);
 		xh->spi = sav->spi;
 		xh->proto = sproto;
 		xh->mode = saidx->mode;
 		m_tag_prepend(m, mtag);
 	}
 
 	key_sa_recordxfer(sav, m);		/* record data transfer */
 
 	/*
 	 * In transport mode requeue decrypted mbuf back to IPv4 protocol
 	 * handler. This is necessary to correctly expose rcvif.
 	 */
 	if (saidx->mode == IPSEC_MODE_TRANSPORT)
 		prot = IPPROTO_IPIP;
 	/*
 	 * Re-dispatch via software interrupt.
 	 */
 	switch (prot) {
 	case IPPROTO_IPIP:
 		isr_prot = NETISR_IP;
 		af = AF_INET;
 		break;
 #ifdef INET6
 	case IPPROTO_IPV6:
 		isr_prot = NETISR_IPV6;
 		af = AF_INET6;
 		break;
 #endif
 	default:
 		DPRINTF(("%s: cannot handle inner ip proto %d\n",
 			    __func__, prot));
 		IPSEC_ISTAT(sproto, nopf);
 		error = EPFNOSUPPORT;
 		goto bad;
 	}
 
 	/* Second hook pass: the packet is in its decapsulated form. */
 	IPSEC_INIT_CTX(&ctx, &m, NULL, sav, af, IPSEC_ENC_AFTER);
 	if ((error = ipsec_run_hhooks(&ctx, HHOOK_TYPE_IPSEC_IN)) != 0)
 		goto bad;
 
 	/* Handle virtual tunneling interfaces */
 	if (saidx->mode == IPSEC_MODE_TUNNEL)
 		error = ipsec_if_input(m, sav, af);
 	/* error is 0 here unless ipsec_if_input() just reported otherwise. */
 	if (error == 0) {
 		error = netisr_queue_src(isr_prot, (uintptr_t)sav->spi, m);
 		if (error) {
 			IPSEC_ISTAT(sproto, qfull);
 			DPRINTF(("%s: queue full; proto %u packet dropped\n",
 			    __func__, sproto));
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	key_freesav(&sav);
 	return (error);
 bad:
 	NET_EPOCH_EXIT(et);
 bad_noepoch:
 	key_freesav(&sav);
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Tell whether `proto' is one of the protocols that ipsec6_input() and
  * ipsec6_common_input_cb() treat as a last header, i.e. one for which the
  * inbound IPsec policy is checked on the spot.
  */
 static bool
 ipsec6_lasthdr(int proto)
 {
 
 	return (proto == IPPROTO_IPV4 || proto == IPPROTO_IPV6 ||
 	    proto == IPPROTO_GRE || proto == IPPROTO_ICMPV6 ||
 	    proto == IPPROTO_ETHERIP || proto == IPPROTO_PIM ||
 	    proto == IPPROTO_SCTP);
 }
 
 /*
  * IPSEC_INPUT() method implementation for IPv6.
  *
  * Return values:
  *  0 - Permitted by inbound security policy for further processing.
  *  EACCES - Forbidden by inbound security policy (the mbuf is freed).
  *  EINPROGRESS - consumed by IPsec.
  */
 int
 ipsec6_input(struct mbuf *m, int offset, int proto)
 {
 
 	/* AH/ESP/IPCOMP go through inbound IPsec processing right away. */
 	if (proto == IPPROTO_AH || proto == IPPROTO_ESP ||
 	    proto == IPPROTO_IPCOMP) {
 		ipsec_common_input(m, offset,
 		    offsetof(struct ip6_hdr, ip6_nxt), AF_INET6, proto);
 		return (EINPROGRESS); /* mbuf consumed by IPsec */
 	}
 
 	/*
 	 * Protocols with further headers get their IPsec treatment
 	 * within the protocol specific processing.
 	 */
 	if (!ipsec6_lasthdr(proto))
 		return (0);
 
 	/* Enforce IPsec policy checking, as we are seeing the last header. */
 	if (ipsec6_in_reject(m, NULL) != 0) {
 		/* Forbidden by inbound security policy */
 		m_freem(m);
 		return (EACCES);
 	}
 	return (0);
 }
 
 /*
  * IPv6 counterpart of ipsec4_ctlinput().  Currently a stub: the parameter
  * is ignored and 0 is always returned.
  */
 int
 ipsec6_ctlinput(ipsec_ctlinput_param_t param)
 {
 	return (0);
 }
 
 /* Per-protocol IPv6 input handlers, indexed by next-header value below. */
 extern ipproto_input_t	*ip6_protox[];
 
 /*
  * IPsec input callback, called by the transform callback. Takes care of
  * filtering and other sanity checks on the processed packet.
  *
  * When the outer IPv6 header was stripped (tunnelled IPv6 or IPv4 payload)
  * the inner packet is requeued through netisr.  Otherwise the remaining
  * header chain is walked in place through ip6_protox[].  The SA reference
  * in `sav' is released on every path; on error the mbuf is freed when
  * still held.
  */
 int
 ipsec6_common_input_cb(struct mbuf *m, struct secasvar *sav, int skip,
     int protoff)
 {
 	IPSEC_DEBUG_DECLARE(char buf[IPSEC_ADDRSTRLEN]);
 	struct epoch_tracker et;
 	struct ipsec_ctx_data ctx;
 	struct xform_history *xh;
 	struct secasindex *saidx;
 	struct ip6_hdr *ip6;
 	struct m_tag *mtag;
 	int prot, af, sproto;
 	int nxt, isr_prot;
 	int error, nest;
 	uint8_t nxt8;
 
 	IPSEC_ASSERT(sav != NULL, ("null SA"));
 	IPSEC_ASSERT(sav->sah != NULL, ("null SAH"));
 	saidx = &sav->sah->saidx;
 	af = saidx->dst.sa.sa_family;
 	IPSEC_ASSERT(af == AF_INET6, ("unexpected af %u", af));
 	sproto = saidx->proto;
 	IPSEC_ASSERT(sproto == IPPROTO_ESP || sproto == IPPROTO_AH ||
 		sproto == IPPROTO_IPCOMP,
 		("unexpected security protocol %u", sproto));
 
 	NET_EPOCH_ENTER(et);
 
 	/* Fix IPv6 header */
 	if (m->m_len < sizeof(struct ip6_hdr) &&
 	    (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
 		DPRINTF(("%s: processing failed for SA %s/%08lx\n",
 		    __func__, ipsec_address(&sav->sah->saidx.dst, buf,
 		    sizeof(buf)), (u_long) ntohl(sav->spi)));
 
 		IPSEC_ISTAT(sproto, hdrops);
 		error = EACCES;
 		goto bad;
 	}
 
 	/* First hook pass: the packet still has its outer header. */
 	IPSEC_INIT_CTX(&ctx, &m, NULL, sav, af, IPSEC_ENC_BEFORE);
 	if ((error = ipsec_run_hhooks(&ctx, HHOOK_TYPE_IPSEC_IN)) != 0)
 		goto bad;
 
 	/* Recompute the payload length for the new packet size. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(struct ip6_hdr));
 
 	/* Save protocol */
 	m_copydata(m, protoff, 1, &nxt8);
 	prot = nxt8;
 
 	/* IPv6-in-IP encapsulation */
 	if (prot == IPPROTO_IPV6 &&
 	    saidx->mode != IPSEC_MODE_TRANSPORT) {
 		if (m->m_pkthdr.len - skip < sizeof(struct ip6_hdr)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* The mbuf will now start at the inner IPv6 header. */
 		m_striphdr(m, 0, skip);
 		skip = 0;
 	}
 #ifdef INET
 	/* IP-in-IP encapsulation */
 	else if (prot == IPPROTO_IPIP &&
 	    saidx->mode != IPSEC_MODE_TRANSPORT) {
 		if (m->m_pkthdr.len - skip < sizeof(struct ip)) {
 			IPSEC_ISTAT(sproto, hdrops);
 			error = EINVAL;
 			goto bad;
 		}
 		/* The mbuf will now start at the inner IPv4 header. */
 		m_striphdr(m, 0, skip);
 		skip = 0;
 	}
 #endif /* INET */
 	else {
 		prot = IPPROTO_IPV6; /* for correct BPF processing */
 	}
 
 	/*
 	 * Record what we've done to the packet (under what SA it was
 	 * processed).
 	 */
 	if (sproto != IPPROTO_IPCOMP) {
 		mtag = m_tag_get(PACKET_TAG_IPSEC_IN_DONE,
 		    sizeof(struct xform_history), M_NOWAIT);
 		if (mtag == NULL) {
 			DPRINTF(("%s: failed to get tag\n", __func__));
 			IPSEC_ISTAT(sproto, hdrops);
 			error = ENOMEM;
 			goto bad;
 		}
 
 		/* The history record lives right after the tag header. */
 		xh = (struct xform_history *)(mtag + 1);
 		bcopy(&saidx->dst, &xh->dst, saidx->dst.sa.sa_len);
 		xh->spi = sav->spi;
 		xh->proto = sproto;
 		xh->mode = saidx->mode;
 		m_tag_prepend(m, mtag);
 	}
 
 	key_sa_recordxfer(sav, m);
 
 	/* Address family of the packet as it looks after decapsulation. */
 #ifdef INET
 	if (prot == IPPROTO_IPIP)
 		af = AF_INET;
 	else
 #endif
 		af = AF_INET6;
 	/* Second hook pass: the packet is in its decapsulated form. */
 	IPSEC_INIT_CTX(&ctx, &m, NULL, sav, af, IPSEC_ENC_AFTER);
 	if ((error = ipsec_run_hhooks(&ctx, HHOOK_TYPE_IPSEC_IN)) != 0)
 		goto bad;
 	if (skip == 0) {
 		/*
 		 * We stripped outer IPv6 header.
 		 * Now we should requeue decrypted packet via netisr.
 		 */
 		switch (prot) {
 #ifdef INET
 		case IPPROTO_IPIP:
 			isr_prot = NETISR_IP;
 			break;
 #endif
 		case IPPROTO_IPV6:
 			isr_prot = NETISR_IPV6;
 			break;
 		default:
 			DPRINTF(("%s: cannot handle inner ip proto %d\n",
 			    __func__, prot));
 			IPSEC_ISTAT(sproto, nopf);
 			error = EPFNOSUPPORT;
 			goto bad;
 		}
 		/* Handle virtual tunneling interfaces */
 		if (saidx->mode == IPSEC_MODE_TUNNEL)
 			error = ipsec_if_input(m, sav, af);
 		/* error is 0 here unless ipsec_if_input() said otherwise. */
 		if (error == 0) {
 			error = netisr_queue_src(isr_prot,
 			    (uintptr_t)sav->spi, m);
 			if (error) {
 				IPSEC_ISTAT(sproto, qfull);
 				DPRINTF(("%s: queue full; proto %u packet"
 				    " dropped\n", __func__, sproto));
 			}
 		}
 		NET_EPOCH_EXIT(et);
 		key_freesav(&sav);
 		return (error);
 	}
 	/*
 	 * See the end of ip6_input for this logic.
 	 * IPPROTO_IPV[46] case will be processed just like other ones
 	 */
 	nest = 0;
 	nxt = nxt8;
 	while (nxt != IPPROTO_DONE) {
 		/* A limit of zero disables the nesting check. */
 		if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) {
 			IP6STAT_INC(ip6s_toomanyhdr);
 			error = EINVAL;
 			goto bad;
 		}
 
 		/*
 		 * Protection against faulty packet - there should be
 		 * more sanity checks in header chain processing.
 		 */
 		if (m->m_pkthdr.len < skip) {
 			IP6STAT_INC(ip6s_tooshort);
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated);
 			error = EINVAL;
 			goto bad;
 		}
 		/*
 		 * Enforce IPsec policy checking if we are seeing last header.
 		 * note that we do not visit this with protocols with pcb layer
 		 * code - like udp/tcp/raw ip.
 		 */
 		if (ipsec6_lasthdr(nxt) && ipsec6_in_reject(m, NULL)) {
 			error = EINVAL;
 			goto bad;
 		}
 		/* The handler may update m and skip; it returns the next header. */
 		nxt = ip6_protox[nxt](&m, &skip, nxt);
 	}
 	NET_EPOCH_EXIT(et);
 	key_freesav(&sav);
 	return (0);
 bad:
 	NET_EPOCH_EXIT(et);
 	key_freesav(&sav);
 	if (m)
 		m_freem(m);
 	return (error);
 }
 #endif /* INET6 */
diff --git a/sys/netpfil/ipfilter/netinet/ip_compat.h b/sys/netpfil/ipfilter/netinet/ip_compat.h
index 170326dc33f1..8266f22a63a3 100644
--- a/sys/netpfil/ipfilter/netinet/ip_compat.h
+++ b/sys/netpfil/ipfilter/netinet/ip_compat.h
@@ -1,1243 +1,1244 @@
 /*
  * Copyright (C) 2012 by Darren Reed.
  *
  * See the IPFILTER.LICENCE file for details on licencing.
  *
  * @(#)ip_compat.h	1.8 1/14/96
  * $FreeBSD$
  * Id: ip_compat.h,v 2.142.2.57 2007/10/10 09:51:42 darrenr Exp $
  */
 
 #ifndef	__IP_COMPAT_H__
 #define	__IP_COMPAT_H__
 
 #if defined(_KERNEL) || defined(KERNEL) || defined(__KERNEL__)
 # undef	KERNEL
 # undef	_KERNEL
 # undef 	__KERNEL__
 # define	KERNEL
 # define	_KERNEL
 # define 	__KERNEL__
 #endif
 
 #ifndef	SOLARIS
 # if defined(sun) && defined(__SVR4)
 #  define	SOLARIS		1
 # else
 #  define	SOLARIS		0
 # endif
 #endif
 
 
 #if defined(__SVR4)
 # define index   strchr
 # if !defined(_KERNEL)
 #  define	bzero(a,b)	memset(a,0,b)
 #  define	bcmp		memcmp
 #  define	bcopy(a,b,c)	memmove(b,a,c)
 # endif
 #endif
 
 #ifndef LIFNAMSIZ
 # ifdef IF_NAMESIZE
 #  define	LIFNAMSIZ	IF_NAMESIZE
 # else
 #  ifdef	IFNAMSIZ
 #   define	LIFNAMSIZ	IFNAMSIZ
 #  else
 #   define	LIFNAMSIZ	16
 #  endif
 # endif
 #endif
 
 
 #  define IPL_EXTERN(ep) ipl##ep
 
 /*
  * This is a workaround for <sys/uio.h> troubles on FreeBSD and OpenBSD.
  */
 #ifndef _KERNEL
 # define ADD_KERNEL
 # define _KERNEL
 # define KERNEL
 #endif
 #include <sys/uio.h>
 #ifdef ADD_KERNEL
 # undef _KERNEL
 # undef KERNEL
 #endif
 
 #define	NETBSD_GE_REV(x)	(defined(__NetBSD_Version__) && \
 				 (__NetBSD_Version__ >= (x)))
 #define	NETBSD_GT_REV(x)	(defined(__NetBSD_Version__) && \
 				 (__NetBSD_Version__ > (x)))
 #define	NETBSD_LT_REV(x)	(defined(__NetBSD_Version__) && \
 				 (__NetBSD_Version__ < (x)))
 
 
 /* ----------------------------------------------------------------------- */
 /*                                F R E E B S D                            */
 /* ----------------------------------------------------------------------- */
 #define HAS_SYS_MD5_H	1
 #if defined(_KERNEL)
 # include "opt_bpf.h"
 # include "opt_inet6.h"
 # if defined(INET6) && !defined(USE_INET6)
 #  define USE_INET6
 # endif
 #else
 # if !defined(USE_INET6) && !defined(NOINET6)
 #  define	USE_INET6
 # endif
 #endif
 
 #if defined(_KERNEL)
 # include <netinet/ip_var.h>
 # define	p_cred	td_ucred
 # define	p_uid	td_ucred->cr_ruid
 
 /*
  * When #define'd, the 5.2.1 kernel panics when used with the ftp proxy.
  * There may be other, safe, kernels but this is not extensively tested yet.
  */
 # define HAVE_M_PULLDOWN
 # if !defined(IPFILTER_LKM) && defined(__FreeBSD__)
 #  include "opt_ipfilter.h"
 # endif
 # define	COPYIN(a,b,c)	copyin((caddr_t)(a), (caddr_t)(b), (c))
 # define	COPYOUT(a,b,c)	copyout((caddr_t)(a), (caddr_t)(b), (c))
 
 #else
 # include <inttypes.h>
 #endif /* _KERNEL */
 
 #include <sys/selinfo.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #define	KRWLOCK_FILL_SZ		56
 #define	KMUTEX_FILL_SZ		56
 #include <sys/rwlock.h>
 #define	KMUTEX_T		struct mtx
 #define	KRWLOCK_T		struct rwlock
 
 #ifdef _KERNEL
 # define	READ_ENTER(x)		rw_rlock(&(x)->ipf_lk)
 # define	WRITE_ENTER(x)		rw_wlock(&(x)->ipf_lk)
 # define	MUTEX_DOWNGRADE(x)	rw_downgrade(&(x)->ipf_lk)
 # define	MUTEX_TRY_UPGRADE(x)	rw_try_upgrade(&(x)->ipf_lk)
 # define	RWLOCK_INIT(x,y)	rw_init(&(x)->ipf_lk, (y))
 # define	RW_DESTROY(x)		rw_destroy(&(x)->ipf_lk)
 /*
  * Release the rwlock embedded in x without the caller saying how it was
  * acquired: if rw_wowned() reports the lock as write-owned it is dropped
  * with rw_wunlock(), otherwise with rw_runlock().  x is evaluated more
  * than once, so it must be free of side effects.
  */
 # define	RWLOCK_EXIT(x)		do { \
 					    if (rw_wowned(&(x)->ipf_lk)) \
 					    	rw_wunlock(&(x)->ipf_lk); \
 					    else \
 						rw_runlock(&(x)->ipf_lk); \
 					} while (0)
 # include <net/if_var.h>
+# include <net/if_private.h>
 # define	GETKTIME(x)	microtime((struct timeval *)x)
 # define	if_addrlist	if_addrhead
 
 # include <netinet/in_systm.h>
 # include <netinet/ip.h>
 # include <machine/in_cksum.h>
 
 # define	USE_MUTEXES
 # define	MUTEX_ENTER(x)		mtx_lock(&(x)->ipf_lk)
 # define	MUTEX_EXIT(x)		mtx_unlock(&(x)->ipf_lk)
 # define	MUTEX_INIT(x,y)		mtx_init(&(x)->ipf_lk, (y), NULL,\
 						 MTX_DEF)
 # define	MUTEX_DESTROY(x)	mtx_destroy(&(x)->ipf_lk)
 # define	MUTEX_NUKE(x)		bzero((x), sizeof(*(x)))
 /*
  * Whilst the sx(9) locks on FreeBSD have the right semantics and interface
  * for what we want to use them for, despite testing showing they work -
  * with a WITNESS kernel, it generates LOR messages.
  */
 # include <machine/atomic.h>
 # define	ATOMIC_INC(x)		{ mtx_lock(&softc->ipf_rw.ipf_lk); (x)++; \
 					  mtx_unlock(&softc->ipf_rw.ipf_lk); }
 # define	ATOMIC_DEC(x)		{ mtx_lock(&softc->ipf_rw.ipf_lk); (x)--; \
 					  mtx_unlock(&softc->ipf_rw.ipf_lk); }
 # define	ATOMIC_INCL(x)		atomic_add_long(&(x), 1)
 # define	ATOMIC_INC64(x)		ATOMIC_INC(x)
 # define	ATOMIC_INC32(x)		atomic_add_32((u_int *)&(x), 1)
 # define	ATOMIC_DECL(x)		atomic_add_long(&(x), -1)
 # define	ATOMIC_DEC64(x)		ATOMIC_DEC(x)
 # define	ATOMIC_DEC32(x)		atomic_add_32((u_int *)&(x), -1)
 # define	SPL_X(x)	;
 # define	SPL_NET(x)	;
 # define	SPL_IMP(x)	;
 # define	SPL_SCHED(x)	;
 # define	GET_MINOR		dev2unit
 # define	MSGDSIZE(m)	mbufchainlen(m)
 # define	M_LEN(m)	(m)->m_len
 # define	M_ADJ(m,x)	m_adj(m, x)
 # define	M_COPY(x)	m_copym((x), 0, M_COPYALL, M_NOWAIT)
 # define	M_DUP(m)	m_dup(m, M_NOWAIT)
 # define	IPF_PANIC(x,y)	if (x) { printf y; panic("ipf_panic"); }
 typedef struct mbuf mb_t;
 
 #else	/* !_KERNEL */
 # ifndef _NET_IF_VAR_H_
 /*
  * Userland emulation of struct ifnet.
  *
  * Compiled only for !_KERNEL builds in which <net/if_var.h> has not been
  * included (_NET_IF_VAR_H_ is undefined).  Only the members declared below
  * exist in this emulation.
  */
 struct route;
 struct mbuf;
 struct ifnet {
 	/* Interface name; this is what IFNAME() and COPYIFNAME() read. */
 	char			if_xname[IFNAMSIZ];
 	/* Head of a singly-linked tail queue of struct ifaddr. */
 	STAILQ_HEAD(, ifaddr)	if_addrlist;
 	/* Output hook taking (ifnet, mbuf, sockaddr, route), returning int. */
 	int	(*if_output)(struct ifnet *, struct mbuf *,
 	    const struct sockaddr *, struct route *);
 };
 # endif /* _NET_IF_VAR_H_ */
 #endif /* _KERNEL */
 
 #define	IFNAME(x)	((struct ifnet *)x)->if_xname
 /*
  * Copy the if_xname of interface x into buffer b using strncpy() bounded
  * by LIFNAMSIZ; the result is discarded.  The v argument is not used by
  * this definition.  strncpy() does not write a terminating NUL when the
  * source fills all LIFNAMSIZ bytes, so b is only guaranteed to be a C
  * string if the caller ensures termination.
  */
 #define	COPYIFNAME(v, x, b) \
 			(void) strncpy(b, \
 			       ((struct ifnet *)x)->if_xname, \
 			       LIFNAMSIZ)
 
 typedef	u_long		ioctlcmd_t;
 typedef	struct uio	uio_t;
 typedef	int		minor_t;
 typedef	u_int32_t	u_32_t;
 # define	U_32_T	1
 
 
 /* ----------------------------------------------------------------------- */
 /*                           G E N E R I C                                 */
 /* ----------------------------------------------------------------------- */
 
 /*
  * For BSD kernels, if bpf is in the kernel, enable ipfilter to use bpf in
  * filter rules.
  */
 #if !defined(IPFILTER_BPF)
 # if (defined(NBPF) && (NBPF > 0)) || (defined(DEV_BPF) && (DEV_BPF > 0)) || \
      (defined(NBPFILTER) && (NBPFILTER > 0))
 #  define	IPFILTER_BPF
 # endif
 #endif
 
 /*
  * Userland locking primitives
  */
 #ifndef _KERNEL
 # if !defined(KMUTEX_FILL_SZ)
 #  define	KMUTEX_FILL_SZ	1
 # endif
 # if !defined(KRWLOCK_FILL_SZ)
 #  define	KRWLOCK_FILL_SZ	1
 # endif
 #endif
 
 /*
  * State of an emulated mutex.  It is the ipf_emu member of ipfmutex_t and
  * is what the userland MUTEX_* macros hand to the eMmutex_*() functions
  * together with __FILE__ and __LINE__.
  * NOTE(review): the per-field notes below are inferred from the names and
  * from those call signatures - confirm against the eMmutex_*() code.
  */
 typedef	struct	{
 	char	*eMm_owner;	/* presumably the name given at init time */
 	char	*eMm_heldin;	/* presumably the file of the current holder */
 	u_int	eMm_magic;	/* presumably a validity marker */
 	int	eMm_held;	/* presumably the held count/flag */
 	int	eMm_heldat;	/* presumably the line of the current holder */
 } eMmutex_t;
 
 /*
  * State of an emulated read/write lock.  It is the ipf_emu member of
  * ipfrwlock_t and is what the userland READ_ENTER/WRITE_ENTER/RWLOCK_*
  * macros hand to the eMrwlock_*() functions.
  * NOTE(review): the per-field notes below are inferred from the names and
  * from those call signatures - confirm against the eMrwlock_*() code.
  */
 typedef	struct	{
 	char	*eMrw_owner;	/* presumably the name given at init time */
 	char	*eMrw_heldin;	/* presumably the file of the current holder */
 	u_int	eMrw_magic;	/* presumably a validity marker */
 	short	eMrw_read;	/* presumably the reader count */
 	short	eMrw_write;	/* presumably the writer count */
 	int	eMrw_heldat;	/* presumably the line of the current holder */
 } eMrwlock_t;
 
 /*
  * Storage for an ipf mutex.  The union overlays three views:
  *   _fill      - KMUTEX_FILL_SZ bytes, giving the union a minimum size;
  *   ipf_lkun_s - the native lock (KMUTEX_T) and a name pointer, present
  *                only when KMUTEX_T is defined;
  *   ipf_emu    - the emulated mutex state used by the eMmutex_*() calls.
  * The ipf_lk and ipf_lname macros expand to members of ipf_lkun_s.  For the
  * FreeBSD kernel a CTASSERT in this header requires sizeof(ipfmutex_t) to
  * equal KMUTEX_FILL_SZ exactly.
  */
 typedef union {
 	char	_fill[KMUTEX_FILL_SZ];
 #ifdef KMUTEX_T
 	struct	{
 		KMUTEX_T	ipf_slk;
 		const char	*ipf_lname;
 	} ipf_lkun_s;
 #endif
 	eMmutex_t	ipf_emu;
 } ipfmutex_t;
 
 /*
  * Storage for an ipf read/write lock.  The union overlays three views:
  *   _fill      - KRWLOCK_FILL_SZ bytes, giving the union a minimum size;
  *   ipf_lkun_s - the native lock (KRWLOCK_T), a name pointer, two int
  *                counters and a magic word, present only when KRWLOCK_T
  *                is defined;
  *   ipf_emu    - the emulated rwlock state used by the eMrwlock_*() calls.
  * The ipf_lk, ipf_lname, ipf_isr, ipf_isw and ipf_magic macros expand to
  * members of ipf_lkun_s.  For the FreeBSD kernel a CTASSERT in this header
  * requires sizeof(ipfrwlock_t) to equal KRWLOCK_FILL_SZ exactly.
  */
 typedef union {
 	char	_fill[KRWLOCK_FILL_SZ];
 #ifdef KRWLOCK_T
 	struct	{
 		KRWLOCK_T	ipf_slk;
 		const char	*ipf_lname;
 		int		ipf_sr;
 		int		ipf_sw;
 		u_int		ipf_magic;
 	} ipf_lkun_s;
 #endif
 	eMrwlock_t	ipf_emu;
 } ipfrwlock_t;
 
 #define	ipf_lk		ipf_lkun_s.ipf_slk
 #define	ipf_lname	ipf_lkun_s.ipf_lname
 #define	ipf_isr		ipf_lkun_s.ipf_sr
 #define	ipf_isw		ipf_lkun_s.ipf_sw
 #define	ipf_magic	ipf_lkun_s.ipf_magic
 
 #if defined(__FreeBSD__) && defined(_KERNEL)
      CTASSERT(sizeof(ipfrwlock_t) == KRWLOCK_FILL_SZ);
      CTASSERT(sizeof(ipfmutex_t) == KMUTEX_FILL_SZ);
 #endif
 
 
 /*
  * In a non-kernel environment, there are a lot of macros that need to be
  * filled in to be null-ops or to point to some compatibility function,
  * somewhere in userland.
  */
 #ifndef _KERNEL
 /*
  * Userland (!_KERNEL) stand-in for the kernel mbuf.  Directly after this
  * typedef the names m_next, m_len, m_flags and m_data are #define'd to the
  * mb_* members below.
  */
 typedef	struct	mb_s	{
 	struct	mb_s	*mb_next;	/* next buffer; linked by PREP_MB_T() */
 	char		*mb_data;	/* data pointer; what MTOD() returns */
 	void		*mb_ifp;	/* NOTE(review): presumably the interface - confirm */
 	int		mb_len;		/* length; read by M_LEN(), adjusted by M_ADJ() */
 	int		mb_flags;	/* flags; presumably M_MCAST/M_BCAST/M_MBCAST - confirm */
 	u_long		mb_buf[2048];	/* inline array of 2048 u_long */
 } mb_t;
 # undef		m_next
 # define	m_next		mb_next
 # undef		m_len
 # define	m_len		mb_len
 # undef		m_flags
 # define	m_flags		mb_flags
 # undef		m_data
 # define	m_data		mb_data
 # undef		M_MCAST
 # define	M_MCAST		0x01
 # undef		M_BCAST
 # define	M_BCAST		0x02
 # undef		M_MBCAST
 # define	M_MBCAST	0x04
 # define	MSGDSIZE(m)	msgdsize(m)
 # define	M_LEN(m)	(m)->mb_len
 # define	M_ADJ(m,x)	(m)->mb_len += x
 # define	M_COPY(m)	dupmbt(m)
 # define	M_DUP(m)	dupmbt(m)
 # define	GETKTIME(x)	gettimeofday((struct timeval *)(x), NULL)
 # define	MTOD(m, t)	((t)(m)->mb_data)
 # define	FREE_MB_T(m)	freembt(m)
 # define	ALLOC_MB_T(m,l)	(m) = allocmbt(l)
 /*
  * Userland PREP_MB_T(f, m): push buffer m onto the front of the chain that
  * f refers to.  The current head (*f->fin_mp) is linked behind m, m becomes
  * the new head, and f->fin_m is pointed at m.
  *
  * All accesses go through the macro parameter f.  The head used to be
  * stored through a variable literally named fin, which only worked when
  * the caller's argument happened to have that name.
  */
 # define	PREP_MB_T(f, m)	do { \
 						(m)->mb_next = *(f)->fin_mp; \
 						*(f)->fin_mp = (m); \
 						(f)->fin_m = (m); \
 					} while (0)
 # define	SLEEP(x,y)	1;
 # define	WAKEUP(x,y)	;
 # define	POLLWAKEUP(y)	;
 # define	IPF_PANIC(x,y)	;
 # define	PANIC(x,y)	;
 # define	SPL_SCHED(x)	;
 # define	SPL_NET(x)	;
 # define	SPL_IMP(x)	;
 # define	SPL_X(x)	;
 # define	KMALLOC(a,b)	(a) = (b)malloc(sizeof(*a))
 # define	KMALLOCS(a,b,c)	(a) = (b)malloc(c)
 # define	KFREE(x)	free(x)
 # define	KFREES(x,s)	free(x)
 # define	GETIFP(x, v)	get_unit(x,v)
 # define	GETIFMTU_4(x)	2048
 # define	GETIFMTU_6(x)	2048
 # define	COPYIN(a,b,c)	bcopywrap((a), (b), (c))
 # define	COPYOUT(a,b,c)	bcopywrap((a), (b), (c))
 # define	COPYDATA(m, o, l, b)	bcopy(MTOD((mb_t *)m, char *) + (o), \
 					      (b), (l))
 # define	COPYBACK(m, o, l, b)	bcopy((b), \
 					      MTOD((mb_t *)m, char *) + (o), \
 					      (l))
 # define	UIOMOVE(a,b,c,d)	ipfuiomove((caddr_t)a,b,c,d)
 extern	void	m_copydata(mb_t *, int, int, caddr_t);
 extern	int	ipfuiomove(caddr_t, int, int, struct uio *);
 extern	int	bcopywrap(void *, void *, size_t);
 extern	mb_t	*allocmbt(size_t);
 extern	mb_t	*dupmbt(mb_t *);
 extern	void	freembt(mb_t *);
 
 # define	MUTEX_DESTROY(x)	eMmutex_destroy(&(x)->ipf_emu, \
 							__FILE__, __LINE__)
 # define	MUTEX_ENTER(x)		eMmutex_enter(&(x)->ipf_emu, \
 						      __FILE__, __LINE__)
 # define	MUTEX_EXIT(x)		eMmutex_exit(&(x)->ipf_emu, \
 						     __FILE__, __LINE__)
 # define	MUTEX_INIT(x,y)		eMmutex_init(&(x)->ipf_emu, y, \
 						     __FILE__, __LINE__)
 # define	MUTEX_NUKE(x)		bzero((x), sizeof(*(x)))
 
 # define	MUTEX_DOWNGRADE(x)	eMrwlock_downgrade(&(x)->ipf_emu, \
 							   __FILE__, __LINE__)
 # define	MUTEX_TRY_UPGRADE(x)	eMrwlock_try_upgrade(&(x)->ipf_emu, \
 							   __FILE__, __LINE__)
 # define	READ_ENTER(x)		eMrwlock_read_enter(&(x)->ipf_emu, \
 							    __FILE__, __LINE__)
 # define	RWLOCK_INIT(x, y)	eMrwlock_init(&(x)->ipf_emu, y)
 # define	RWLOCK_EXIT(x)		eMrwlock_exit(&(x)->ipf_emu)
 # define	RW_DESTROY(x)		eMrwlock_destroy(&(x)->ipf_emu)
 # define	WRITE_ENTER(x)		eMrwlock_write_enter(&(x)->ipf_emu, \
 							     __FILE__, \
 							     __LINE__)
 
 # define	USE_MUTEXES		1
 
 extern void eMmutex_destroy(eMmutex_t *, char *, int);
 extern void eMmutex_enter(eMmutex_t *, char *, int);
 extern void eMmutex_exit(eMmutex_t *, char *, int);
 extern void eMmutex_init(eMmutex_t *, char *, char *, int);
 extern void eMrwlock_destroy(eMrwlock_t *);
 extern void eMrwlock_exit(eMrwlock_t *);
 extern void eMrwlock_init(eMrwlock_t *, char *);
 extern void eMrwlock_read_enter(eMrwlock_t *, char *, int);
 extern void eMrwlock_write_enter(eMrwlock_t *, char *, int);
 extern void eMrwlock_downgrade(eMrwlock_t *, char *, int);
 
 #endif
 
 extern	mb_t	*allocmbt(size_t);
 
 #define	MAX_IPV4HDR	((0xf << 2) + sizeof(struct icmp) + sizeof(ip_t) + 8)
 
 #ifndef	IP_OFFMASK
 # define	IP_OFFMASK	0x1fff
 #endif
 
 
 /*
  * On BSD's use quad_t as a guarantee for getting at least a 64bit sized
  * object.
  */
 #if !defined(__amd64__) && !SOLARIS
 # define	USE_QUAD_T
 # define	U_QUAD_T	unsigned long long
 # define	QUAD_T		long long
 #else /* __amd64__ || SOLARIS */
 # if !defined(U_QUAD_T)
 #  define	U_QUAD_T	u_long
 #  define	QUAD_T		long
 # endif
 #endif /* !__amd64__ && !SOLARIS */
 
 
 #ifdef	USE_INET6
 # if defined(__NetBSD__) || defined(__FreeBSD__)
 #  include <netinet/ip6.h>
 #  include <netinet/icmp6.h>
 #   if defined(_KERNEL)
 #    include <netinet6/ip6_var.h>
 #   endif
 typedef	struct ip6_hdr	ip6_t;
 # endif
 #endif
 
 #ifndef	MAX
 # define	MAX(a,b)	(((a) > (b)) ? (a) : (b))
 #endif
 
 #if defined(_KERNEL)
 # if SOLARIS && !defined(INSTANCES)
 #  define	COPYDATA	mb_copydata
 #  define	COPYBACK	mb_copyback
 # else
 #  define	COPYDATA	m_copydata
 #  define	COPYBACK	m_copyback
 # endif
 # if (defined(__NetBSD_Version__) && (__NetBSD_Version__ < 105180000)) || \
       defined(__FreeBSD__)
 #  include <vm/vm.h>
 # endif
 # if NETBSD_GE_REV(105180000)
 #  include <uvm/uvm_extern.h>
 # else
 #  include <vm/vm_extern.h>
 extern  vm_map_t        kmem_map;
 # endif
 # include <sys/proc.h>
 
 # ifdef IPFILTER_M_IPFILTER
 #  include <sys/malloc.h>
 MALLOC_DECLARE(M_IPFILTER);
 #  define	_M_IPF		M_IPFILTER
 # else /* IPFILTER_M_IPFILTER */
 #  ifdef M_PFIL
 #   define	_M_IPF		M_PFIL
 #  else
 #   ifdef M_IPFILTER
 #    define	_M_IPF		M_IPFILTER
 #   else
 #    define	_M_IPF		M_TEMP
 #   endif /* M_IPFILTER */
 #  endif /* M_PFIL */
 # endif /* IPFILTER_M_IPFILTER */
 # if !defined(KMALLOC)
 #  define	KMALLOC(a, b)		(a) = (b)malloc(sizeof(*(a)), _M_IPF, M_NOWAIT)
 # endif
 # if !defined(KMALLOCS)
 #  define	KMALLOCS(a, b, c)	(a) = (b)malloc((c), _M_IPF, M_NOWAIT)
 # endif
 # if !defined(KFREE)
 #  define	KFREE(x)	free((x), _M_IPF)
 # endif
 # if !defined(KFREES)
 #  define	KFREES(x,s)	free((x), _M_IPF)
 # endif
 # define	UIOMOVE(a,b,c,d)	uiomove((caddr_t)a,b,d)
 # define	SLEEP(id, n)	tsleep((id), PPAUSE|PCATCH, n, 0)
 # define	WAKEUP(id,x)	wakeup(id+x)
 # if !defined(POLLWAKEUP)
 #  define	POLLWAKEUP(x)	selwakeup(softc->ipf_selwait+x)
 # endif
 # define	GETIFP(n, v)	ifunit(n)
 # define	GETIFMTU_4(x)	((struct ifnet *)x)->if_mtu
 # define	GETIFMTU_6(x)	((struct ifnet *)x)->if_mtu
 
 # if !defined(USE_MUTEXES) && !defined(SPL_NET)
 #  define	SPL_IMP(x)	x = splimp()
 #  define	SPL_NET(x)	x = splnet()
 #  if !defined(SPL_SCHED)
 #   define	SPL_SCHED(x)	x = splsched()
 #  endif
 #  define	SPL_X(x)	(void) splx(x)
 # endif /* !USE_MUTEXES */
 
 # ifndef FREE_MB_T
 #  define	FREE_MB_T(m)	m_freem(m)
 # endif
 # ifndef ALLOC_MB_T
 /*
  * Default ALLOC_MB_T(m, l): obtain one mbuf into m with M_NOWAIT and type
  * MT_HEADER, using MGETHDR when that macro exists and MGET otherwise.  On
  * success (m != NULL) both m_len and m_pkthdr.len are set to l; on failure
  * m is left NULL and nothing else is touched.  Note that the MGET variant
  * also stores into m_pkthdr.len.
  */
 #  ifdef MGETHDR
 #   define	ALLOC_MB_T(m,l)	do { \
 					MGETHDR((m), M_NOWAIT, MT_HEADER); \
 					if ((m) != NULL) { \
 						(m)->m_len = (l); \
 						(m)->m_pkthdr.len = (l); \
 					} \
 				} while (0)
 #  else
 #   define	ALLOC_MB_T(m,l)	do { \
 					MGET((m), M_NOWAIT, MT_HEADER); \
 					if ((m) != NULL) { \
 						(m)->m_len = (l); \
 						(m)->m_pkthdr.len = (l); \
 					} \
 				} while (0)
 #  endif
 # endif
 # ifndef PREP_MB_T
 /*
  * Kernel PREP_MB_T(f, m): push mbuf m onto the front of the chain that f
  * refers to.  The previous head (*f->fin_mp) is linked behind m and m
  * becomes the new head.  If the previous head has M_PKTHDR set, its packet
  * length is added to m's and its rcvif is copied into m.
  *
  * All accesses go through the macro parameter f.  The head used to be
  * stored through a variable literally named fin, which only worked when
  * the caller's argument happened to have that name.
  */
 #  define	PREP_MB_T(f, m)	do { \
 						mb_t *_o = *(f)->fin_mp; \
 						(m)->m_next = _o; \
 						*(f)->fin_mp = (m); \
 						if (_o->m_flags & M_PKTHDR) { \
 							(m)->m_pkthdr.len += \
 							    _o->m_pkthdr.len; \
 							(m)->m_pkthdr.rcvif = \
 							  _o->m_pkthdr.rcvif; \
 						} \
 					} while (0)
 # endif
 # ifndef M_DUP
 #  ifdef M_COPYALL
 #   define	M_DUP(m)	m_dup(m, 0, M_COPYALL, 0)
 #  else
 #   define	M_DUP(m)	m_dup(m)
 #  endif
 # endif
 
 # ifndef MTOD
 #  define	MTOD(m,t)	mtod(m,t)
 # endif
 
 # ifndef COPYIN
 #  define	COPYIN(a,b,c)	(bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0)
 #  define	COPYOUT(a,b,c)	(bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0)
 # endif
 
 # if SOLARIS && !defined(KMALLOC)
 #  define	KMALLOC(a,b)	(a) = (b)new_kmem_alloc(sizeof(*(a)), \
 							KMEM_NOSLEEP)
 #  define	KMALLOCS(a,b,c)	(a) = (b)new_kmem_alloc((c), KMEM_NOSLEEP)
 # endif
 
 # ifndef	GET_MINOR
 #  define	GET_MINOR(x)	dev2unit(x)
 # endif
 # define	PANIC(x,y)	if (x) panic y
 #endif /* _KERNEL */
 
 #if !defined(IFNAME) && !defined(_KERNEL)
 # define	IFNAME(x)	get_ifname((struct ifnet *)x)
 #endif
 #ifndef	COPYIFNAME
 # define	NEED_FRGETIFNAME
 extern	char	*ipf_getifname(struct ifnet *, char *);
 # define	COPYIFNAME(v, x, b) \
 				ipf_getifname((struct ifnet *)x, b)
 #endif
 
 #ifndef ASSERT
 # ifdef _KERNEL
 #  define	ASSERT(x)
 # else
 #  define	ASSERT(x)	do { if (!(x)) abort(); } while (0)
 # endif
 #endif
 
 #ifndef BCOPYIN
 #  define	BCOPYIN(a,b,c)	(bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0)
 #  define	BCOPYOUT(a,b,c)	(bcopy((caddr_t)(a), (caddr_t)(b), (c)), 0)
 #endif
 
 /*
  * Because the ctype(3) posix definition, if used "safely" in code everywhere,
  * would mean all normal code that walks through strings needed casts.  Yuck.
  */
 #define	ISALNUM(x)	isalnum((u_char)(x))
 #define	ISALPHA(x)	isalpha((u_char)(x))
 #define	ISDIGIT(x)	isdigit((u_char)(x))
 #define	ISSPACE(x)	isspace((u_char)(x))
 #define	ISUPPER(x)	isupper((u_char)(x))
 #define	ISXDIGIT(x)	isxdigit((u_char)(x))
 #define	ISLOWER(x)	islower((u_char)(x))
 #define	TOUPPER(x)	toupper((u_char)(x))
 #define	TOLOWER(x)	tolower((u_char)(x))
 
 /*
  * If mutexes aren't being used, turn all the mutex functions into null-ops.
  */
 #if !defined(USE_MUTEXES)
 # define	USE_SPL			1
 # undef		RW_DESTROY
 # undef		MUTEX_INIT
 # undef		MUTEX_NUKE
 # undef		MUTEX_DESTROY
 # define	MUTEX_ENTER(x)		;
 # define	READ_ENTER(x)		;
 # define	WRITE_ENTER(x)		;
 # define	MUTEX_DOWNGRADE(x)	;
 # define	MUTEX_TRY_UPGRADE(x)	;
 # define	RWLOCK_INIT(x, y)	;
 # define	RWLOCK_EXIT(x)		;
 # define	RW_DESTROY(x)		;
 # define	MUTEX_EXIT(x)		;
 # define	MUTEX_INIT(x,y)		;
 # define	MUTEX_DESTROY(x)	;
 # define	MUTEX_NUKE(x)		;
 #endif /* !USE_MUTEXES */
 #ifndef	ATOMIC_INC
 # define	ATOMIC_INC(x)		(x)++
 # define	ATOMIC_DEC(x)		(x)--
 #endif
 
 #if defined(USE_SPL) && defined(_KERNEL)
 # define	SPL_INT(x)	int x
 #else
 # define	SPL_INT(x)
 #endif
 
 /*
  * If there are no atomic operations for bit sizes defined, define them to all
  * use a generic one that works for all sizes.
  */
 #ifndef	ATOMIC_INCL
 # define	ATOMIC_INCL		ATOMIC_INC
 # define	ATOMIC_INC64		ATOMIC_INC
 # define	ATOMIC_INC32		ATOMIC_INC
 # define	ATOMIC_DECL		ATOMIC_DEC
 # define	ATOMIC_DEC64		ATOMIC_DEC
 # define	ATOMIC_DEC32		ATOMIC_DEC
 #endif
 
 #ifndef HDR_T_PRIVATE
 typedef	struct	tcphdr	tcphdr_t;
 typedef	struct	udphdr	udphdr_t;
 #endif
 typedef	struct	icmp	icmphdr_t;
 typedef	struct	ip	ip_t;
 typedef	struct	ether_header	ether_header_t;
 typedef	struct	tcpiphdr	tcpiphdr_t;
 
 #ifndef	FR_GROUPLEN
 # define	FR_GROUPLEN	16
 #endif
 
 #ifndef offsetof
 # define offsetof(t,m) (size_t)((&((t *)0L)->m))
 #endif
 #ifndef stsizeof
 # define stsizeof(t,m)	sizeof(((t *)0L)->m)
 #endif
 
 /*
  * This set of macros has been brought about because on Tru64 it is not
  * possible to easily assign or examine values in a structure that are
  * bit fields.
  */
 #ifndef IP_V
 # define	IP_V(x)		(x)->ip_v
 #endif
 #ifndef	IP_V_A
 # define	IP_V_A(x,y)	(x)->ip_v = (y)
 #endif
 #ifndef	IP_HL
 # define	IP_HL(x)	(x)->ip_hl
 #endif
 #ifndef	IP_HL_A
 # define	IP_HL_A(x,y)	(x)->ip_hl = ((y) & 0xf)
 #endif
 #ifndef	TCP_X2
 # define	TCP_X2(x)	(x)->th_x2
 #endif
 #ifndef	TCP_X2_A
 # define	TCP_X2_A(x,y)	(x)->th_x2 = (y)
 #endif
 #ifndef	TCP_OFF
 # define	TCP_OFF(x)	(x)->th_off
 #endif
 #ifndef	TCP_OFF_A
 # define	TCP_OFF_A(x,y)	(x)->th_off = (y)
 #endif
 #define	IPMINLEN(i, h)	((i)->ip_len >= (IP_HL(i) * 4 + sizeof(struct h)))
 
 #define	TCPF_ALL	(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|\
 			 TH_ECN|TH_CWR)
 
 #if !SOLARIS && !defined(m_act)
 # define	m_act	m_nextpkt
 #endif
 
 /*
  * Security Options for Internet Protocol (IPSO) as defined in RFC 1108.
  *
  * Basic Option
  *
  * 00000001   -   (Reserved 4)
  * 00111101   -   Top Secret
  * 01011010   -   Secret
  * 10010110   -   Confidential
  * 01100110   -   (Reserved 3)
  * 11001100   -   (Reserved 2)
  * 10101011   -   Unclassified
  * 11110001   -   (Reserved 1)
  */
 #define	IPSO_CLASS_RES4		0x01
 #define	IPSO_CLASS_TOPS		0x3d
 #define	IPSO_CLASS_SECR		0x5a
 #define	IPSO_CLASS_CONF		0x96
 #define	IPSO_CLASS_RES3		0x66
 #define	IPSO_CLASS_RES2		0xcc
 #define	IPSO_CLASS_UNCL		0xab
 #define	IPSO_CLASS_RES1		0xf1
 
 #define	IPSO_AUTH_GENSER	0x80
 #define	IPSO_AUTH_ESI		0x40
 #define	IPSO_AUTH_SCI		0x20
 #define	IPSO_AUTH_NSA		0x10
 #define	IPSO_AUTH_DOE		0x08
 #define	IPSO_AUTH_UN		0x06
 #define	IPSO_AUTH_FTE		0x01
 
 /*
  * IP option #defines
  */
 #undef	IPOPT_RR
 #define	IPOPT_RR	7
 #undef	IPOPT_ZSU
 #define	IPOPT_ZSU	10	/* ZSU */
 #undef	IPOPT_MTUP
 #define	IPOPT_MTUP	11	/* MTUP */
 #undef	IPOPT_MTUR
 #define	IPOPT_MTUR	12	/* MTUR */
 #undef	IPOPT_ENCODE
 #define	IPOPT_ENCODE	15	/* ENCODE */
 #undef	IPOPT_TS
 #define	IPOPT_TS	68
 #undef	IPOPT_TR
 #define	IPOPT_TR	82	/* TR */
 #undef	IPOPT_SECURITY
 #define	IPOPT_SECURITY	130
 #undef	IPOPT_LSRR
 #define	IPOPT_LSRR	131
 #undef	IPOPT_E_SEC
 #define	IPOPT_E_SEC	133	/* E-SEC */
 #undef	IPOPT_CIPSO
 #define	IPOPT_CIPSO	134	/* CIPSO */
 #undef	IPOPT_SATID
 #define	IPOPT_SATID	136
 #ifndef	IPOPT_SID
 # define	IPOPT_SID	IPOPT_SATID
 #endif
 #undef	IPOPT_SSRR
 #define	IPOPT_SSRR	137
 #undef	IPOPT_ADDEXT
 #define	IPOPT_ADDEXT	147	/* ADDEXT */
 #undef	IPOPT_VISA
 #define	IPOPT_VISA	142	/* VISA */
 #undef	IPOPT_IMITD
 #define	IPOPT_IMITD	144	/* IMITD */
 #undef	IPOPT_EIP
 #define	IPOPT_EIP	145	/* EIP */
 #undef	IPOPT_RTRALRT
 #define	IPOPT_RTRALRT	148	/* RTRALRT */
 #undef	IPOPT_SDB
 #define	IPOPT_SDB	149
 #undef	IPOPT_NSAPA
 #define	IPOPT_NSAPA	150
 #undef	IPOPT_DPS
 #define	IPOPT_DPS	151
 #undef	IPOPT_UMP
 #define	IPOPT_UMP	152
 #undef	IPOPT_FINN
 #define	IPOPT_FINN	205	/* FINN */
 #undef	IPOPT_AH
 #define	IPOPT_AH	256+IPPROTO_AH
 
 #define	ICMP_UNREACH_ADMIN_PROHIBIT	ICMP_UNREACH_FILTER_PROHIB
 #define	ICMP_UNREACH_FILTER	ICMP_UNREACH_FILTER_PROHIB
 
 #ifndef	IPVERSION
 # define	IPVERSION	4
 #endif
 #ifndef	IPOPT_MINOFF
 # define	IPOPT_MINOFF	4
 #endif
 #ifndef	IPOPT_COPIED
 # define	IPOPT_COPIED(x)	((x)&0x80)
 #endif
 #ifndef	IPOPT_EOL
 # define	IPOPT_EOL	0
 #endif
 #ifndef	IPOPT_NOP
 # define	IPOPT_NOP	1
 #endif
 #ifndef	IP_MF
 # define	IP_MF	((u_short)0x2000)
 #endif
 #ifndef	ETHERTYPE_IP
 # define	ETHERTYPE_IP	((u_short)0x0800)
 #endif
 #ifndef	TH_FIN
 # define	TH_FIN	0x01
 #endif
 #ifndef	TH_SYN
 # define	TH_SYN	0x02
 #endif
 #ifndef	TH_RST
 # define	TH_RST	0x04
 #endif
 #ifndef	TH_PUSH
 # define	TH_PUSH	0x08
 #endif
 #ifndef	TH_ACK
 # define	TH_ACK	0x10
 #endif
 #ifndef	TH_URG
 # define	TH_URG	0x20
 #endif
 #undef	TH_ACKMASK
 #define	TH_ACKMASK	(TH_FIN|TH_SYN|TH_RST|TH_ACK)
 
 #ifndef	IPOPT_EOL
 # define	IPOPT_EOL	0
 #endif
 #ifndef	IPOPT_NOP
 # define	IPOPT_NOP	1
 #endif
 #ifndef	IPOPT_RR
 # define	IPOPT_RR	7
 #endif
 #ifndef	IPOPT_TS
 # define	IPOPT_TS	68
 #endif
 #ifndef	IPOPT_SECURITY
 # define	IPOPT_SECURITY	130
 #endif
 #ifndef	IPOPT_LSRR
 # define	IPOPT_LSRR	131
 #endif
 #ifndef	IPOPT_SATID
 # define	IPOPT_SATID	136
 #endif
 #ifndef	IPOPT_SSRR
 # define	IPOPT_SSRR	137
 #endif
 #ifndef	IPOPT_SECUR_UNCLASS
 # define	IPOPT_SECUR_UNCLASS	((u_short)0x0000)
 #endif
 #ifndef	IPOPT_SECUR_CONFID
 # define	IPOPT_SECUR_CONFID	((u_short)0xf135)
 #endif
 #ifndef	IPOPT_SECUR_EFTO
 # define	IPOPT_SECUR_EFTO	((u_short)0x789a)
 #endif
 #ifndef	IPOPT_SECUR_MMMM
 # define	IPOPT_SECUR_MMMM	((u_short)0xbc4d)
 #endif
 #ifndef	IPOPT_SECUR_RESTR
 # define	IPOPT_SECUR_RESTR	((u_short)0xaf13)
 #endif
 #ifndef	IPOPT_SECUR_SECRET
 # define	IPOPT_SECUR_SECRET	((u_short)0xd788)
 #endif
 #ifndef IPOPT_SECUR_TOPSECRET
 # define	IPOPT_SECUR_TOPSECRET	((u_short)0x6bc5)
 #endif
 #ifndef IPOPT_OLEN
 # define	IPOPT_OLEN	1
 #endif
 #ifndef	IPPROTO_HOPOPTS
 # define	IPPROTO_HOPOPTS	0
 #endif
 #ifndef	IPPROTO_IPIP
 # define	IPPROTO_IPIP	4
 #endif
 #ifndef	IPPROTO_ENCAP
 # define	IPPROTO_ENCAP	98
 #endif
 #ifndef	IPPROTO_IPV6
 # define	IPPROTO_IPV6	41
 #endif
 #ifndef	IPPROTO_ROUTING
 # define	IPPROTO_ROUTING	43
 #endif
 #ifndef	IPPROTO_FRAGMENT
 # define	IPPROTO_FRAGMENT	44
 #endif
 #ifndef	IPPROTO_GRE
 # define	IPPROTO_GRE	47	/* GRE encaps RFC 1701 */
 #endif
 #ifndef	IPPROTO_ESP
 # define	IPPROTO_ESP	50
 #endif
 #ifndef	IPPROTO_AH
 # define	IPPROTO_AH	51
 #endif
 #ifndef	IPPROTO_ICMPV6
 # define	IPPROTO_ICMPV6	58
 #endif
 #ifndef	IPPROTO_NONE
 # define	IPPROTO_NONE	59
 #endif
 #ifndef	IPPROTO_DSTOPTS
 # define	IPPROTO_DSTOPTS	60
 #endif
 #ifndef	IPPROTO_MOBILITY
 # define	IPPROTO_MOBILITY	135
 #endif
 
 #ifndef	ICMP_ROUTERADVERT
 # define	ICMP_ROUTERADVERT	9
 #endif
 #ifndef	ICMP_ROUTERSOLICIT
 # define	ICMP_ROUTERSOLICIT	10
 #endif
 #ifndef	ICMP6_DST_UNREACH
 # define	ICMP6_DST_UNREACH	1
 #endif
 #ifndef	ICMP6_PACKET_TOO_BIG
 # define	ICMP6_PACKET_TOO_BIG	2
 #endif
 #ifndef	ICMP6_TIME_EXCEEDED
 # define	ICMP6_TIME_EXCEEDED	3
 #endif
 #ifndef	ICMP6_PARAM_PROB
 # define	ICMP6_PARAM_PROB	4
 #endif
 
 #ifndef	ICMP6_ECHO_REQUEST
 # define	ICMP6_ECHO_REQUEST	128
 #endif
 #ifndef	ICMP6_ECHO_REPLY
 # define	ICMP6_ECHO_REPLY	129
 #endif
 #ifndef	ICMP6_MEMBERSHIP_QUERY
 # define	ICMP6_MEMBERSHIP_QUERY	130
 #endif
 #ifndef	MLD6_LISTENER_QUERY
 # define	MLD6_LISTENER_QUERY	130
 #endif
 #ifndef	ICMP6_MEMBERSHIP_REPORT
 # define	ICMP6_MEMBERSHIP_REPORT	131
 #endif
 #ifndef	MLD6_LISTENER_REPORT
 # define	MLD6_LISTENER_REPORT	131
 #endif
 #ifndef	ICMP6_MEMBERSHIP_REDUCTION
 # define	ICMP6_MEMBERSHIP_REDUCTION	132
 #endif
 #ifndef	MLD6_LISTENER_DONE
 # define	MLD6_LISTENER_DONE	132
 #endif
 #ifndef	ND_ROUTER_SOLICIT
 # define	ND_ROUTER_SOLICIT	133
 #endif
 #ifndef	ND_ROUTER_ADVERT
 # define	ND_ROUTER_ADVERT	134
 #endif
 #ifndef	ND_NEIGHBOR_SOLICIT
 # define	ND_NEIGHBOR_SOLICIT	135
 #endif
 #ifndef	ND_NEIGHBOR_ADVERT
 # define	ND_NEIGHBOR_ADVERT	136
 #endif
 #ifndef	ND_REDIRECT
 # define	ND_REDIRECT	137
 #endif
 #ifndef	ICMP6_ROUTER_RENUMBERING
 # define	ICMP6_ROUTER_RENUMBERING	138
 #endif
 #ifndef	ICMP6_WRUREQUEST
 # define	ICMP6_WRUREQUEST	139
 #endif
 #ifndef	ICMP6_WRUREPLY
 # define	ICMP6_WRUREPLY		140
 #endif
 #ifndef	ICMP6_FQDN_QUERY
 # define	ICMP6_FQDN_QUERY	139
 #endif
 #ifndef	ICMP6_FQDN_REPLY
 # define	ICMP6_FQDN_REPLY	140
 #endif
 #ifndef	ICMP6_NI_QUERY
 # define	ICMP6_NI_QUERY		139
 #endif
 #ifndef	ICMP6_NI_REPLY
 # define	ICMP6_NI_REPLY		140
 #endif
 #ifndef	MLD6_MTRACE_RESP
 # define	MLD6_MTRACE_RESP	200
 #endif
 #ifndef	MLD6_MTRACE
 # define	MLD6_MTRACE		201
 #endif
 #ifndef	ICMP6_HADISCOV_REQUEST
 # define	ICMP6_HADISCOV_REQUEST	202
 #endif
 #ifndef	ICMP6_HADISCOV_REPLY
 # define	ICMP6_HADISCOV_REPLY	203
 #endif
 #ifndef	ICMP6_MOBILEPREFIX_SOLICIT
 # define	ICMP6_MOBILEPREFIX_SOLICIT	204
 #endif
 #ifndef	ICMP6_MOBILEPREFIX_ADVERT
 # define	ICMP6_MOBILEPREFIX_ADVERT	205
 #endif
 #ifndef	ICMP6_MAXTYPE
 # define	ICMP6_MAXTYPE		205
 #endif
 
 #ifndef	ICMP6_DST_UNREACH_NOROUTE
 # define	ICMP6_DST_UNREACH_NOROUTE	0
 #endif
 #ifndef	ICMP6_DST_UNREACH_ADMIN
 # define	ICMP6_DST_UNREACH_ADMIN		1
 #endif
 #ifndef	ICMP6_DST_UNREACH_NOTNEIGHBOR
 # define	ICMP6_DST_UNREACH_NOTNEIGHBOR	2
 #endif
 #ifndef	ICMP6_DST_UNREACH_BEYONDSCOPE
 # define	ICMP6_DST_UNREACH_BEYONDSCOPE	2
 #endif
 #ifndef	ICMP6_DST_UNREACH_ADDR
 # define	ICMP6_DST_UNREACH_ADDR		3
 #endif
 #ifndef	ICMP6_DST_UNREACH_NOPORT
 # define	ICMP6_DST_UNREACH_NOPORT	4
 #endif
 #ifndef	ICMP6_TIME_EXCEED_TRANSIT
 # define	ICMP6_TIME_EXCEED_TRANSIT	0
 #endif
 #ifndef	ICMP6_TIME_EXCEED_REASSEMBLY
 # define	ICMP6_TIME_EXCEED_REASSEMBLY	1
 #endif
 
 #ifndef	ICMP6_NI_SUCCESS
 # define	ICMP6_NI_SUCCESS	0
 #endif
 #ifndef	ICMP6_NI_REFUSED
 # define	ICMP6_NI_REFUSED	1
 #endif
 #ifndef	ICMP6_NI_UNKNOWN
 # define	ICMP6_NI_UNKNOWN	2
 #endif
 
 #ifndef	ICMP6_ROUTER_RENUMBERING_COMMAND
 # define	ICMP6_ROUTER_RENUMBERING_COMMAND	0
 #endif
 #ifndef	ICMP6_ROUTER_RENUMBERING_RESULT
 # define	ICMP6_ROUTER_RENUMBERING_RESULT	1
 #endif
 #ifndef	ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET
 # define	ICMP6_ROUTER_RENUMBERING_SEQNUM_RESET	255
 #endif
 
 #ifndef	ICMP6_PARAMPROB_HEADER
 # define	ICMP6_PARAMPROB_HEADER	0
 #endif
 #ifndef	ICMP6_PARAMPROB_NEXTHEADER
 # define	ICMP6_PARAMPROB_NEXTHEADER	1
 #endif
 #ifndef	ICMP6_PARAMPROB_OPTION
 # define	ICMP6_PARAMPROB_OPTION	2
 #endif
 
 #ifndef	ICMP6_NI_SUBJ_IPV6
 # define	ICMP6_NI_SUBJ_IPV6	0
 #endif
 #ifndef	ICMP6_NI_SUBJ_FQDN
 # define	ICMP6_NI_SUBJ_FQDN	1
 #endif
 #ifndef	ICMP6_NI_SUBJ_IPV4
 # define	ICMP6_NI_SUBJ_IPV4	2
 #endif
 
 #ifndef	MLD_MTRACE_RESP
 # define	MLD_MTRACE_RESP		200
 #endif
 #ifndef	MLD_MTRACE
 # define	MLD_MTRACE		201
 #endif
 #ifndef	MLD6_MTRACE_RESP
 # define	MLD6_MTRACE_RESP	MLD_MTRACE_RESP
 #endif
 #ifndef	MLD6_MTRACE
 # define	MLD6_MTRACE		MLD_MTRACE
 #endif
 
 #if !defined(IPV6_FLOWINFO_MASK)
 # if (BYTE_ORDER == BIG_ENDIAN) || defined(_BIG_ENDIAN)
 #  define IPV6_FLOWINFO_MASK	0x0fffffff	/* flow info (28 bits) */
 # else
 #  if(BYTE_ORDER == LITTLE_ENDIAN) || !defined(_BIG_ENDIAN)
 #   define IPV6_FLOWINFO_MASK	0xffffff0f	/* flow info (28 bits) */
 #  endif /* LITTLE_ENDIAN */
 # endif
 #endif
 #if !defined(IPV6_FLOWLABEL_MASK)
 # if (BYTE_ORDER == BIG_ENDIAN) || defined(_BIG_ENDIAN)
 #  define IPV6_FLOWLABEL_MASK	0x000fffff	/* flow label (20 bits) */
 # else
 #  if (BYTE_ORDER == LITTLE_ENDIAN) || !defined(_BIG_ENDIAN)
 #   define IPV6_FLOWLABEL_MASK	0xffff0f00	/* flow label (20 bits) */
 #  endif /* LITTLE_ENDIAN */
 # endif
 #endif
 
 /*
  * ECN is a new addition to TCP - RFC 2481
  */
 #ifndef TH_ECN
 # define	TH_ECN	0x40
 #endif
 #ifndef TH_CWR
 # define	TH_CWR	0x80
 #endif
 #define	TH_ECNALL	(TH_ECN|TH_CWR)
 
 /*
  * TCP States
  */
 #define IPF_TCPS_LISTEN		0	/* listening for connection */
 #define IPF_TCPS_SYN_SENT	1	/* active, have sent syn */
 #define IPF_TCPS_SYN_RECEIVED	2	/* have sent and received syn */
 #define IPF_TCPS_HALF_ESTAB	3	/* for connections not fully "up" */
 /* states < IPF_TCPS_ESTABLISHED are those where connections not established */
 #define IPF_TCPS_ESTABLISHED	4	/* established */
 #define IPF_TCPS_CLOSE_WAIT	5	/* rcvd fin, waiting for close */
 /* states > IPF_TCPS_CLOSE_WAIT are those where user has closed */
 #define IPF_TCPS_FIN_WAIT_1	6	/* have closed, sent fin */
 #define IPF_TCPS_CLOSING	7	/* closed xchd FIN; await FIN ACK */
 #define IPF_TCPS_LAST_ACK	8	/* had fin and close; await FIN ACK */
 /* states > IPF_TCPS_CLOSE_WAIT && < IPF_TCPS_FIN_WAIT_2 await ACK of FIN */
 #define IPF_TCPS_FIN_WAIT_2	9	/* have closed, fin is acked */
 #define IPF_TCPS_TIME_WAIT	10	/* in 2*msl quiet wait after close */
 #define IPF_TCPS_CLOSED		11	/* closed */
 #define IPF_TCP_NSTATES		12
 
 #define	TCP_MSL			120
 
 #undef	ICMP_MAX_UNREACH
 #define	ICMP_MAX_UNREACH	14
 #undef	ICMP_MAXTYPE
 #define	ICMP_MAXTYPE		18
 
 #ifndef	LOG_FTP
 # define	LOG_FTP		(11<<3)
 #endif
 #ifndef	LOG_AUTHPRIV
 # define	LOG_AUTHPRIV	(10<<3)
 #endif
 #ifndef	LOG_AUDIT
 # define	LOG_AUDIT	(13<<3)
 #endif
 #ifndef	LOG_NTP
 # define	LOG_NTP		(12<<3)
 #endif
 #ifndef	LOG_SECURITY
 # define	LOG_SECURITY	(13<<3)
 #endif
 #ifndef	LOG_LFMT
 # define	LOG_LFMT	(14<<3)
 #endif
 #ifndef	LOG_CONSOLE
 # define	LOG_CONSOLE	(14<<3)
 #endif
 
 /*
  * ICMP error replies have an IP header (20 bytes), 8 bytes of ICMP data,
  * another IP header and then 64 bits of data, totalling 56.  Of course,
  * the last 64 bits is dependent on that being available.
  */
 #define	ICMPERR_ICMPHLEN	8
 #define	ICMPERR_IPICMPHLEN	(20 + 8)
 #define	ICMPERR_MINPKTLEN	(20 + 8 + 20)
 #define	ICMPERR_MAXPKTLEN	(20 + 8 + 20 + 8)
 #define ICMP6ERR_MINPKTLEN	(40 + 8)
 #define ICMP6ERR_IPICMPHLEN	(40 + 8 + 40)
 
 #ifndef MIN
 # define	MIN(a,b)	(((a)<(b))?(a):(b))
 #endif
 
 #ifdef RESCUE
 # undef IPFILTER_BPF
 #endif
 
 #ifdef IPF_DEBUG
 # define	DPRINT(x)	printf x
 #else
 # define	DPRINT(x)
 #endif
 
 #if defined(DTRACE_PROBE) && defined(_KERNEL)
 # define	DT(_n)			DTRACE_PROBE(_n)
 # define	DT1(_n,_a,_b)		DTRACE_PROBE1(_n,_a,_b)
 # define	DT2(_n,_a,_b,_c,_d)	DTRACE_PROBE2(_n,_a,_b,_c,_d)
 # define	DT3(_n,_a,_b,_c,_d,_e,_f)	\
 					DTRACE_PROBE3(_n,_a,_b,_c,_d,_e,_f)
 # define	DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h) \
 				DTRACE_PROBE4(_n,_a,_b,_c,_d,_e,_f,_g,_h)
 # define	DT5(_n,_a,_b,_c,_d,_e,_f,_g,_h,_i,_j) \
 				DTRACE_PROBE5(_n,_a,_b,_c,_d,_e,_f,_g,_h,_i,_j)
 #else
 # define	DT(_n)
 # define	DT1(_n,_a,_b)
 # define	DT2(_n,_a,_b,_c,_d)
 # define	DT3(_n,_a,_b,_c,_d,_e,_f)
 # define	DT4(_n,_a,_b,_c,_d,_e,_f,_g,_h)
 # define	DT5(_n,_a,_b,_c,_d,_e,_f,_g,_h,_i,_j)
 #endif
 
 /*
  * Leading fields of an IPv6 routing header: four single-byte fields
  * followed by one 32-bit (u_32_t) reserved word.  The structure is declared
  * unconditionally, outside any USE_INET6 guard in this header.
  */
 struct ip6_routing {
 	u_char	ip6r_nxt;	/* next header */
 	u_char	ip6r_len;	/* length in units of 8 octets */
 	u_char	ip6r_type;	/* always zero */
 	u_char	ip6r_segleft;	/* segments left */
 	u_32_t	ip6r_reserved;	/* reserved field */
 };
 
 #endif	/* __IP_COMPAT_H__ */
diff --git a/sys/netpfil/ipfilter/netinet/ip_log.c b/sys/netpfil/ipfilter/netinet/ip_log.c
index 6e384ac44e83..e014931df153 100644
--- a/sys/netpfil/ipfilter/netinet/ip_log.c
+++ b/sys/netpfil/ipfilter/netinet/ip_log.c
@@ -1,860 +1,861 @@
 /*	$FreeBSD$	*/
 
 /*
  * Copyright (C) 2012 by Darren Reed.
  *
  * See the IPFILTER.LICENCE file for details on licencing.
  *
  * $FreeBSD$
  * Id: ip_log.c,v 2.75.2.19 2007/09/09 11:32:06 darrenr Exp $
  */
 #include <sys/param.h>
 #if defined(KERNEL) || defined(_KERNEL)
 # undef KERNEL
 # undef _KERNEL
 # define        KERNEL	1
 # define        _KERNEL	1
 #endif
 #if defined(__FreeBSD__) && !defined(_KERNEL)
 # include <osreldate.h>
 #endif
 #ifndef SOLARIS
 # if defined(sun) && defined(__SVR4)
 #  define	SOLARIS		1
 # else
 #  define	SOLARIS		0
 # endif
 #endif
 #include <sys/errno.h>
 #include <sys/types.h>
 #include <sys/file.h>
 #ifndef _KERNEL
 # include <stdio.h>
 # include <string.h>
 # include <stdlib.h>
 # include <ctype.h>
 # define _KERNEL
 # define KERNEL
 # include <sys/uio.h>
 # undef _KERNEL
 # undef KERNEL
 #endif
 #if defined(__FreeBSD__) && defined(_KERNEL)
 # include <sys/fcntl.h>
 # include <sys/filio.h>
 #else
 # include <sys/ioctl.h>
 #endif
 #include <sys/time.h>
 #if defined(_KERNEL)
 # include <sys/systm.h>
 # if (defined(NetBSD) && (__NetBSD_Version__ >= 104000000))
 #  include <sys/proc.h>
 # endif
 #endif /* _KERNEL */
 # if defined(NetBSD) || defined(__FreeBSD__)
 #  include <sys/dirent.h>
 # include <sys/mbuf.h>
 # include <sys/select.h>
 # endif
 # if defined(__FreeBSD__)
 #  include <sys/selinfo.h>
 # endif
 #if SOLARIS && defined(_KERNEL)
 #  include <sys/filio.h>
 #  include <sys/cred.h>
 #  include <sys/ddi.h>
 #  include <sys/sunddi.h>
 #  include <sys/ksynch.h>
 #  include <sys/kmem.h>
 #  include <sys/mkdev.h>
 #  include <sys/dditypes.h>
 #  include <sys/cmn_err.h>
 #endif /* SOLARIS && _KERNEL */
 # include <sys/protosw.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #ifdef sun
 # include <net/af.h>
 #endif
 #if defined(__FreeBSD__)
 # include <net/if_var.h>
+# include <net/if_private.h>
 #endif
 #include <netinet/in.h>
 # include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/ip_icmp.h>
 #ifdef USE_INET6
 # include <netinet/icmp6.h>
 #endif
 # include <netinet/ip_var.h>
 #ifndef _KERNEL
 # include <syslog.h>
 #endif
 #include "netinet/ip_compat.h"
 #include <netinet/tcpip.h>
 #include "netinet/ip_fil.h"
 #include "netinet/ip_nat.h"
 #include "netinet/ip_frag.h"
 #include "netinet/ip_state.h"
 #include "netinet/ip_auth.h"
 #if defined(__FreeBSD__) || defined(__NetBSD__)
 # include <sys/malloc.h>
 #endif
 /* END OF INCLUDES */
 
 #ifdef	IPFILTER_LOG
 
 /*
  * Per-instance logging state.  Every array is indexed by log unit; the
  * code in this file walks units 0 .. IPL_LOGMAX.
  */
 typedef struct ipf_log_softc_s {
 	/* Held while a unit's queue, counters or reader count are changed. */
 	ipfmutex_t	ipl_mutex[IPL_LOGSIZE];
 # if SOLARIS && defined(_KERNEL)
 	/* Condition variable readers wait on for new records (Solaris). */
 	kcondvar_t	ipl_wait[IPL_LOGSIZE];
 # endif
 	/*
 	 * Where the next record is linked in: the address of iplt[unit]
 	 * when the queue is empty, else of the newest record's ipl_next.
 	 */
 	iplog_t		**iplh[IPL_LOGSIZE];
 	/* Oldest queued record; ipf_log_read() consumes from here. */
 	iplog_t		*iplt[IPL_LOGSIZE];
 	/* Most recently queued record; its count is bumped on a duplicate. */
 	iplog_t		*ipll[IPL_LOGSIZE];
 	/* Records dropped because of the size limit or allocation failure. */
 	u_long		ipl_logfail[IPL_LOGSIZE];
 	/* Records successfully queued. */
 	u_long		ipl_logok[IPL_LOGSIZE];
 	/* Saved packet info of the last logged packet, for suppression. */
 	fr_info_t	ipl_crc[IPL_LOGSIZE];
 	/* Incremented per log attempt; copied into each record's seqnum. */
 	u_32_t		ipl_counter[IPL_LOGSIZE];
 	/* Tuneable "log_suppress": non-zero folds repeated records. */
 	int		ipl_suppress;
 	/* Tuneable "log_all": non-zero lifts the 128 byte body cap. */
 	int		ipl_logall;
 	/* 1 between ipf_log_soft_init() and ipf_log_soft_fini(). */
 	int		ipl_log_init;
 	/* Tuneable "log_size": most bytes that may be queued per unit. */
 	int		ipl_logsize;
 	/* Bytes currently queued. */
 	int		ipl_used[IPL_LOGSIZE];
 	/* Magic number stamped into every record of the unit. */
 	int		ipl_magic[IPL_LOGSIZE];
 	/* This instance's copy of ipf_log_tuneables. */
 	ipftuneable_t	*ipf_log_tune;
 	/* Number of threads currently inside ipf_log_read(). */
 	int		ipl_readers[IPL_LOGSIZE];
 } ipf_log_softc_t;
 
 /*
  * Template of per-unit record magic numbers, copied into ipl_magic[] by
  * ipf_log_soft_create().  Entries 1 and 2 carry the NAT and state magics;
  * all others use the generic IPL_MAGIC.
  * NOTE(review): eight initialisers are listed - assumes IPL_LOGSIZE is 8,
  * TODO confirm.
  */
 static int magic[IPL_LOGSIZE] = { IPL_MAGIC, IPL_MAGIC_NAT, IPL_MAGIC_STATE,
 				  IPL_MAGIC, IPL_MAGIC, IPL_MAGIC,
 				  IPL_MAGIC, IPL_MAGIC };
 
 /*
  * Template for the logging tuneables.  The first member of each entry holds
  * the offset of the field inside ipf_log_softc_t rather than an address;
  * ipf_log_soft_create() hands this table and the context pointer to
  * ipf_tune_array_copy() to build the per-instance copy.
  * NOTE(review): the two numbers after the name look like the permitted
  * minimum and maximum, and the all-NULL entry like a terminator - confirm
  * against the ipftuneable_t definition.
  */
 static ipftuneable_t ipf_log_tuneables[] = {
 	/* log */
 	{ { (void *)offsetof(ipf_log_softc_t, ipl_suppress) },
 		"log_suppress",		0,	1,
 		stsizeof(ipf_log_softc_t, ipl_suppress),
 		0,			NULL,	NULL },
 	{ { (void *)offsetof(ipf_log_softc_t, ipl_logall) },
 		"log_all",		0,	1,
 		stsizeof(ipf_log_softc_t, ipl_logall),
 		0,			NULL,	NULL },
 	{ { (void *)offsetof(ipf_log_softc_t, ipl_logsize) },
 		"log_size",		0,	0x80000,
 		stsizeof(ipf_log_softc_t, ipl_logsize),
 		0,			NULL,	NULL },
 	{ { NULL },		NULL,			0,	0,
 		0,
 		0,			NULL,	NULL }
 };
 
 
 int
 ipf_log_main_load(void)
 {
 	/* The logging code has nothing to set up at load time. */
 	const int status = 0;
 
 	return (status);
 }
 
 
 int
 ipf_log_main_unload(void)
 {
 	/* The logging code has nothing to tear down at unload time. */
 	const int status = 0;
 
 	return (status);
 }
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_soft_create                                         */
 /* Returns:     void * - NULL = failure, else pointer to log context data   */
 /* Parameters:  softc(I) - pointer to soft context main structure           */
 /*                                                                          */
 /* Allocate and zero a log context, copy in the per-unit magic numbers and  */
 /* initialise the per-unit mutexes, then register the tuneables and set     */
 /* their defaults.  The mutexes are set up before any step that can fail    */
 /* because the failure paths call ipf_log_soft_destroy(), which destroys    */
 /* every per-unit mutex.                                                    */
 /* ------------------------------------------------------------------------ */
 void *
 ipf_log_soft_create(ipf_main_softc_t *softc)
 {
 	ipf_log_softc_t *softl;
 	int i;
 
 	KMALLOC(softl, ipf_log_softc_t *);
 	if (softl == NULL)
 		return (NULL);
 
 	bzero((char *)softl, sizeof(*softl));
 	bcopy((char *)magic, (char *)softl->ipl_magic, sizeof(magic));
 
 	/*
 	 * Must come before the first failure path below: the clean-up in
 	 * ipf_log_soft_destroy() calls MUTEX_DESTROY() on each of these.
 	 */
 	for (i = IPL_LOGMAX; i >= 0; i--) {
 		MUTEX_INIT(&softl->ipl_mutex[i], "ipf log mutex");
 	}
 
 	/*
 	 * Build this instance's copy of the tuneables and make it visible
 	 * through the main context.
 	 */
 	softl->ipf_log_tune = ipf_tune_array_copy(softl,
 						  sizeof(ipf_log_tuneables),
 						  ipf_log_tuneables);
 	if (softl->ipf_log_tune == NULL) {
 		ipf_log_soft_destroy(softc, softl);
 		return (NULL);
 	}
 	if (ipf_tune_array_link(softc, softl->ipf_log_tune) == -1) {
 		ipf_log_soft_destroy(softc, softl);
 		return (NULL);
 	}
 
 	/* Defaults for the tuneables and the "not yet initialised" state. */
 	softl->ipl_suppress = 1;
 	softl->ipl_logall = 0;
 	softl->ipl_log_init = 0;
 	softl->ipl_logsize = IPFILTER_LOGSIZE;
 
 	return (softl);
 }
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_soft_init                                           */
 /* Returns:     int - 0 == success (always returned)                        */
 /* Parameters:  softc(I) - pointer to soft context main structure           */
 /*              arg(I)   - pointer to log context structure                 */
 /*                                                                          */
 /* Reset every log unit to an empty queue, clear the saved packet info used */
 /* for duplicate suppression and mark the log context as initialised.       */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_soft_init(ipf_main_softc_t *softc, void *arg)
 {
 	ipf_log_softc_t *softl = arg;
 	int unit;
 
 	for (unit = 0; unit <= IPL_LOGMAX; unit++) {
 		/* Empty queue: the link-in point is the head pointer itself. */
 		softl->iplh[unit] = &softl->iplt[unit];
 		softl->iplt[unit] = NULL;
 		softl->ipll[unit] = NULL;
 		bzero((char *)&softl->ipl_crc[unit],
 		      sizeof(softl->ipl_crc[unit]));
 	}
 
 	softl->ipl_log_init = 1;
 
 	return (0);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_soft_fini                                           */
 /* Returns:     int - 0 == success (always returned)                        */
 /* Parameters:  softc(I) - pointer to soft context main structure           */
 /*              arg(I)   - pointer to log context structure                 */
 /*                                                                          */
 /* Clean up any log data that has accumulated without being read.           */
 /* Then wait, waking sleepers as needed, until no reader remains in         */
 /* ipf_log_read() for each unit.  A no-op if the log is not initialised.    */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_soft_fini(ipf_main_softc_t *softc, void *arg)
 {
 	ipf_log_softc_t *softl = arg;
 	int i;
 
 	if (softl->ipl_log_init == 0)
 		return (0);
 
 	/*
 	 * Cleared first so that readers woken below see the change and
 	 * leave ipf_log_read() instead of going back to sleep.
 	 */
 	softl->ipl_log_init = 0;
 
 	for (i = IPL_LOGMAX; i >= 0; i--) {
 		/* Discard whatever is still queued; the byte count is unused. */
 		(void) ipf_log_clear(softc, i);
 
 		/*
 		 * This is a busy-wait loop so as to avoid yet another lock
 		 * to wait on.
 		 */
 		MUTEX_ENTER(&softl->ipl_mutex[i]);
 		while (softl->ipl_readers[i] > 0) {
 			/*
 			 * Wake the readers with the mutex released, then
 			 * retake it to re-check the reader count.
 			 */
 # if SOLARIS && defined(_KERNEL)
 			cv_broadcast(&softl->ipl_wait[i]);
 			MUTEX_EXIT(&softl->ipl_mutex[i]);
 			delay(100);
 			pollwakeup(&softc->ipf_poll_head[i], POLLRDNORM);
 # else
 			MUTEX_EXIT(&softl->ipl_mutex[i]);
 			WAKEUP(softl->iplh, i);
 			POLLWAKEUP(i);
 # endif
 			MUTEX_ENTER(&softl->ipl_mutex[i]);
 		}
 		MUTEX_EXIT(&softl->ipl_mutex[i]);
 	}
 
 	return (0);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_soft_destroy                                        */
 /* Parameters:  softc(I) - pointer to soft context main structure           */
 /*              arg(I)   - pointer to log context structure                 */
 /*                                                                          */
 /* Release a log context: destroy the per-unit locks, unlink and free the   */
 /* tuneable copy, then free the context itself.  Callers must ensure that   */
 /* no thread is still in the reading or the logging code path.              */
 /* ------------------------------------------------------------------------ */
 void
 ipf_log_soft_destroy(ipf_main_softc_t *softc, void *arg)
 {
 	ipf_log_softc_t *softl = arg;
 	int unit;
 
 	/* Tear down the per-unit synchronisation objects, last unit first. */
 	unit = IPL_LOGMAX;
 	while (unit >= 0) {
 # if SOLARIS && defined(_KERNEL)
 		cv_destroy(&softl->ipl_wait[unit]);
 # endif
 		MUTEX_DESTROY(&softl->ipl_mutex[unit]);
 		unit--;
 	}
 
 	/* Unregister and release this instance's copy of the tuneables. */
 	if (softl->ipf_log_tune != NULL) {
 		ipf_tune_array_unlink(softc, softl->ipf_log_tune);
 		KFREES(softl->ipf_log_tune, sizeof(ipf_log_tuneables));
 		softl->ipf_log_tune = NULL;
 	}
 
 	KFREE(softl);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_pkt                                                 */
 /* Returns:     int      - 0 == success, -1 == failure                      */
 /* Parameters:  fin(I)   - pointer to packet information                    */
 /*              flags(I) - flags from filter rules                          */
 /*                                                                          */
 /* Create a log record for a packet given that it has been triggered by a   */
 /* rule (or the default setting).  Calculate the transport protocol header  */
 /* size using predetermined size of a couple of popular protocols and thus  */
 /* how much data to copy into the log, including part of the data body if   */
 /* requested.                                                               */
 /*                                                                          */
 /* The on-stack ipflog_t is zeroed first so that fields and padding which   */
 /* are not assigned on a given platform never reach the log reader.         */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_pkt(fr_info_t *fin, u_int flags)
 {
 	ipf_main_softc_t *softc = fin->fin_main_soft;
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	register size_t hlen;
 	int types[2], mlen;
 	size_t sizes[2];
 	void *ptrs[2];
 	ipflog_t ipfl;
 	u_char p;
 	mb_t *m;
 # if SOLARIS && defined(_KERNEL) && !defined(FW_HOOKS)
 	qif_t *ifp;
 # else
 	struct ifnet *ifp;
 # endif /* SOLARIS */
 
 	/* Without packet data there is nothing to log. */
 	m = fin->fin_m;
 	if (m == NULL)
 		return (-1);
 
 	/*
 	 * All sizeof(ipfl) bytes are queued below and later copied out to
 	 * the reader, but not every field is assigned on every platform
 	 * (fl_unit is skipped where COPYIFNAME is used, and strncpy may
 	 * stop short), so start from a fully zeroed structure.  This also
 	 * leaves fl_nattag cleared when the packet has no NAT tag.
 	 */
 	bzero((char *)&ipfl, sizeof(ipfl));
 	ifp = fin->fin_ifp;
 	/* Length of the IP header(s): start of IP up to the payload. */
 	hlen = (char *)fin->fin_dp - (char *)fin->fin_ip;
 
 	/*
 	 * calculate header size.  Only packets with a zero fragment offset
 	 * get a transport header added; each addition is capped by the
 	 * payload length in fin_dlen.
 	 */
 	if (fin->fin_off == 0) {
 		p = fin->fin_fi.fi_p;
 		if (p == IPPROTO_TCP)
 			hlen += MIN(sizeof(tcphdr_t), fin->fin_dlen);
 		else if (p == IPPROTO_UDP)
 			hlen += MIN(sizeof(udphdr_t), fin->fin_dlen);
 		else if (p == IPPROTO_ICMP) {
 			struct icmp *icmp;
 
 			icmp = (struct icmp *)fin->fin_dp;
 
 			/*
 			 * For ICMP, if the packet is an error packet, also
 			 * include the information about the packet which
 			 * caused the error.
 			 */
 			switch (icmp->icmp_type)
 			{
 			case ICMP_UNREACH :
 			case ICMP_SOURCEQUENCH :
 			case ICMP_REDIRECT :
 			case ICMP_TIMXCEED :
 			case ICMP_PARAMPROB :
 				hlen += MIN(sizeof(struct icmp) + 8,
 					    fin->fin_dlen);
 				break;
 			default :
 				hlen += MIN(sizeof(struct icmp),
 					    fin->fin_dlen);
 				break;
 			}
 		}
 # ifdef USE_INET6
 		else if (p == IPPROTO_ICMPV6) {
 			struct icmp6_hdr *icmp;
 
 			icmp = (struct icmp6_hdr *)fin->fin_dp;
 
 			/*
 			 * For ICMPV6, if the packet is an error packet, also
 			 * include the information about the packet which
 			 * caused the error.  Types below 128 are treated as
 			 * errors here.
 			 */
 			if (icmp->icmp6_type < 128) {
 				hlen += MIN(sizeof(struct icmp6_hdr) + 8,
 					    fin->fin_dlen);
 			} else {
 				hlen += MIN(sizeof(struct icmp6_hdr),
 					    fin->fin_dlen);
 			}
 		}
 # endif
 	}
 	/*
 	 * Get the interface number and name to which this packet is
 	 * currently associated.
 	 */
 # if SOLARIS && defined(_KERNEL)
 #  if !defined(FW_HOOKS)
 	ipfl.fl_unit = (u_int)ifp->qf_ppa;
 #  endif
 	COPYIFNAME(fin->fin_v, ifp, ipfl.fl_ifname);
 # else
 #  if (defined(NetBSD) && (NetBSD  <= 1991011) && (NetBSD >= 199603)) || \
       defined(__FreeBSD__)
 	COPYIFNAME(fin->fin_v, ifp, ipfl.fl_ifname);
 #  else
 	ipfl.fl_unit = (u_int)ifp->if_unit;
 #   if defined(_KERNEL)
 	/* Copy up to four name characters, stopping after a NUL. */
 	if ((ipfl.fl_ifname[0] = ifp->if_name[0]))
 		if ((ipfl.fl_ifname[1] = ifp->if_name[1]))
 			if ((ipfl.fl_ifname[2] = ifp->if_name[2]))
 				ipfl.fl_ifname[3] = ifp->if_name[3];
 #   else
 	(void) strncpy(ipfl.fl_ifname, IFNAME(ifp), sizeof(ipfl.fl_ifname));
 	ipfl.fl_ifname[sizeof(ipfl.fl_ifname) - 1] = '\0';
 #   endif
 #  endif
 # endif /* SOLARIS */
 	/*
 	 * Work out how much of the body to log: nothing unless the rule
 	 * asked for it (FR_LOGBODY), and at most 128 bytes unless the
 	 * log_all tuneable is set.
 	 */
 	mlen = fin->fin_plen - hlen;
 	if (!softl->ipl_logall) {
 		mlen = (flags & FR_LOGBODY) ? MIN(mlen, 128) : 0;
 	} else if ((flags & FR_LOGBODY) == 0) {
 		mlen = 0;
 	}
 	if (mlen < 0)
 		mlen = 0;
 	/*
 	 * NOTE(review): both lengths are narrowed to u_char here while
 	 * sizes[1] below uses the full values - confirm readers cope when
 	 * log_all allows a body longer than 255 bytes.
 	 */
 	ipfl.fl_plen = (u_char)mlen;
 	ipfl.fl_hlen = (u_char)hlen;
 	ipfl.fl_rule = fin->fin_rule;
 	(void) strncpy(ipfl.fl_group, fin->fin_group, FR_GROUPLEN);
 	if (fin->fin_fr != NULL) {
 		ipfl.fl_loglevel = fin->fin_fr->fr_loglevel;
 		ipfl.fl_logtag = fin->fin_fr->fr_logtag;
 	} else {
 		/* No matching rule: use the "unset" level and tag. */
 		ipfl.fl_loglevel = 0xffff;
 		ipfl.fl_logtag = FR_NOLOGTAG;
 	}
 	if (fin->fin_nattag != NULL)
 		bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag,
 		      sizeof(ipfl.fl_nattag));
 	ipfl.fl_flags = flags;
 	ipfl.fl_breason = (fin->fin_reason & 0xff);
 	ipfl.fl_dir = fin->fin_out;
 	ipfl.fl_lflags = fin->fin_flx;
 	ipfl.fl_family = fin->fin_family;
 	/* Item 0: the fixed log header, copied as plain memory (type 0). */
 	ptrs[0] = (void *)&ipfl;
 	sizes[0] = sizeof(ipfl);
 	types[0] = 0;
 	/* Item 1: the packet headers plus the selected amount of body. */
 # if SOLARIS && defined(_KERNEL)
 	/*
 	 * Are we copied from the mblk or an aligned array ?
 	 */
 	if (fin->fin_ip == (ip_t *)m->b_rptr) {
 		ptrs[1] = m;
 		sizes[1] = hlen + mlen;
 		types[1] = 1;
 	} else {
 		ptrs[1] = fin->fin_ip;
 		sizes[1] = hlen + mlen;
 		types[1] = 0;
 	}
 # else
 	/* Type 1 tells ipf_log_items() to copy out of the mbuf. */
 	ptrs[1] = m;
 	sizes[1] = hlen + mlen;
 	types[1] = 1;
 # endif /* SOLARIS */
 	return (ipf_log_items(softc, IPL_LOGIPF, fin, ptrs, sizes, types, 2));
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_items                                               */
 /* Returns:     int       - 0 == success, -1 == failure                     */
 /* Parameters:  softc(I)  - pointer to main soft context                    */
 /*              unit(I)   - device we are reading from                      */
 /*              fin(I)    - pointer to packet information                   */
 /*              items(I)  - array of pointers to log data                   */
 /*              itemsz(I) - array of size of valid memory pointed to        */
 /*              types(I)  - type of data pointed to by items pointers       */
 /*              cnt(I)    - number of elements in arrays items/itemsz/types */
 /*                                                                          */
 /* Takes an array of parameters and constructs one record to include the    */
 /* miscellaneous packet information, as well as packet data, for reading    */
 /* from the log device.                                                     */
 /*                                                                          */
 /* Returns -1, after counting a failure, when the unit's buffer quota would */
 /* be exceeded or memory cannot be allocated.                               */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_items(ipf_main_softc_t *softc, int unit, fr_info_t *fin, void **items,
 	size_t *itemsz, int *types, int cnt)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	caddr_t buf, ptr;
 	iplog_t *ipl;
 	size_t len;
 	int i;
 	SPL_INT(s);
 
 	/*
 	 * Get the total amount of data to be logged.
 	 */
 	for (i = 0, len = sizeof(iplog_t); i < cnt; i++)
 		len += itemsz[i];
 
 	SPL_NET(s);
 	MUTEX_ENTER(&softl->ipl_mutex[unit]);
 	/* Counted for every attempt, so dropped records leave seqnum gaps. */
 	softl->ipl_counter[unit]++;
 	/*
 	 * check that we have space to record this information and can
 	 * allocate that much.
 	 */
 	if ((softl->ipl_used[unit] + len) > softl->ipl_logsize) {
 		softl->ipl_logfail[unit]++;
 		MUTEX_EXIT(&softl->ipl_mutex[unit]);
 		/* Undo SPL_NET() as every other exit from here does. */
 		SPL_X(s);
 		return (-1);
 	}
 
 	KMALLOCS(buf, caddr_t, len);
 	if (buf == NULL) {
 		softl->ipl_logfail[unit]++;
 		MUTEX_EXIT(&softl->ipl_mutex[unit]);
 		SPL_X(s);
 		return (-1);
 	}
 	/* The record header sits at the front of the buffer. */
 	ipl = (iplog_t *)buf;
 	ipl->ipl_magic = softl->ipl_magic[unit];
 	ipl->ipl_count = 1;
 	ipl->ipl_seqnum = softl->ipl_counter[unit];
 	ipl->ipl_next = NULL;
 	ipl->ipl_dsize = len;
 #ifdef _KERNEL
 	GETKTIME(&ipl->ipl_sec);
 #else
 	ipl->ipl_sec = 0;
 	ipl->ipl_usec = 0;
 #endif
 
 	/*
 	 * Loop through all the items to be logged, copying each one to the
 	 * buffer.  Use bcopy for normal data or the mb_t copyout routine.
 	 * An item of any other type is not copied, but its space in the
 	 * record is still skipped over.
 	 */
 	for (i = 0, ptr = buf + sizeof(*ipl); i < cnt; i++) {
 		if (types[i] == 0) {
 			bcopy(items[i], ptr, itemsz[i]);
 		} else if (types[i] == 1) {
 			COPYDATA(items[i], 0, itemsz[i], ptr);
 		}
 		ptr += itemsz[i];
 	}
 	/*
 	 * Check to see if this log record has a CRC which matches the last
 	 * record logged.  If it does, just up the count on the previous one
 	 * rather than create a new one.
 	 */
 	if (softl->ipl_suppress) {
 		if ((fin != NULL) && (fin->fin_off == 0)) {
 			if ((softl->ipll[unit] != NULL) &&
 			    (fin->fin_crc == softl->ipl_crc[unit].fin_crc) &&
 			    bcmp((char *)fin, (char *)&softl->ipl_crc[unit],
 				 FI_LCSIZE) == 0) {
 				softl->ipll[unit]->ipl_count++;
 				MUTEX_EXIT(&softl->ipl_mutex[unit]);
 				SPL_X(s);
 				/* Duplicate: the new buffer is not needed. */
 				KFREES(buf, len);
 				return (0);
 			}
 			/* Remember this packet for the next comparison. */
 			bcopy((char *)fin, (char *)&softl->ipl_crc[unit],
 			      FI_LCSIZE);
 			softl->ipl_crc[unit].fin_crc = fin->fin_crc;
 		} else
 			bzero((char *)&softl->ipl_crc[unit], FI_CSIZE);
 	}
 
 	/*
 	 * advance the log pointer to the next empty record and deduct the
 	 * amount of space we're going to use.
 	 */
 	softl->ipl_logok[unit]++;
 	softl->ipll[unit] = ipl;
 	*softl->iplh[unit] = ipl;
 	softl->iplh[unit] = &ipl->ipl_next;
 	softl->ipl_used[unit] += len;
 
 	/*
 	 * Now that the log record has been completed and added to the queue,
 	 * wake up any listeners who may want to read it.
 	 */
 # if SOLARIS && defined(_KERNEL)
 	cv_signal(&softl->ipl_wait[unit]);
 	MUTEX_EXIT(&softl->ipl_mutex[unit]);
 	pollwakeup(&softc->ipf_poll_head[unit], POLLRDNORM);
 # else
 	MUTEX_EXIT(&softl->ipl_mutex[unit]);
 	WAKEUP(softl->iplh, unit);
 	POLLWAKEUP(unit);
 # endif
 	SPL_X(s);
 	return (0);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_read                                                */
 /* Returns:     int      - 0 == success, else error value.                  */
 /* Parameters:  softc(I) - pointer to main soft context                     */
 /*              unit(I)  - device we are reading from                       */
 /*              uio(O)   - pointer to information about where to store data */
 /*                                                                          */
 /* Called to handle a read on an IPFilter device.  Returns only complete    */
 /* log messages - will not partially copy a log record out to userland.     */
 /*                                                                          */
 /* NOTE: This function will block and wait for a signal to return data if   */
 /* there is none present.  Asynchronous I/O is not implemented.             */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_read(ipf_main_softc_t *softc, minor_t unit, struct uio *uio)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	size_t dlen;
 	int error = 0;
 	iplog_t *ipl;
 	SPL_INT(s);
 
 	/* Logging not initialised: record the error code, return no data. */
 	if (softl->ipl_log_init == 0) {
 		IPFERROR(40007);
 		return (0);
 	}
 
 	/*
 	 * Sanity checks.  Make sure the minor # is valid and we're copying
 	 * a valid chunk of data.
 	 */
 	if (IPL_LOGMAX < unit) {
 		IPFERROR(40001);
 		return (ENXIO);
 	}
 	if (uio->uio_resid == 0)
 		return (0);
 
 	/* The buffer must hold at least a record header ... */
 	if (uio->uio_resid < sizeof(iplog_t)) {
 		IPFERROR(40002);
 		return (EINVAL);
 	}
 	/* ... and may not be larger than the log itself. */
 	if (uio->uio_resid > softl->ipl_logsize) {
 		IPFERROR(40005);
 		return (EINVAL);
 	}
 
 	/*
 	 * Lock the log so we can snapshot the variables.  Wait for a signal
 	 * if the log is empty.
 	 */
 	SPL_NET(s);
 	MUTEX_ENTER(&softl->ipl_mutex[unit]);
 	/* ipf_log_soft_fini() waits for this count to drop back to zero. */
 	softl->ipl_readers[unit]++;
 
 	while (softl->ipl_log_init == 1 && softl->iplt[unit] == NULL) {
 # if SOLARIS && defined(_KERNEL)
 		if (!cv_wait_sig(&softl->ipl_wait[unit],
 				 &softl->ipl_mutex[unit].ipf_lk)) {
 			softl->ipl_readers[unit]--;
 			MUTEX_EXIT(&softl->ipl_mutex[unit]);
 			/* Undo SPL_NET() as the normal exit does. */
 			SPL_X(s);
 			IPFERROR(40003);
 			return (EINTR);
 		}
 # else
 		MUTEX_EXIT(&softl->ipl_mutex[unit]);
 		SPL_X(s);
 		error = SLEEP(unit + softl->iplh, "ipl sleep");
 		SPL_NET(s);
 		MUTEX_ENTER(&softl->ipl_mutex[unit]);
 		if (error) {
 			softl->ipl_readers[unit]--;
 			MUTEX_EXIT(&softl->ipl_mutex[unit]);
 			/* Undo SPL_NET() as the normal exit does. */
 			SPL_X(s);
 			IPFERROR(40004);
 			return (error);
 		}
 # endif /* SOLARIS */
 	}
 	/* Woken because logging is being shut down, not because of data. */
 	if (softl->ipl_log_init != 1) {
 		softl->ipl_readers[unit]--;
 		MUTEX_EXIT(&softl->ipl_mutex[unit]);
 		SPL_X(s);
 		IPFERROR(40008);
 		return (EIO);
 	}
 
 # if defined(BSD)
 	uio->uio_rw = UIO_READ;
 # endif
 
 	/* Hand out whole records, oldest first, while they still fit. */
 	for (; (ipl = softl->iplt[unit]) != NULL;) {
 		dlen = ipl->ipl_dsize;
 		if (dlen > uio->uio_resid)
 			break;
 		/*
 		 * Don't hold the mutex over the uiomove call.
 		 *
 		 * NOTE(review): while the mutex is dropped, iplh[unit] may
 		 * still point at this record's ipl_next (when it was the
 		 * only one queued), so a record appended by ipf_log_items()
 		 * during the copy looks like it could be lost - confirm.
 		 */
 		softl->iplt[unit] = ipl->ipl_next;
 		softl->ipl_used[unit] -= dlen;
 		MUTEX_EXIT(&softl->ipl_mutex[unit]);
 		SPL_X(s);
 		error = UIOMOVE(ipl, dlen, UIO_READ, uio);
 		if (error) {
 			/* Copy failed: put the record back at the head. */
 			SPL_NET(s);
 			MUTEX_ENTER(&softl->ipl_mutex[unit]);
 			IPFERROR(40006);
 			ipl->ipl_next = softl->iplt[unit];
 			softl->iplt[unit] = ipl;
 			softl->ipl_used[unit] += dlen;
 			break;
 		}
 		MUTEX_ENTER(&softl->ipl_mutex[unit]);
 		KFREES((caddr_t)ipl, dlen);
 		SPL_NET(s);
 	}
 	/* Queue drained: reset it to the empty state. */
 	if (!softl->iplt[unit]) {
 		softl->ipl_used[unit] = 0;
 		softl->iplh[unit] = &softl->iplt[unit];
 		softl->ipll[unit] = NULL;
 	}
 
 	softl->ipl_readers[unit]--;
 	MUTEX_EXIT(&softl->ipl_mutex[unit]);
 	SPL_X(s);
 	return (error);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_clear                                               */
 /* Returns:     int      - number of log bytes cleared.                     */
 /* Parameters:  softc(I) - pointer to main soft context                     */
 /*              unit(I)  - log unit (device) to clear                       */
 /*                                                                          */
 /* Free every record queued for the given log unit and reset the unit's     */
 /* queue pointers, byte count and duplicate-suppression state.              */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_clear(ipf_main_softc_t *softc, minor_t unit)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	iplog_t *rec, *next;
 	int cleared;
 	SPL_INT(s);
 
 	SPL_NET(s);
 	MUTEX_ENTER(&softl->ipl_mutex[unit]);
 
 	/* Walk the queue from the oldest record, freeing as we go. */
 	for (rec = softl->iplt[unit]; rec != NULL; rec = next) {
 		next = rec->ipl_next;
 		KFREES((caddr_t)rec, rec->ipl_dsize);
 	}
 
 	/* Back to the empty-queue state. */
 	softl->iplt[unit] = NULL;
 	softl->iplh[unit] = &softl->iplt[unit];
 	softl->ipll[unit] = NULL;
 
 	/* Report how much was queued, then zero the accounting. */
 	cleared = softl->ipl_used[unit];
 	softl->ipl_used[unit] = 0;
 	bzero((char *)&softl->ipl_crc[unit], FI_CSIZE);
 
 	MUTEX_EXIT(&softl->ipl_mutex[unit]);
 	SPL_X(s);
 	return (cleared);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_canread                                             */
 /* Returns:     int      - 0 == no data to read, 1 = data present           */
 /* Parameters:  softc(I) - pointer to main soft context                     */
 /*              unit(I)  - device we are reading from                       */
 /*                                                                          */
 /* Returns an indication of whether or not there is data present in the     */
 /* current buffer for the selected ipf device.  A missing log context is    */
 /* reported as "no data".                                                   */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_canread(ipf_main_softc_t *softc, int unit)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 
 	/* Same guard as the other query functions in this file. */
 	if (softl == NULL)
 		return (0);
 
 	return (softl->iplt[unit] != NULL);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_bytesused                                           */
 /* Returns:     int      - number of bytes held, 0 if no log context        */
 /* Parameters:  softc(I) - pointer to main soft context                     */
 /*              unit(I)  - device we are reading from                       */
 /*                                                                          */
 /* Returns how many bytes are currently held in log buffers for the         */
 /* selected ipf device.                                                     */
 /* ------------------------------------------------------------------------ */
 int
 ipf_log_bytesused(ipf_main_softc_t *softc, int unit)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	int held = 0;
 
 	if (softl != NULL)
 		held = softl->ipl_used[unit];
 
 	return (held);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_failures                                            */
 /* Returns:     u_long   - number of log failures                           */
 /* Parameters:  softc(I) - pointer to main soft context                     */
 /*              unit(I)  - device we are reading from                       */
 /*                                                                          */
 /* Returns how many times we've tried to log a packet but failed to do so   */
 /* for the selected ipf device.                                             */
 /* ------------------------------------------------------------------------ */
 u_long
 ipf_log_failures(ipf_main_softc_t *softc, int unit)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	u_long failures = 0;
 
 	/* No log context means nothing was ever counted. */
 	if (softl != NULL)
 		failures = softl->ipl_logfail[unit];
 
 	return (failures);
 }
 
 
 /* ------------------------------------------------------------------------ */
 /* Function:    ipf_log_logok                                               */
 /* Returns:     u_long   - number of packets logged                         */
 /* Parameters:  softc(I) - pointer to main soft context                     */
 /*              unit(I)  - device we are reading from                       */
 /*                                                                          */
 /* Returns how many times we've successfully logged a packet for the        */
 /* selected ipf device.                                                     */
 /* ------------------------------------------------------------------------ */
 u_long
 ipf_log_logok(ipf_main_softc_t *softc, int unit)
 {
 	ipf_log_softc_t *softl = softc->ipf_log_soft;
 	u_long logged = 0;
 
 	/* No log context means nothing was ever counted. */
 	if (softl != NULL)
 		logged = softl->ipl_logok[unit];
 
 	return (logged);
 }
 #endif /* IPFILTER_LOG */
diff --git a/sys/netpfil/ipfw/ip_dn_io.c b/sys/netpfil/ipfw/ip_dn_io.c
index 090efd303858..80ea222459e3 100644
--- a/sys/netpfil/ipfw/ip_dn_io.c
+++ b/sys/netpfil/ipfw/ip_dn_io.c
@@ -1,990 +1,991 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa
  * All rights reserved
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Dummynet portions related to packet handling.
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
 #include <net/if_var.h>	/* NET_EPOCH_... */
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>		/* ip_len, ip_off */
 #include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
 #include <netinet/ip_fw.h>
 #include <netinet/ip_dummynet.h>
 #include <netinet/if_ether.h> /* various ether_* routines */
 #include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
 #include <netinet6/ip6_var.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/dn_heap.h>
 #include <netpfil/ipfw/ip_dn_private.h>
 #ifdef NEW_AQM
 #include <netpfil/ipfw/dn_aqm.h>
 #endif
 #include <netpfil/ipfw/dn_sched.h>
 
 /*
  * We keep a private variable for the simulation time, but we could
  * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
  * instead of V_dn_cfg.curr_time
  */
 VNET_DEFINE(struct dn_parms, dn_cfg);
 #define V_dn_cfg VNET(dn_cfg)
 
 /*
  * We use a heap to store entities for which we have pending timer events.
  * The heap is checked at every tick and all entities with expired events
  * are extracted.
  */
   
 MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
 
 /*
  * Bridge forwarding hook, defined outside this file.  dummynet_send()
  * calls it for DIR_FWD | PROTO_IFB packets when it is non-NULL.
  */
 extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
 
 #ifdef SYSCTL_NODE
 
 /*
  * Because of the way the SYSBEGIN/SYSEND macros work on other
  * platforms, there should not be functions between them.
  * So keep the handlers outside the block.
  */
 /*
  * Sysctl handler for the default hash table size.  A new value is only
  * accepted when it lies in [16, 65536]; otherwise EINVAL is returned and
  * the stored size is left alone.
  */
 static int
 sysctl_hash_size(SYSCTL_HANDLER_ARGS)
 {
 	int new_size, rc;
 
 	new_size = V_dn_cfg.hash_size;
 	rc = sysctl_handle_int(oidp, &new_size, 0, req);
 	if (rc != 0)
 		return (rc);
 	if (req->newptr == NULL)	/* read-only request, nothing to store */
 		return (0);
 
 	if (new_size >= 16 && new_size <= 65536) {
 		V_dn_cfg.hash_size = new_size;
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Sysctl handler shared by the two pipe queue limits.  arg2 selects the
  * field: zero means the byte limit (minimum 1500), non-zero means the
  * slot limit (minimum 1).  Values below the minimum yield EINVAL.
  */
 static int
 sysctl_limits(SYSCTL_HANDLER_ARGS)
 {
 	long new_limit;
 	int rc;
 
 	if (arg2 == 0)
 		new_limit = V_dn_cfg.byte_limit;
 	else
 		new_limit = V_dn_cfg.slot_limit;
 
 	rc = sysctl_handle_long(oidp, &new_limit, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	if (arg2 == 0) {
 		/* byte limit */
 		if (new_limit < 1500)
 			return (EINVAL);
 		V_dn_cfg.byte_limit = new_limit;
 		return (0);
 	}
 
 	/* slot limit */
 	if (new_limit < 1)
 		return (EINVAL);
 	V_dn_cfg.slot_limit = new_limit;
 	return (0);
 }
 
 /*
  * Sysctl tree net.inet.ip.dummynet.  Only declarations appear between
  * SYSBEGIN and SYSEND; the handler functions are defined outside the block.
  */
 SYSBEGIN(f4)
 
 SYSCTL_DECL(_net_inet);
 SYSCTL_DECL(_net_inet_ip);
 /* The node is static unless NEW_AQM is defined. */
 #ifdef NEW_AQM
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Dummynet");
 #else
 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Dummynet");
 #endif
 
 /* wrapper to pass V_dn_cfg fields to SYSCTL_* */
 #define DC(x)	(&(VNET_NAME(dn_cfg).x))
 
 /* parameters */
 
 SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_hash_size, "I",
     "Default hash table size");
 
 /*
  * Both limits share sysctl_limits(); the second numeric argument (arg2)
  * picks the field: 1 = slot limit, 0 = byte limit.
  */
 SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
     CTLTYPE_LONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 1, sysctl_limits, "L",
     "Upper limit in slots for pipe queue.");
 SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
     CTLTYPE_LONG | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, sysctl_limits, "L",
     "Upper limit in bytes for pipe queue.");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
     CTLFLAG_RW | CTLFLAG_VNET, DC(io_fast), 0, "Enable fast dummynet io.");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
     CTLFLAG_RW | CTLFLAG_VNET, DC(debug), 0, "Dummynet debug level");
 
 /* RED parameters (read-only) */
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
     CTLFLAG_RD | CTLFLAG_VNET, DC(red_lookup_depth), 0, "Depth of RED lookup table");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
     CTLFLAG_RD | CTLFLAG_VNET, DC(red_avg_pkt_size), 0, "RED Medium packet size");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
     CTLFLAG_RD | CTLFLAG_VNET, DC(red_max_pkt_size), 0, "RED Max packet size");
 
 /* time adjustment (read-only counters) */
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_delta), 0, "Last vs standard tick difference (usec).");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_delta_sum), 0, "Accumulated tick difference (usec).");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_adjustment), 0, "Tick adjustments done.");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_diff), 0,
     "Adjusted vs non-adjusted curr_time difference (ticks).");
 SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
     CTLFLAG_RD | CTLFLAG_VNET, DC(tick_lost), 0,
     "Number of ticks coalesced by dummynet taskqueue.");
 
 /* Drain parameters */
 SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
     CTLFLAG_RW | CTLFLAG_VNET, DC(expire), 0, "Expire empty queues/pipes");
 SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
     CTLFLAG_RD | CTLFLAG_VNET, DC(expire_cycle), 0, "Expire cycle for queues/pipes");
 
 /* statistics */
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
     CTLFLAG_RD | CTLFLAG_VNET, DC(schk_count), 0, "Number of schedulers");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
     CTLFLAG_RD | CTLFLAG_VNET, DC(si_count), 0, "Number of scheduler instances");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
     CTLFLAG_RD | CTLFLAG_VNET, DC(fsk_count), 0, "Number of flowsets");
 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
     CTLFLAG_RD | CTLFLAG_VNET, DC(queue_count), 0, "Number of queues");
 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
     CTLFLAG_RD | CTLFLAG_VNET, DC(io_pkt), 0,
     "Number of packets passed to dummynet.");
 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
     CTLFLAG_RD | CTLFLAG_VNET, DC(io_pkt_fast), 0,
     "Number of packets bypassed dummynet scheduler.");
 SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
     CTLFLAG_RD | CTLFLAG_VNET, DC(io_pkt_drop), 0,
     "Number of packets dropped by dummynet.");
 #undef DC
 SYSEND
 
 #endif
 
 static void	dummynet_send(struct mbuf *);
 
 /*
  * Locate the dummynet state carried by an mbuf.  The dummynet tag is
  * expected to be the first tag on the list (asserted below); the returned
  * pointer addresses the data stored right after the m_tag header.
  */
 struct dn_pkt_tag *
 dn_tag_get(struct mbuf *m)
 {
 	struct m_tag *tag;
 
 	tag = m_tag_first(m);
 #ifdef NEW_AQM
 	/* XXX: debugging only -- drop a leading timestamp tag and look again. */
 	if (tag != NULL && tag->m_tag_id == DN_AQM_MTAG_TS) {
 		m_tag_delete(m, tag);
 		tag = m_tag_first(m);
 		D("skip TS tag");
 	}
 #endif
 	KASSERT(tag != NULL &&
 	    tag->m_tag_cookie == MTAG_ABI_COMPAT &&
 	    tag->m_tag_id == PACKET_TAG_DUMMYNET,
 	    ("packet on dummynet queue w/o dummynet tag!"));
 
 	return ((struct dn_pkt_tag *)(tag + 1));
 }
 
 #ifndef NEW_AQM
 /*
  * Append mbuf m at the tail of packet queue q and bump the queue's packet
  * count.  m's m_nextpkt link is cleared, so m always becomes the new tail.
  *
  * In USERSPACE builds an mbuf flagged M_STACK is first duplicated into a
  * newly obtained mbuf, and the duplicate is what gets queued.
  */
 static inline void
 mq_append(struct mq *q, struct mbuf *m)
 {
 #ifdef USERSPACE
 	// buffers from netmap need to be copied
 	// XXX note that the routine is not expected to fail
 	ND("append %p to %p", m, q);
 	if (m->m_flags & M_STACK) {
 		struct mbuf *m_new;
 		void *p;
 		int l, ofs;
 
 		/* offset of the data pointer inside the original buffer */
 		ofs = m->m_data - m->__m_extbuf;
 		// XXX allocate
 		/* NOTE(review): m_new is dereferenced below without a NULL check -- confirm MGETHDR cannot fail here. */
 		MGETHDR(m_new, M_NOWAIT, MT_DATA);
 		ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p",
 			m, m->__m_extbuf, m->__m_extlen, ofs, m_new);
 		/* remember the new mbuf's own buffer before the struct copy overwrites it */
 		p = m_new->__m_extbuf;	/* new pointer */
 		l = m_new->__m_extlen;	/* new len */
 		/* panics unless the new buffer is strictly larger than the old one */
 		if (l <= m->__m_extlen) {
 			panic("extlen too large");
 		}
 
 		*m_new = *m;	// copy
 		m_new->m_flags &= ~M_STACK;
 		m_new->__m_extbuf = p; // point to new buffer
 		_pkt_copy(m->__m_extbuf, p, m->__m_extlen);
 		m_new->m_data = p + ofs;
 		m = m_new;
 	}
 #endif /* USERSPACE */
 	/* link at the tail; an empty queue is recognised by head == NULL */
 	if (q->head == NULL)
 		q->head = m;
 	else
 		q->tail->m_nextpkt = m;
 	q->count++;
 	q->tail = m;
 	m->m_nextpkt = NULL;
 }
 #endif
 
 /*
  * Dispose of a list of packets linked through m_nextpkt.  Kept as a
  * function so that any extra per-packet work has a single central place.
  */
 void
 dn_free_pkts(struct mbuf *mnext)
 {
 	struct mbuf *cur;
 
 	for (cur = mnext; cur != NULL; cur = mnext) {
 		/* fetch the successor before the packet is released */
 		mnext = cur->m_nextpkt;
 		FREE_PKT(cur);
 	}
 }
 
 /*
  * RED decision for a packet of 'len' bytes arriving on queue q.
  * Updates the queue's running average (q->avg) and the RED bookkeeping
  * fields q->count and q->random as a side effect.
  * Returns 1 when RED asks for the packet to be dropped (the caller,
  * dn_enqueue(), may mark it with ECN instead), 0 when it is accepted.
  */
 static int
 red_drops (struct dn_queue *q, int len)
 {
 	/*
 	 * RED algorithm
 	 *
 	 * RED calculates the average queue size (avg) using a low-pass filter
 	 * with an exponential weighted (w_q) moving average:
 	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
 	 * where q_size is the queue length (measured in bytes or packets).
 	 *
 	 * If q_size == 0, we compute the idle time for the link, and set
 	 *	avg = (1 - w_q)^(idle/s)
 	 * where s is the time needed for transmitting a medium-sized packet.
 	 *
 	 * Now, if avg < min_th the packet is enqueued.
 	 * If avg > max_th the packet is dropped. Otherwise, the packet is
 	 * dropped with probability P function of avg.
 	 */
 
 	struct dn_fsk *fs = q->fs;
 	/* drop probability; stays 0 when none of the branches below sets it */
 	int64_t p_b = 0;
 
 	/* Queue in bytes or packets? */
 	uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
 	    q->ni.len_bytes : q->ni.length;
 
 	/* Average queue size estimation. */
 	if (q_size != 0) {
 		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
 		int diff = SCALE(q_size) - q->avg;
 		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);
 
 		q->avg += (int)v;
 	} else {
 		/*
 		 * Queue is empty, find for how long the queue has been
 		 * empty and use a lookup table for computing
 		 * (1 - w_q)^(idle_time/s) where s is the time to send a
 		 * (small) packet.
 		 * XXX check wraps...
 		 */
 		if (q->avg) {
 			u_int t = div64((V_dn_cfg.curr_time - q->q_time), fs->lookup_step);
 
 			/* idle longer than the table covers: the average decays to 0 */
 			q->avg = (t < fs->lookup_depth) ?
 			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
 		}
 	}
 
 	/* Should i drop? */
 	if (q->avg < fs->min_th) {
 		/* -1 so that the next ++q->count below yields 0 and draws a new random */
 		q->count = -1;
 		return (0);	/* accept packet */
 	}
 	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
 		/* with ECN enabled, report congestion right away */
 		if (fs->fs.flags & DN_IS_ECN)
 			return (1);
 		if (fs->fs.flags & DN_IS_GENTLE_RED) {
 			/*
 			 * According to Gentle-RED, if avg is greater than
 			 * max_th the packet is dropped with a probability
 			 *	 p_b = c_3 * avg - c_4
 			 * where c_3 = (1 - max_p) / max_th
 			 *       c_4 = 1 - 2 * max_p
 			 */
 			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
 			    fs->c_4;
 		} else {
 			q->count = -1;
 			return (1);
 		}
 	} else if (q->avg > fs->min_th) {
 		/* with ECN enabled, report congestion right away */
 		if (fs->fs.flags & DN_IS_ECN)
 			return (1);
 		/*
 		 * We compute p_b using the linear dropping function
 		 *	 p_b = c_1 * avg - c_2
 		 * where c_1 = max_p / (max_th - min_th)
 		 * 	 c_2 = max_p * min_th / (max_th - min_th)
 		 */
 		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
 	}
 
 	/* in byte mode, weight the probability by len / max_pkt_size */
 	if (fs->fs.flags & DN_QSIZE_BYTES)
 		p_b = div64((p_b * len) , fs->max_pkt_size);
 	/* first packet after a reset to -1: only pick a new 16-bit random value */
 	if (++q->count == 0)
 		q->random = random() & 0xffff;
 	else {
 		/*
 		 * q->count counts packets arrived since last drop, so a greater
 		 * value of q->count means a greater packet drop probability.
 		 */
 		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
 			q->count = 0;
 			/* After a drop we calculate a new random value. */
 			q->random = random() & 0xffff;
 			return (1);	/* drop */
 		}
 	}
 	/* End of RED algorithm. */
 
 	return (0);	/* accept */
 
 }
 
 /*
  * ECN/ECT processing (partially adopted from altq).
  *
  * Looks at the IP header found at the offset recorded in the packet's
  * dummynet tag.  Returns 1 if the packet carries (or now carries) the
  * Congestion Experienced mark, 0 if it is not ECN-capable, if the
  * version field is inconsistent, or if the IP version is not handled.
  */
 #ifndef NEW_AQM
 static
 #endif
 int
 ecn_mark(struct mbuf* m)
 {
 	struct ip *ip;
 
 	ip = (struct ip *)mtodo(m, dn_tag_get(m)->iphdr_off);
 
 	if (ip->ip_v == IPVERSION) {
 		uint16_t before;
 		int ecn;
 
 		ecn = ip->ip_tos & IPTOS_ECN_MASK;
 		if (ecn == IPTOS_ECN_NOTECT)
 			return (0);	/* not-ECT */
 		if (ecn == IPTOS_ECN_CE)
 			return (1);	/* already marked */
 
 		/*
 		 * ECN-capable but not marked yet: set CE and patch the
 		 * header checksum for the changed first 16-bit word.
 		 */
 		before = *(uint16_t *)ip;
 		ip->ip_tos |= IPTOS_ECN_CE;
 		ip->ip_sum = cksum_adjust(ip->ip_sum, before, *(uint16_t *)ip);
 		return (1);
 	}
 #ifdef INET6
 	if (ip->ip_v == (IPV6_VERSION >> 4)) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
 		u_int32_t flow, ecn_bits;
 
 		flow = ntohl(ip6->ip6_flow);
 		if ((flow >> 28) != 6)
 			return (0);	/* version mismatch! */
 
 		ecn_bits = flow & (IPTOS_ECN_MASK << 20);
 		if (ecn_bits == (IPTOS_ECN_NOTECT << 20))
 			return (0);	/* not-ECT */
 		if (ecn_bits == (IPTOS_ECN_CE << 20))
 			return (1);	/* already marked */
 
 		/* ECN-capable but not marked yet: set CE. */
 		flow |= (IPTOS_ECN_CE << 20);
 		ip6->ip6_flow = htonl(flow);
 		return (1);
 	}
 #endif
 	return (0);
 }
 
 /*
  * Enqueue a packet in q, subject to space and queue management policy
  * (whose parameters are in q->fs).
  * Update stats for the queue and the scheduler.
  * A non-zero 'drop' argument forces the packet to be dropped (it is still
  * counted in the totals).
  * Return 0 on success, 1 on drop. The packet is consumed anyways.
  * When NEW_AQM is defined and the flowset has an AQM attached, the return
  * value is whatever that AQM's enqueue function returns.
  */
 int
 dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
 {   
 	struct dn_fs *f;
 	struct dn_flow *ni;	/* stats for scheduler instance */
 	uint64_t len;
 
 	/* a queue without flowset or scheduler instance cannot take packets */
 	if (q->fs == NULL || q->_si == NULL) {
 		printf("%s fs %p si %p, dropping\n",
 			__FUNCTION__, q->fs, q->_si);
 		FREE_PKT(m);
 		return 1;
 	}
 	f = &(q->fs->fs);
 	ni = &q->_si->ni;
 	len = m->m_pkthdr.len;
 	/* Update statistics, then check reasons to drop pkt. */
 	q->ni.tot_bytes += len;
 	q->ni.tot_pkts++;
 	ni->tot_bytes += len;
 	ni->tot_pkts++;
 	if (drop)
 		goto drop;
 	/* random loss: drop when the random draw falls below f->plr */
 	if (f->plr && random() < f->plr)
 		goto drop;
 	if (m->m_pkthdr.rcvif != NULL)
 		m_rcvif_serialize(m);
 #ifdef NEW_AQM
 	/* Call AQM enqueue function */
 	if (q->fs->aqmfp)
 		return q->fs->aqmfp->enqueue(q ,m);
 #endif
 	/* RED asked for a drop: with ECN enabled, try marking the packet instead */
 	if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len)) {
 		if (!(f->flags & DN_IS_ECN) || !ecn_mark(m))
 			goto drop;
 	}
 	/* tail drop against the configured queue size (bytes or packets) */
 	if (f->flags & DN_QSIZE_BYTES) {
 		if (q->ni.len_bytes > f->qsize)
 			goto drop;
 	} else if (q->ni.length >= f->qsize) {
 		goto drop;
 	}
 	mq_append(&q->mq, m);
 	q->ni.length++;
 	q->ni.len_bytes += len;
 	ni->length++;
 	ni->len_bytes += len;
 	return (0);
 
 drop:
 	/* count the drop globally, per queue and per scheduler instance */
 	V_dn_cfg.io_pkt_drop++;
 	q->ni.drops++;
 	ni->drops++;
 	FREE_PKT(m);
 	return (1);
 }
 
 /*
  * Move every packet of the delay line whose output time is due by 'now'
  * onto queue q.  A packet whose receive interface cannot be restored is
  * freed instead.  If packets remain afterwards, the delay line is put
  * back in the event heap, keyed by the output time of its new head.
  * Runs under scheduler lock.
  */
 static void
 transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
 {
 	struct dn_pkt_tag *tag = NULL;
 	struct mbuf *cur;
 
 	dline->oid.subtype = 0;	/* not in heap */
 	for (cur = dline->mq.head; cur != NULL; cur = dline->mq.head) {
 		tag = dn_tag_get(cur);
 		if (!DN_KEY_LEQ(tag->output_time, now))
 			break;		/* head is not due yet */
 
 		/* unlink the head of the delay line */
 		dline->mq.head = cur->m_nextpkt;
 		dline->mq.count--;
 		if (cur->m_pkthdr.rcvif != NULL &&
 		    __predict_false(m_rcvif_restore(cur) == NULL)) {
 			m_freem(cur);
 			continue;
 		}
 		mq_append(q, cur);
 	}
 	if (cur == NULL)
 		return;			/* delay line drained */
 
 	dline->oid.subtype = 1;	/* in heap */
 	heap_insert(&V_dn_cfg.evheap, tag->output_time, dline);
 }
 
 /*
  * Convert the additional MAC overheads/delays into an equivalent
  * number of bits for the given data rate. The samples are
  * in milliseconds so we need to divide by 1000.
  */
 static uint64_t
 extra_bits(struct mbuf *m, struct dn_schk *s)
 {
 	int index;
 	uint64_t bits;
 	struct dn_profile *pf = s->profile;
 
 	if (!pf || pf->samples_no == 0)
 		return 0;
 	index  = random() % pf->samples_no;
 	bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
 	if (index >= pf->loss_level) {
 		struct dn_pkt_tag *dt = dn_tag_get(m);
 		if (dt)
 			dt->dn_dir = DIR_DROP;
 	}
 	return bits;
 }
 
 /*
  * Send traffic from a scheduler instance due by 'now'.
  *
  * Packets are dequeued from the scheduler while the instance has
  * non-negative credit and are appended to the instance's delay line.
  * If the delay line was empty on entry and at least one packet was
  * dequeued, packets that are already due are moved on to q.
  *
  * q may be NULL, in which case a local scratch queue collects the output.
  * Return a pointer to the head of the queue.
  */
 static struct mbuf *
 serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now)
 {
 	struct mq def_q;
 	struct dn_schk *s = si->sched;
 	struct mbuf *m = NULL;
 	int delay_line_idle = (si->dline.mq.head == NULL);
 	int done;
 	uint32_t bw;
 
 	if (q == NULL) {
 		/*
 		 * Zero the whole scratch queue, not just head: mq_append()
 		 * increments the count field, which must not start out
 		 * indeterminate.
 		 */
 		memset(&def_q, 0, sizeof(def_q));
 		q = &def_q;
 	}
 
 	bw = s->link.bandwidth;
 	si->kflags &= ~DN_ACTIVE;
 
 	/* accrue credit for the time elapsed; bw == 0 means no rate limit */
 	if (bw > 0)
 		si->credit += (now - si->sched_time) * bw;
 	else
 		si->credit = 0;
 	si->sched_time = now;
 	done = 0;
 	while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) {
 		uint64_t len_scaled;
 
 		done++;
 		/* cost of this packet in credit units (bits scaled by hz) */
 		len_scaled = (bw == 0) ? 0 : hz *
 			(m->m_pkthdr.len * 8 + extra_bits(m, s));
 		si->credit -= len_scaled;
 		/* Move packet in the delay line */
 		dn_tag_get(m)->output_time = V_dn_cfg.curr_time + s->link.delay ;
 		if (m->m_pkthdr.rcvif != NULL)
 			m_rcvif_serialize(m);
 		mq_append(&si->dline.mq, m);
 	}
 
 	/*
 	 * If credit >= 0 the instance is idle, mark time.
 	 * Otherwise put back in the heap, and adjust the output
 	 * time of the last inserted packet, m, which was too early.
 	 */
 	if (si->credit >= 0) {
 		si->idle_time = now;
 	} else {
 		uint64_t t;
 		KASSERT (bw > 0, ("bw=0 and credit<0 ?"));
 		/* ticks needed to bring the credit back to zero, rounded up */
 		t = div64(bw - 1 - si->credit, bw);
 		if (m)
 			dn_tag_get(m)->output_time += t;
 		si->kflags |= DN_ACTIVE;
 		heap_insert(&V_dn_cfg.evheap, now + t, si);
 	}
 	if (delay_line_idle && done)
 		transmit_event(q, &si->dline, now);
 	return q->head;
 }
 
 /*
  * The timer handler for dummynet. Time is computed in ticks, but
  * the code is tolerant to the actual rate at which this is called.
  * Once complete, the function reschedules itself for the next tick.
  *
  * For every vnet whose dummynet state is initialised it updates the tick
  * accounting, advances curr_time, serves all heap events that are due,
  * periodically drains idle schedulers/queues, and finally sends the
  * packets collected in the meantime after dropping the dummynet lock.
  * 'pending' is used only to account for coalesced (lost) ticks.
  */
 void
 dummynet_task(void *context, int pending)
 {
 	struct timeval t;
 	struct mq q = { NULL, NULL }; /* queue to accumulate results */
 	struct epoch_tracker et;
 
 	VNET_ITERATOR_DECL(vnet_iter);
 	VNET_LIST_RLOCK();
 	NET_EPOCH_ENTER(et);
 
 	VNET_FOREACH(vnet_iter) {
 		/* start each vnet with an empty result queue */
 		memset(&q, 0, sizeof(struct mq));
 		CURVNET_SET(vnet_iter);
 
 		/* skip vnets where dummynet has not been set up */
 		if (! V_dn_cfg.init_done) {
 			CURVNET_RESTORE();
 			continue;
 		}
 
 		DN_BH_WLOCK();
 
 		/* Update number of lost(coalesced) ticks. */
 		V_dn_cfg.tick_lost += pending - 1;
 
 		getmicrouptime(&t);
 		/* Last tick duration (usec). */
 		V_dn_cfg.tick_last = (t.tv_sec - V_dn_cfg.prev_t.tv_sec) * 1000000 +
 		(t.tv_usec - V_dn_cfg.prev_t.tv_usec);
 		/* Last tick vs standard tick difference (usec). */
 		V_dn_cfg.tick_delta = (V_dn_cfg.tick_last * hz - 1000000) / hz;
 		/* Accumulated tick difference (usec). */
 		V_dn_cfg.tick_delta_sum += V_dn_cfg.tick_delta;
 
 		V_dn_cfg.prev_t = t;
 
 		/*
 		* Adjust curr_time if the accumulated tick difference is
 		* greater than the 'standard' tick. Since curr_time should
 		* be monotonically increasing, we do positive adjustments
 		* as required, and throttle curr_time in case of negative
 		* adjustment.
 		*/
 		V_dn_cfg.curr_time++;
 		if (V_dn_cfg.tick_delta_sum - tick >= 0) {
 			/* running late by at least one tick: jump ahead */
 			int diff = V_dn_cfg.tick_delta_sum / tick;
 
 			V_dn_cfg.curr_time += diff;
 			V_dn_cfg.tick_diff += diff;
 			V_dn_cfg.tick_delta_sum %= tick;
 			V_dn_cfg.tick_adjustment++;
 		} else if (V_dn_cfg.tick_delta_sum + tick <= 0) {
 			/* running early by at least one tick: undo the increment above */
 			V_dn_cfg.curr_time--;
 			V_dn_cfg.tick_diff--;
 			V_dn_cfg.tick_delta_sum += tick;
 			V_dn_cfg.tick_adjustment++;
 		}
 
 		/* serve pending events, accumulate in q */
 		for (;;) {
 			struct dn_id *p;    /* generic parameter to handler */
 
 			/* stop when the heap is empty or its top event lies in the future */
 			if (V_dn_cfg.evheap.elements == 0 ||
 			    DN_KEY_LT(V_dn_cfg.curr_time, HEAP_TOP(&V_dn_cfg.evheap)->key))
 				break;
 			p = HEAP_TOP(&V_dn_cfg.evheap)->object;
 			heap_extract(&V_dn_cfg.evheap, NULL);
 			if (p->type == DN_SCH_I) {
 				serve_sched(&q, (struct dn_sch_inst *)p, V_dn_cfg.curr_time);
 			} else { /* extracted a delay line */
 				transmit_event(&q, (struct delay_line *)p, V_dn_cfg.curr_time);
 			}
 		}
 		/* every 'expire' invocations, drain schedulers and queues */
 		if (V_dn_cfg.expire && ++V_dn_cfg.expire_cycle >= V_dn_cfg.expire) {
 			V_dn_cfg.expire_cycle = 0;
 			dn_drain_scheduler();
 			dn_drain_queue();
 		}
 		DN_BH_WUNLOCK();
 		/* deliver outside the dummynet lock */
 		if (q.head != NULL)
 			dummynet_send(q.head);
 
 		CURVNET_RESTORE();
 	}
 	NET_EPOCH_EXIT(et);
 	VNET_LIST_RUNLOCK();
 
 	/* Schedule our next run. */
 	dn_reschedule();
 }
 
 /*
  * Forward a chain of packets (linked through m_nextpkt) to the proper
  * destination.  Each packet is detached from the chain and dispatched
  * according to the direction stored in its dummynet tag; every packet is
  * consumed, either by the layer it is handed to or by FREE_PKT().
  * This runs outside the dummynet lock; the caller must be inside a
  * network epoch section (asserted below).
  */
 static void
 dummynet_send(struct mbuf *m)
 {
 	struct mbuf *n;
 
 	NET_EPOCH_ASSERT();
 
 	for (; m != NULL; m = n) {
 		struct ifnet *ifp = NULL;	/* gcc 3.4.6 complains */
 		struct m_tag *tag;
 		int dst;
 
 		/* detach the packet; n remembers the rest of the chain */
 		n = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		tag = m_tag_first(m);
 		if (tag == NULL) { /* should not happen */
 			dst = DIR_DROP;
 		} else {
 			/*
 			 * NOTE(review): with NEW_AQM, dn_tag_get() may delete
 			 * the first tag, leaving 'tag' stale -- confirm.
 			 */
 			struct dn_pkt_tag *pkt = dn_tag_get(m);
 			/* extract the dummynet info, rename the tag
 			 * to carry reinject info.
 			 */
 			ifp = ifnet_byindexgen(pkt->if_index, pkt->if_idxgen);
 			/* layer2 output needs the interface; drop if it is gone */
 			if (((pkt->dn_dir == (DIR_OUT | PROTO_LAYER2)) ||
 			    (pkt->dn_dir == (DIR_OUT | PROTO_LAYER2 | PROTO_IPV6))) &&
 				ifp == NULL) {
 				dst = DIR_DROP;
 			} else {
 				dst = pkt->dn_dir;
 				tag->m_tag_cookie = MTAG_IPFW_RULE;
 				tag->m_tag_id = 0;
 			}
 		}
 
 		switch (dst) {
 		case DIR_OUT:
 			ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL);
 			break ;
 
 		case DIR_IN :
 			netisr_dispatch(NETISR_IP, m);
 			break;
 
 #ifdef INET6
 		case DIR_IN | PROTO_IPV6:
 			netisr_dispatch(NETISR_IPV6, m);
 			break;
 
 		case DIR_OUT | PROTO_IPV6:
 			ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
 			break;
 #endif
 
 		case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */
 			if (bridge_dn_p != NULL) {
 				((*bridge_dn_p)(m, ifp));
 			} else {
 				/*
 				 * Nobody can take the packet: free it
 				 * instead of leaking the mbuf.
 				 */
 				printf("dummynet: if_bridge not loaded\n");
 				FREE_PKT(m);
 			}
 			break;
 
 		case DIR_IN | PROTO_LAYER2 | PROTO_IPV6:
 		case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */
 			/*
 			 * The Ethernet code assumes the Ethernet header is
 			 * contiguous in the first mbuf header.
 			 * Ensure this is true.
 			 */
 			if (m->m_len < ETHER_HDR_LEN &&
 			    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
 				printf("dummynet/ether: pullup failed, "
 				    "dropping packet\n");
 				break;
 			}
 			ether_demux(m->m_pkthdr.rcvif, m);
 			break;
 
 		case DIR_OUT | PROTO_LAYER2 | PROTO_IPV6:
 		case DIR_OUT | PROTO_LAYER2: /* DN_TO_ETH_OUT: */
 			MPASS(ifp != NULL);
 			ether_output_frame(ifp, m);
 			break;
 
 		case DIR_DROP:
 			/* drop the packet after some time */
 			FREE_PKT(m);
 			break;
 
 		default:
 			printf("dummynet: bad switch %d!\n", dst);
 			FREE_PKT(m);
 			break;
 		}
 	}
 }
 
 /*
  * Attach a zeroed dummynet tag to the front of m's tag list and fill it
  * from the firewall arguments: the matching rule (keeping only the
  * ONEPASS and IS_DUMMYNET info bits), the direction, and -- for outbound
  * packets with a known interface -- the interface index and generation.
  * Returns 0 on success, 1 if the tag cannot be allocated.
  */
 static inline int
 tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa)
 {
 	struct m_tag *tag;
 	struct dn_pkt_tag *state;
 
 	tag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*state),
 	    M_NOWAIT | M_ZERO);
 	if (tag == NULL)
 		return (1);		/* Cannot allocate packet header. */
 	m_tag_prepend(m, tag);		/* Attach to mbuf chain. */
 
 	/* the dummynet state lives right behind the tag header */
 	state = (struct dn_pkt_tag *)(tag + 1);
 	state->rule = fwa->rule;
 	/* only keep this info */
 	state->rule.info &= (IPFW_ONEPASS | IPFW_IS_DUMMYNET);
 	state->dn_dir = dir;
 	if ((fwa->flags & IPFW_ARGS_OUT) != 0 && fwa->ifp != NULL) {
 		NET_EPOCH_ASSERT();
 		state->if_index = fwa->ifp->if_index;
 		state->if_idxgen = fwa->ifp->if_idxgen;
 	}
 	/* output_time is updated as the packet moves through dummynet */
 	state->output_time = V_dn_cfg.curr_time;
 	if ((dir & PROTO_LAYER2) != 0)
 		state->iphdr_off = ETHER_HDR_LEN;
 	else
 		state->iphdr_off = 0;
 
 	return (0);
 }
 
 /*
  * dummynet hook for packets.
  * We use the argument to locate the flowset fs and the sched_set sch
  * associated to it. Then we apply flow_mask and sched_mask to
  * determine the queue and scheduler instances.
  *
  * On return *m0 is NULL when the packet was queued, sent on or dropped;
  * it is left untouched only on the fast path, where the scheduler handed
  * back the very packet just enqueued (non-layer2 only) and the caller
  * continues with it.
  * Returns 0 on success.  On a drop it returns ENOBUFS, unless the flowset
  * was found and has DN_NOERROR set, in which case 0 is returned.
  */
 int
 dummynet_io(struct mbuf **m0, struct ip_fw_args *fwa)
 {
 	struct mbuf *m = *m0;
 	struct dn_fsk *fs = NULL;
 	struct dn_sch_inst *si;
 	struct dn_queue *q = NULL;	/* default */
 	int fs_id, dir;
 
 	/* pipes are looked up with their id offset by 2*DN_MAX_ID */
 	fs_id = (fwa->rule.info & IPFW_INFO_MASK) +
 		((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0);
 	/* XXXGL: convert args to dir */
 	if (fwa->flags & IPFW_ARGS_IN)
 		dir = DIR_IN;
 	else
 		dir = DIR_OUT;
 	if (fwa->flags & IPFW_ARGS_ETHER)
 		dir |= PROTO_LAYER2;
 	else if (fwa->flags & IPFW_ARGS_IP6)
 		dir |= PROTO_IPV6;
 	DN_BH_WLOCK();
 	V_dn_cfg.io_pkt++;
 	/* we could actually tag outside the lock, but who cares... */
 	if (tag_mbuf(m, dir, fwa))
 		goto dropit;
 	/* XXX locate_flowset could be optimised with a direct ref. */
 	fs = dn_ht_find(V_dn_cfg.fshash, fs_id, 0, NULL);
 	if (fs == NULL)
 		goto dropit;	/* This queue/pipe does not exist! */
 	if (fs->sched == NULL)	/* should not happen */
 		goto dropit;
 	/* find scheduler instance, possibly applying sched_mask */
 	si = ipdn_si_find(fs->sched, &(fwa->f_id));
 	if (si == NULL)
 		goto dropit;
 	/*
 	 * If the scheduler supports multiple queues, find the right one
 	 * (otherwise it will be ignored by enqueue).
 	 */
 	if (fs->sched->fp->flags & DN_MULTIQUEUE) {
 		q = ipdn_q_find(fs, si, &(fwa->f_id));
 		if (q == NULL)
 			goto dropit;
 	}
 	if (fs->sched->fp->enqueue(si, q, m)) {
 		/* packet was dropped by enqueue() */
 		/* clear m so that dropit does not free it a second time */
 		m = *m0 = NULL;
 
 		/* dn_enqueue already increases io_pkt_drop */
 		V_dn_cfg.io_pkt_drop--;
 
 		goto dropit;
 	}
 
 	if (si->kflags & DN_ACTIVE) {
 		m = *m0 = NULL; /* consumed */
 		goto done; /* already active, nothing to do */
 	}
 
 	/* compute the initial allowance */
 	if (si->idle_time < V_dn_cfg.curr_time) {
 	    /* Do this only on the first packet on an idle pipe */
 	    struct dn_link *p = &fs->sched->link;
 
 	    si->sched_time = V_dn_cfg.curr_time;
 	    si->credit = V_dn_cfg.io_fast ? p->bandwidth : 0;
 	    if (p->burst) {
 		/* credit earned while idle, capped at the configured burst */
 		uint64_t burst = (V_dn_cfg.curr_time - si->idle_time) * p->bandwidth;
 		if (burst > p->burst)
 			burst = p->burst;
 		si->credit += burst;
 	    }
 	}
 	/* pass through scheduler and delay line */
 	m = serve_sched(NULL, si, V_dn_cfg.curr_time);
 
 	/* optimization -- pass it back to ipfw for immediate send */
 	/* XXX Don't call dummynet_send() if scheduler return the packet
 	 *     just enqueued. This avoid a lock order reversal.
 	 *     
 	 */
 	if (/*V_dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) {
 		/* fast io, rename the tag * to carry reinject info. */
 		struct m_tag *tag = m_tag_first(m);
 
 		tag->m_tag_cookie = MTAG_IPFW_RULE;
 		tag->m_tag_id = 0;
 		V_dn_cfg.io_pkt_fast++;
 		if (m->m_nextpkt != NULL) {
 			printf("dummynet: fast io: pkt chain detected!\n");
 			m->m_nextpkt = NULL;
 		}
 		/* *m0 still points at the packet; nothing left to send here */
 		m = NULL;
 	} else {
 		*m0 = NULL;
 	}
 done:
 	DN_BH_WUNLOCK();
 	/* anything returned by the scheduler is sent outside the lock */
 	if (m)
 		dummynet_send(m);
 	return 0;
 
 dropit:
 	V_dn_cfg.io_pkt_drop++;
 	DN_BH_WUNLOCK();
 	if (m)
 		FREE_PKT(m);
 	*m0 = NULL;
 	return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS;
 }
diff --git a/sys/netpfil/ipfw/ip_fw2.c b/sys/netpfil/ipfw/ip_fw2.c
index 49f64a851e5f..f2e914e24007 100644
--- a/sys/netpfil/ipfw/ip_fw2.c
+++ b/sys/netpfil/ipfw/ip_fw2.c
@@ -1,3672 +1,3673 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * The FreeBSD IP packet firewall, main file
  */
 
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_inet.h"
 #ifndef INET
 #error "IPFIREWALL requires INET"
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/counter.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/jail.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_carp.h>
 #include <netinet/pim.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #include <netinet/sctp_header.h>
 
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/in_fib.h>
 #ifdef INET6
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <net/if_gre.h> /* for struct grehdr */
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * IPFW_PROBE fires the six-argument SDT probe named `probe` on the "ipfw"
  * provider; the module and function fields of the probe name are left empty.
  */
 #define	IPFW_PROBE(probe, arg0, arg1, arg2, arg3, arg4, arg5)		\
     SDT_PROBE6(ipfw, , , probe, arg0, arg1, arg2, arg3, arg4, arg5)
 
 /*
  * Define the "ipfw" provider and its single probe, rule__matched.  The
  * strings below are the declared types of the six probe arguments, in order.
  */
 SDT_PROVIDER_DEFINE(ipfw);
 SDT_PROBE_DEFINE6(ipfw, , , rule__matched,
     "int",			/* retval */
     "int",			/* af */
     "void *",			/* src addr */
     "void *",			/* dst addr */
     "struct ip_fw_args *",	/* args */
     "struct ip_fw *"		/* rule */);
 
 /*
  * static variables followed by global ones.
  * All ipfw global variables are here.
  */
 
 /* Per-vnet knob; exported below as the IPv6 sysctl deny_unknown_exthdrs. */
 VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs);
 #define	V_fw_deny_unknown_exthdrs	VNET(fw_deny_unknown_exthdrs)
 
 /* Per-vnet knob, enabled by default; exported as permit_single_frag6. */
 VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1;
 #define	V_fw_permit_single_frag6	VNET(fw_permit_single_frag6)
 
 /*
  * Compile-time default for the default_to_accept setting: 1 when the kernel
  * is built with IPFIREWALL_DEFAULT_TO_ACCEPT, otherwise zero-initialized.
  */
 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
 static int default_to_accept = 1;
 #else
 static int default_to_accept;
 #endif
 
 VNET_DEFINE(int, autoinc_step);
 VNET_DEFINE(int, fw_one_pass) = 1;
 
 VNET_DEFINE(unsigned int, fw_tables_max);
 VNET_DEFINE(unsigned int, fw_tables_sets) = 0;	/* Don't use set-aware tables */
 /* Use 128 tables by default */
 static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT;
 
 /*
  * JUMP() resolves a jump target to a rule position.  Unless LINEAR_SKIPTO
  * is defined it goes through jump_cached(); with LINEAR_SKIPTO it calls
  * jump_lookup_pos() directly.  Both take the same argument list.
  */
 static int jump_lookup_pos(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #ifndef LINEAR_SKIPTO
 static int jump_cached(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_cached(ch, f, num, targ, back)
 #else
 #define	JUMP(ch, f, num, targ, back)	jump_lookup_pos(ch, f, num, targ, back)
 #endif
 
 /*
  * Each rule belongs to one of 32 different sets (0..31).
  * The variable set_disable contains one bit per set.
  * If the bit is set, all rules in the corresponding set
  * are disabled. Set RESVD_SET(31) is reserved for the default rule
  * and rules that are not deleted by the flush command,
  * and CANNOT be disabled.
  * Rules in set RESVD_SET can only be deleted individually.
  */
 VNET_DEFINE(u_int32_t, set_disable);
 #define	V_set_disable			VNET(set_disable)
 
 VNET_DEFINE(int, fw_verbose);
 /* counter for ipfw_log(NULL...) */
 VNET_DEFINE(u_int64_t, norule_counter);
 VNET_DEFINE(int, verbose_limit);
 
 /* layer3_chain contains the list of rules for layer 3 */
 VNET_DEFINE(struct ip_fw_chain, layer3_chain);
 
 /* ipfw_vnet_ready controls when we are open for business */
 VNET_DEFINE(int, ipfw_vnet_ready) = 0;
 
 VNET_DEFINE(int, ipfw_nat_ready) = 0;
 
 /*
  * Function-pointer hooks for NAT support.  They are defined here without a
  * target (ipfw_nat_ptr explicitly NULL, the others zero-initialized);
  * presumably the NAT module installs them when it loads -- confirm against
  * the code that assigns them.
  */
 ipfw_nat_t *ipfw_nat_ptr = NULL;
 struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
 ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_del_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
 
 /*
  * sysctl interface, compiled only when SYSCTL_NODE is available.  Knobs are
  * attached under _net_inet_ip (node "fw") and, with INET6, under
  * _net_inet6_ip6 (node "fw").
  */
 #ifdef SYSCTL_NODE
 /* Backing storage for the read-only default_rule sysctl below. */
 uint32_t dummy_def = IPFW_DEFAULT_RULE;
 static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS);
 static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS);
 
 SYSBEGIN(f3)
 
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Firewall");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
     "Only do a single pass through ipfw when using dummynet(4)");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
     "Rule number auto-increment step");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
     "Log matches to ipfw rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_def, 0,
     "The default/max possible rule number.");
 /* tables_max and tables_sets are served by handler functions, not raw ints. */
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_ipfw_table_num, "IU",
     "Maximum number of concurrently used tables");
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_ipfw_tables_sets, "IU",
     "Use per-set namespace for tables");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
 /* The tables_max tunable seeds default_fw_tables, not the per-vnet value. */
 TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables);
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count,
     CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
     "Number of static rules");
 
 #ifdef INET6
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Firewall");
 SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
     &VNET_NAME(fw_deny_unknown_exthdrs), 0,
     "Deny packets with unknown IPv6 Extension Headers");
 SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
     &VNET_NAME(fw_permit_single_frag6), 0,
     "Permit single packet IPv6 fragments");
 #endif /* INET6 */
 
 SYSEND
 
 #endif /* SYSCTL_NODE */
 
 /*
  * Some macros used in the various matching options.
  *
  * L3HDR(T, ip) takes a pointer to an IPv4 header and yields a pointer of
  * type T * located ip->ip_hl 32-bit words past the start of that header,
  * i.e. at the first byte following the IPv4 header and its options.
  *
  * The remaining macros perform no arithmetic: they only cast an untyped
  * pointer to the named protocol header type.
  */
 #define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 #define	ICMP(p)		((struct icmphdr *)(p))
 #define	ICMP6(p)	((struct icmp6_hdr *)(p))
 
 /*
  * Return 1 if the ICMP type of the packet is selected by the bitmask in
  * cmd->d[0] (bit N set means "match ICMP type N"), 0 otherwise.
  *
  * Only types 0..31 can be encoded in the single mask word that is tested
  * here.  The type is therefore bounded to 31 before shifting: a shift by 32
  * or more is undefined in C, and on CPUs that mask the shift count it would
  * make a high-numbered type alias a low-numbered bit of the mask.  Types
  * above 31 never match.
  */
 static __inline int
 icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
 {
 	int type = icmp->icmp_type;
 
 	if (type < 0 || type > ICMP_MAXTYPE || type >= 32)
 		return (0);
 	/* Unsigned shift so that bit 31 does not overflow a signed int. */
 	return ((cmd->d[0] & (1U << type)) != 0);
 }
 
 /* Bitmask of the ICMP types treated as queries; one bit per type number. */
 #define TT	( (1U << ICMP_ECHO) | (1U << ICMP_ROUTERSOLICIT) | \
     (1U << ICMP_TSTAMP) | (1U << ICMP_IREQ) | (1U << ICMP_MASKREQ) )
 
 /*
  * Return 1 if the ICMP header carries one of the query types collected in
  * TT (echo, router solicitation, timestamp, information request, address
  * mask request), 0 otherwise.
  *
  * TT is a 32-bit mask, so the type is bounded to 31 before it is used as a
  * shift count; shifting by 32 or more is undefined and could make a
  * high-numbered type alias one of the query bits.
  */
 static int
 is_icmp_query(struct icmphdr *icmp)
 {
 	int type = icmp->icmp_type;
 
 	if (type < 0 || type > ICMP_MAXTYPE || type >= 32)
 		return (0);
 	return ((TT & (1U << type)) != 0);
 }
 #undef TT
 
 /*
  * The following checks use two arrays of 8 or 16 bits to store the
  * bits that we want set or clear, respectively. They are in the
  * low and high half of cmd->arg1 or cmd->d[0].
  *
  * We scan options and store the bits we find set. We succeed if
  *
  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
  *
  * The code is sometimes optimized not to store additional variables.
  */
 
 /*
  * Check a set of observed flag bits against the two 8-bit masks packed in
  * cmd->arg1: the low byte lists bits that must be present, the high byte
  * lists bits that must be absent.  Returns 1 when both conditions hold,
  * 0 otherwise.
  */
 static int
 flags_match(ipfw_insn *cmd, u_int8_t bits)
 {
 	const u_int8_t absent = ~bits;	/* bits NOT seen in the packet */
 	const u_char must_be_set = cmd->arg1 & 0xff;
 	const u_char must_be_clear = (cmd->arg1 >> 8) & 0xff;
 
 	/* Any required bit that is absent is a mismatch. */
 	if ((must_be_set & absent) != 0)
 		return 0;
 	/* Every forbidden bit has to be among the absent ones. */
 	if ((must_be_clear & absent) != must_be_clear)
 		return 0;
 	return 1;
 }
 
 /*
  * Walk the IPv4 options that follow the fixed header, record which of
  * LSRR/SSRR/RR/TS are present, and compare the result with the masks in
  * cmd via flags_match().  A zero or over-long option length makes the
  * function return 0 immediately.
  */
 static int
 ipopts_match(struct ip *ip, ipfw_insn *cmd)
 {
 	u_char *optp = (u_char *)(ip + 1);
 	int remaining = (ip->ip_hl << 2) - sizeof (struct ip);
 	int seen = 0;
 
 	while (remaining > 0) {
 		int opt = optp[IPOPT_OPTVAL];
 		int optlen;
 
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt != IPOPT_NOP) {
 			optlen = optp[IPOPT_OLEN];
 			if (optlen <= 0 || optlen > remaining)
 				return 0; /* invalid or truncated */
 		} else {
 			/* NOP is a single byte with no length field. */
 			optlen = 1;
 		}
 
 		switch (opt) {
 		case IPOPT_LSRR:
 			seen |= IP_FW_IPOPT_LSRR;
 			break;
 		case IPOPT_SSRR:
 			seen |= IP_FW_IPOPT_SSRR;
 			break;
 		case IPOPT_RR:
 			seen |= IP_FW_IPOPT_RR;
 			break;
 		case IPOPT_TS:
 			seen |= IP_FW_IPOPT_TS;
 			break;
 		default:
 			break;
 		}
 
 		remaining -= optlen;
 		optp += optlen;
 	}
 	return (flags_match(cmd, seen));
 }
 
 /*
  * Scan the TCP options of a segment and return a bitmask of the
  * IP_FW_TCPOPT_* options found.  An option only counts when its length
  * field has the expected value.  When mss is not NULL and a well-formed
  * MSS option is present, its value is stored in *mss.
  * The walk follows the logic of tcp_dooptions(): it stops at EOL or at the
  * first malformed length.
  */
 static int
 tcpopts_parse(const struct tcphdr *tcp, uint16_t *mss)
 {
 	const u_char *optp = (const u_char *)(tcp + 1);
 	int left = (tcp->th_off << 2) - sizeof(struct tcphdr);
 	int found = 0;
 
 	while (left > 0) {
 		int kind = optp[0];
 		int optlen;
 
 		if (kind == TCPOPT_EOL)
 			break;
 		if (kind != TCPOPT_NOP) {
 			/* Need at least the kind and length bytes. */
 			if (left < 2)
 				break;
 			optlen = optp[1];
 			if (optlen < 2 || optlen > left)
 				break;
 		} else {
 			optlen = 1;
 		}
 
 		switch (kind) {
 		case TCPOPT_MAXSEG:
 			if (optlen == TCPOLEN_MAXSEG) {
 				found |= IP_FW_TCPOPT_MSS;
 				if (mss != NULL)
 					*mss = be16dec(optp + 2);
 			}
 			break;
 
 		case TCPOPT_WINDOW:
 			if (optlen == TCPOLEN_WINDOW)
 				found |= IP_FW_TCPOPT_WINDOW;
 			break;
 
 		case TCPOPT_SACK_PERMITTED:
 			if (optlen == TCPOLEN_SACK_PERMITTED)
 				found |= IP_FW_TCPOPT_SACK;
 			break;
 
 		case TCPOPT_SACK:
 			/* Payload must be a whole number of SACK blocks. */
 			if (optlen > 2 && (optlen - 2) % TCPOLEN_SACK == 0)
 				found |= IP_FW_TCPOPT_SACK;
 			break;
 
 		case TCPOPT_TIMESTAMP:
 			if (optlen == TCPOLEN_TIMESTAMP)
 				found |= IP_FW_TCPOPT_TS;
 			break;
 
 		default:
 			break;
 		}
 
 		left -= optlen;
 		optp += optlen;
 	}
 	return (found);
 }
 
 /*
  * Match the TCP options present in a segment against the set/clear masks
  * of cmd.  The MSS value is not needed here, so none is requested.
  */
 static int
 tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
 {
 	int present;
 
 	present = tcpopts_parse(tcp, NULL);
 	return (flags_match(cmd, present));
 }
 
 /*
  * Match an interface against an ipfw_insn_if instruction.
  *
  * With a non-empty cmd->name the match is by name: a leading '\1' requests
  * a table lookup keyed on the interface index (result passed back through
  * tablearg), otherwise the name is compared with fnmatch() when p.glob is
  * set or with strncmp() when it is not.  With an empty name the match is by
  * IPv4 address against the addresses configured on the interface.
  * Returns non-zero on match; a NULL ifp never matches.
  */
 static int
 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain,
     uint32_t *tablearg)
 {
 
 	/* No interface attached to this packet: nothing to match. */
 	if (ifp == NULL)
 		return (0);
 
 	if (cmd->name[0] == '\0') {
 		/* Match by IP address. */
 #if !defined(USERSPACE) && defined(__FreeBSD__)	/* and OSX too ? */
 		struct ifaddr *ifa;
 
 		NET_EPOCH_ASSERT();
 
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family == AF_INET &&
 			    cmd->p.ip.s_addr == ((struct sockaddr_in *)
 			    (ifa->ifa_addr))->sin_addr.s_addr)
 				return (1);	/* match */
 		}
 #endif /* __FreeBSD__ */
 		return (0);
 	}
 
 	/* Match by name; '\1' means "look the interface up in a table". */
 	if (cmd->name[0] == '\1')
 		return ipfw_lookup_table(chain, cmd->p.kidx, 0,
 		    &ifp->if_index, tablearg);
 
 	if (cmd->p.glob)
 		return (fnmatch(cmd->name, ifp->if_xname, 0) == 0);
 
 	return (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0);
 }
 
 /*
  * The verify_path function checks if a route to the src exists and
  * if it is reachable via ifp (when provided).
  * 
  * The 'verrevpath' option checks that the interface that an IP packet
  * arrives on is the same interface that traffic destined for the
  * packet's source address would be routed out of.
  * The 'versrcreach' option just checks that the source address is
  * reachable via any route (except default) in the routing table.
  * These two are a measure to block forged packets. This is also
  * commonly known as "anti-spoofing" or Unicast Reverse Path
  * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs
  * is purposely reminiscent of the Cisco IOS command,
  *
  *   ip verify unicast reverse-path
  *   ip verify unicast source reachable-via any
  *
  * which implements the same functionality. But note that the syntax
  * is misleading, and the check may be performed on all IP packets
  * whether unicast, multicast, or broadcast.
  */
 /*
  * Reverse-path check for an IPv4 source address in routing table `fib`.
  * Returns 1 if an acceptable route back to src exists, 0 otherwise.
  * Always returns 0 when built for userspace or for a non-FreeBSD target.
  */
 static int
 verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
 {
 #if defined(USERSPACE) || !defined(__FreeBSD__)
 	return 0;
 #else
 	struct nhop_object *nhop;
 
 	nhop = fib4_lookup(fib, src, 0, NHR_NONE, 0);
 	if (nhop == NULL)
 		return (0);
 
 	if (ifp != NULL) {
 		/*
 		 * An interface was given: the route must point back at it.
 		 * The address interface (nh_aifp) is compared so that
 		 * packets injected back by if_simloop() still pass: a route
 		 * via lo0 may exist for our own address, so routing
 		 * asymmetry has to be tolerated.
 		 */
 		if (nhop->nh_aifp != ifp)
 			return (0);
 	} else if ((nhop->nh_flags &
 	    (NHF_DEFAULT | NHF_REJECT | NHF_BLACKHOLE)) != 0) {
 		/*
 		 * No interface given: any route will do, except the default
 		 * route and blackhole/reject routes.
 		 */
 		return (0);
 	}
 
 	/* found valid route */
 	return 1;
 #endif /* __FreeBSD__ */
 }
 
 /*
  * Generate an SCTP packet containing an ABORT chunk. The verification tag
  * is given by vtag. The T-bit is set in the ABORT chunk if and only if
  * reflected is not 0.
  *
  * replyto is the packet being answered (may be NULL); it is only used for
  * MAC labelling.  id describes the flow of the offending packet: the reply
  * is built with its source and destination addresses and ports swapped.
  * Returns the new mbuf, or NULL if no mbuf could be allocated or
  * id->addr_type is neither 4 nor (with INET6) 6.  The packet is built but
  * not transmitted here.
  */
 
 static struct mbuf *
 ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag,
     int reflected)
 {
 	struct mbuf *m;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct sctphdr *sctp;
 	struct sctp_chunkhdr *chunk;
 	u_int16_t hlen, plen, tlen;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	/* First pass over addr_type: only determine the IP header length. */
 	switch (id->addr_type) {
 	case 4:
 		hlen = sizeof(struct ip);
 		break;
 #ifdef INET6
 	case 6:
 		hlen = sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	/* plen: SCTP common header plus one bare chunk header (the ABORT). */
 	plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
 	tlen = hlen + plen;
 	/* Leave room in front of the packet for a link-layer header. */
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, tlen);
 
 	/* Second pass: fill in the IP header, swapping source and destination. */
 	switch (id->addr_type) {
 	case 4:
 		ip = mtod(m, struct ip *);
 
 		ip->ip_v = 4;
 		ip->ip_hl = sizeof(struct ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		ip->ip_len = htons(tlen);
 		ip->ip_id = htons(0);
 		ip->ip_off = htons(0);
 		ip->ip_ttl = V_ip_defttl;
 		ip->ip_p = IPPROTO_SCTP;
 		ip->ip_sum = 0;
 		ip->ip_src.s_addr = htonl(id->dst_ip);
 		ip->ip_dst.s_addr = htonl(id->src_ip);
 
 		sctp = (struct sctphdr *)(ip + 1);
 		break;
 #ifdef INET6
 	case 6:
 		ip6 = mtod(m, struct ip6_hdr *);
 
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_plen = htons(plen);
 		ip6->ip6_nxt = IPPROTO_SCTP;
 		ip6->ip6_hlim = IPV6_DEFHLIM;
 		ip6->ip6_src = id->dst_ip6;
 		ip6->ip6_dst = id->src_ip6;
 
 		sctp = (struct sctphdr *)(ip6 + 1);
 		break;
 #endif
 	}
 
 	/* SCTP common header; ports are swapped like the addresses above. */
 	sctp->src_port = htons(id->dst_port);
 	sctp->dest_port = htons(id->src_port);
 	sctp->v_tag = htonl(vtag);
 	sctp->checksum = htonl(0);
 
 	/* The ABORT chunk carries no payload: its length is the header only. */
 	chunk = (struct sctp_chunkhdr *)(sctp + 1);
 	chunk->chunk_type = SCTP_ABORT_ASSOCIATION;
 	chunk->chunk_flags = 0;
 	if (reflected != 0) {
 		chunk->chunk_flags |= SCTP_HAD_NO_TCB;
 	}
 	chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr));
 
 	/* Checksum is computed last, with the checksum field still zero. */
 	sctp->checksum = sctp_calculate_cksum(m, hlen);
 
 	return (m);
 }
 
 /*
  * Generate a TCP packet, containing either a RST or a keepalive.
  * When flags & TH_RST, we are sending a RST packet, because of a
  * "reset" action matched the packet.
  * Otherwise we are sending a keepalive, and flags & TH_SYN selects the
  * direction: with TH_SYN (and no TH_RST) the packet goes from the flow's
  * source to its destination, otherwise source and destination are swapped.
  * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
  * so that MAC can label the reply appropriately.
  *
  * Returns the new mbuf, or NULL if no mbuf could be allocated or
  * id->addr_type is neither 4 nor (with INET6) 6.  The packet is built but
  * not transmitted here.
  */
 struct mbuf *
 ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
     u_int32_t ack, int flags)
 {
 	struct mbuf *m = NULL;		/* stupid compiler */
 	struct ip *h = NULL;		/* stupid compiler */
 #ifdef INET6
 	struct ip6_hdr *h6 = NULL;
 #endif
 	struct tcphdr *th = NULL;
 	int len, dir;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	/* First pass over addr_type: only determine the total packet length. */
 	switch (id->addr_type) {
 	case 4:
 		len = sizeof(struct ip) + sizeof(struct tcphdr);
 		break;
 #ifdef INET6
 	case 6:
 		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	/* dir != 0: send src -> dst; dir == 0: send dst -> src (a reply). */
 	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
 
 	/* Leave room in front of the packet for a link-layer header. */
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, len);
 
 	/*
 	 * Second pass: fill in only the IP fields that take part in the TCP
 	 * checksum (protocol, TCP length, addresses).  The rest of the header
 	 * is still zero from the bzero() above and is completed after the
 	 * checksum has been computed.
 	 */
 	switch (id->addr_type) {
 	case 4:
 		h = mtod(m, struct ip *);
 
 		/* prepare for checksum */
 		h->ip_p = IPPROTO_TCP;
 		h->ip_len = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h->ip_src.s_addr = htonl(id->src_ip);
 			h->ip_dst.s_addr = htonl(id->dst_ip);
 		} else {
 			h->ip_src.s_addr = htonl(id->dst_ip);
 			h->ip_dst.s_addr = htonl(id->src_ip);
 		}
 
 		th = (struct tcphdr *)(h + 1);
 		break;
 #ifdef INET6
 	case 6:
 		h6 = mtod(m, struct ip6_hdr *);
 
 		/* prepare for checksum */
 		h6->ip6_nxt = IPPROTO_TCP;
 		h6->ip6_plen = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h6->ip6_src = id->src_ip6;
 			h6->ip6_dst = id->dst_ip6;
 		} else {
 			h6->ip6_src = id->dst_ip6;
 			h6->ip6_dst = id->src_ip6;
 		}
 
 		th = (struct tcphdr *)(h6 + 1);
 		break;
 #endif
 	}
 
 	/* Ports follow the same direction as the addresses. */
 	if (dir) {
 		th->th_sport = htons(id->src_port);
 		th->th_dport = htons(id->dst_port);
 	} else {
 		th->th_sport = htons(id->dst_port);
 		th->th_dport = htons(id->src_port);
 	}
 	th->th_off = sizeof(struct tcphdr) >> 2;
 
 	if (flags & TH_RST) {
 		if (flags & TH_ACK) {
 			/*
 			 * The offending segment carried an ACK: send a bare
 			 * RST whose sequence number is that ACK value.
 			 */
 			th->th_seq = htonl(ack);
 			th->th_flags = TH_RST;
 		} else {
 			/*
 			 * No ACK in the offending segment: send RST|ACK
 			 * acknowledging its sequence number, plus one when
 			 * it was a SYN.
 			 */
 			if (flags & TH_SYN)
 				seq++;
 			th->th_ack = htonl(seq);
 			th->th_flags = TH_RST | TH_ACK;
 		}
 	} else {
 		/*
 		 * Keepalive - use caller provided sequence numbers
 		 */
 		th->th_seq = htonl(seq);
 		th->th_ack = htonl(ack);
 		th->th_flags = TH_ACK;
 	}
 
 	/* Compute the TCP checksum, then complete the IP header. */
 	switch (id->addr_type) {
 	case 4:
 		th->th_sum = in_cksum(m, len);
 
 		/* finish the ip header */
 		h->ip_v = 4;
 		h->ip_hl = sizeof(*h) >> 2;
 		h->ip_tos = IPTOS_LOWDELAY;
 		h->ip_off = htons(0);
 		h->ip_len = htons(len);
 		h->ip_ttl = V_ip_defttl;
 		h->ip_sum = 0;
 		break;
 #ifdef INET6
 	case 6:
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
 		    sizeof(struct tcphdr));
 
 		/* finish the ip6 header */
 		h6->ip6_vfc |= IPV6_VERSION;
 		h6->ip6_hlim = IPV6_DEFHLIM;
 		break;
 #endif
 	}
 
 	return (m);
 }
 
 #ifdef INET6
 /*
  * ipv6 specific rules here...
  */
 /*
  * Return 1 if ICMPv6 type `type` is selected in the bitmap held in cmd->d[]
  * (32 types per word: word type / 32, bit type % 32), 0 otherwise.
  *
  * Negative values are rejected so that they can never index before d[],
  * and the bit is built with an unsigned shift because 1 << 31 overflows a
  * signed int.
  */
 static __inline int
 icmp6type_match(int type, ipfw_insn_u32 *cmd)
 {
 	if (type < 0 || type > ICMP6_MAXTYPE)
 		return (0);
 	return ((cmd->d[type / 32] & (1U << (type % 32))) != 0);
 }
 
 /*
  * Return 1 if curr_flow equals one of the flow values stored in cmd->d[],
  * 0 otherwise.
  * NOTE(review): the bound is inclusive, so entries d[0] .. d[arg1]
  * (arg1 + 1 values) are examined -- confirm this matches how arg1 is
  * filled in by whoever builds the instruction.
  */
 static int
 flow6id_match(int curr_flow, ipfw_insn_u32 *cmd)
 {
 	int idx = 0;
 
 	while (idx <= cmd->o.arg1) {
 		if (curr_flow == cmd->d[idx])
 			return 1;
 		++idx;
 	}
 	return 0;
 }
 
 /*
  * support for IP6_*_ME opcodes
  *
  * Mask applied when comparing link-local addresses in ipfw_localip6():
  * bytes 2 and 3 are zero, so those 16 bits are ignored by the comparison
  * (presumably because they may hold an embedded scope/zone id -- confirm).
  */
 static const struct in6_addr lla_mask = {{{
 	0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 }}};
 
 /*
  * Decide whether an IPv6 address belongs to this host.
  * Multicast addresses never do.  Non-link-local addresses are delegated to
  * in6_localip().  Link-local addresses are compared, under the address
  * list read lock, with every configured link-local address using lla_mask.
  * Returns non-zero if the address is local.
  */
 static int
 ipfw_localip6(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ia;
 	int found;
 
 	if (IN6_IS_ADDR_MULTICAST(in6))
 		return (0);
 
 	if (!IN6_IS_ADDR_LINKLOCAL(in6))
 		return (in6_localip(in6));
 
 	found = 0;
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_STAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) {
 		/* Only link-local entries are candidates. */
 		if (IN6_IS_ADDR_LINKLOCAL(&ia->ia_addr.sin6_addr) &&
 		    IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
 		    in6, &lla_mask)) {
 			found = 1;
 			break;
 		}
 	}
 	/* Single unlock point for both the hit and the miss case. */
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (found);
 }
 
 /*
  * Reverse-path check for an IPv6 source address in routing table `fib`.
  * Link-local scoped sources are always accepted.  Returns 1 if an
  * acceptable route back to src exists, 0 otherwise.
  */
 static int
 verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib)
 {
 	struct nhop_object *nhop;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(src))
 		return (1);
 
 	nhop = fib6_lookup(fib, src, 0, NHR_NONE, 0);
 	if (nhop == NULL)
 		return (0);
 
 	if (ifp != NULL) {
 		/* An interface was given: the route must point back at it. */
 		if (nhop->nh_aifp != ifp)
 			return (0);
 	} else if ((nhop->nh_flags &
 	    (NHF_DEFAULT | NHF_REJECT | NHF_BLACKHOLE)) != 0) {
 		/*
 		 * No interface given: any route will do, except the default
 		 * route and blackhole/reject routes.
 		 */
 		return (0);
 	}
 
 	/* found valid route */
 	return 1;
 }
 
 /*
  * Return 1 if icmp6_type is one of the ICMPv6 query types listed below,
  * 0 otherwise.  Types above ICMP6_MAXTYPE are never queries.
  */
 static int
 is_icmp6_query(int icmp6_type)
 {
 	int query;
 
 	if (icmp6_type > ICMP6_MAXTYPE)
 		return (0);
 
 	/* Kept as a chain of comparisons: the constants need not be distinct. */
 	query = (icmp6_type == ICMP6_ECHO_REQUEST ||
 	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
 	    icmp6_type == ICMP6_WRUREQUEST ||
 	    icmp6_type == ICMP6_FQDN_QUERY ||
 	    icmp6_type == ICMP6_NI_QUERY);
 
 	return (query ? 1 : 0);
 }
 
 /*
  * Translate an ICMPv4 "destination unreachable" code into the ICMPv6
  * destination-unreachable code to use instead (RFC 7915 p4.2).
  */
 static int
 map_icmp_unreach(int code)
 {
 
 	/* Network/host style failures become "no route to destination". */
 	if (code == ICMP_UNREACH_NET ||
 	    code == ICMP_UNREACH_HOST ||
 	    code == ICMP_UNREACH_SRCFAIL ||
 	    code == ICMP_UNREACH_NET_UNKNOWN ||
 	    code == ICMP_UNREACH_HOST_UNKNOWN ||
 	    code == ICMP_UNREACH_TOSNET ||
 	    code == ICMP_UNREACH_TOSHOST)
 		return (ICMP6_DST_UNREACH_NOROUTE);
 
 	if (code == ICMP_UNREACH_PORT)
 		return (ICMP6_DST_UNREACH_NOPORT);
 
 	/*
 	 * Map the rest of codes into admit prohibited.
 	 * XXX: unreach proto should be mapped into ICMPv6
 	 * parameter problem, but we use only unreach type.
 	 */
 	return (ICMP6_DST_UNREACH_ADMIN);
 }
 
 /*
  * Reject an IPv6 packet and consume it: args->m is either freed or handed
  * to icmp6_error(), and is set to NULL before returning.
  *
  * code selects the reaction: ICMP6_UNREACH_RST answers a TCP segment with a
  * RST, ICMP6_UNREACH_ABORT answers an SCTP packet with an ABORT, any other
  * code is sent as an ICMPv6 destination-unreachable with that code.  When
  * RST/ABORT is requested for a packet of a different protocol the packet is
  * silently dropped.  hlen is the offset from ip6 to the transport header.
  */
 static void
 send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
 {
 	struct mbuf *m;
 
 	m = args->m;
 	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *tcp;
 		tcp = (struct tcphdr *)((char *)ip6 + hlen);
 
 		/* Never answer a RST with another RST. */
 		if ((tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m0;
 			m0 = ipfw_send_pkt(args->m, &(args->f_id),
 			    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 			    tcp->th_flags | TH_RST);
 			if (m0 != NULL)
 				ip6_output(m0, NULL, NULL, 0, NULL, NULL,
 				    NULL);
 		}
 		FREE_PKT(m);
 	} else if (code == ICMP6_UNREACH_ABORT &&
 	    args->f_id.proto == IPPROTO_SCTP) {
 		struct mbuf *m0;
 		struct sctphdr *sctp;
 		u_int32_t v_tag;
 		int reflected;
 
 		sctp = (struct sctphdr *)((char *)ip6 + hlen);
 		/* Default: reflect the packet's own v-tag (T-bit set). */
 		reflected = 1;
 		v_tag = ntohl(sctp->v_tag);
 		/* Investigate the first chunk header if available */
 		if (m->m_len >= hlen + sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr)) {
 			struct sctp_chunkhdr *chunk;
 
 			chunk = (struct sctp_chunkhdr *)(sctp + 1);
 			switch (chunk->chunk_type) {
 			case SCTP_INITIATION:
 				/*
 				 * Packets containing an INIT chunk MUST have
 				 * a zero v-tag.
 				 */
 				if (v_tag != 0) {
 					/* v_tag == 0 below means "no reply". */
 					v_tag = 0;
 					break;
 				}
 				/* INIT chunk MUST NOT be bundled */
 				if (m->m_pkthdr.len >
 				    hlen + sizeof(struct sctphdr) +
 				    ntohs(chunk->chunk_length) + 3) {
 					break;
 				}
 				/* Use the initiate tag if available */
 				if ((m->m_len >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))) {
 					struct sctp_init *init;
 
 					init = (struct sctp_init *)(chunk + 1);
 					v_tag = ntohl(init->initiate_tag);
 					reflected = 0;
 				}
 				break;
 			case SCTP_ABORT_ASSOCIATION:
 				/*
 				 * If the packet contains an ABORT chunk, don't
 				 * reply.
 				 * XXX: We should search through all chunks,
 				 * but do not do that to avoid attacks.
 				 */
 				v_tag = 0;
 				break;
 			}
 		}
 		/* A zero v-tag at this point suppresses the ABORT entirely. */
 		if (v_tag == 0) {
 			m0 = NULL;
 		} else {
 			m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag,
 			    reflected);
 		}
 		if (m0 != NULL)
 			ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
 		FREE_PKT(m);
 	} else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) {
 		/* Send an ICMPv6 unreach. */
 #if 0
 		/*
 		 * Unlike above, the mbufs need to line up with the ip6 hdr,
 		 * as the contents are read. We need to m_adj() the
 		 * needed amount.
 		 * The mbuf will however be thrown away so we can adjust it.
 		 * Remember we did an m_pullup on it already so we
 		 * can make some assumptions about contiguousness.
 		 */
 		if (args->L3offset)
 			m_adj(m, args->L3offset);
 #endif
 		/* m is passed on to icmp6_error() and not freed here. */
 		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
 	} else
 		/* RST/ABORT requested for a non-matching protocol: drop. */
 		FREE_PKT(m);
 
 	args->m = NULL;
 }
 
 #endif /* INET6 */
 
 /*
  * sends a reject message, consuming the mbuf passed as an argument.
  *
  * The reaction is selected by cmd->arg1: ICMP_REJECT_RST answers a TCP
  * segment with a RST, ICMP_REJECT_ABORT answers an SCTP packet with an
  * ABORT, any other value is sent as an ICMP unreachable with that code.
  * When RST/ABORT is requested for a packet of a different protocol the
  * packet is silently dropped.  iplen is the length used to bound the SCTP
  * chunk inspection; ip points at the IPv4 header.  args->m is NULL on
  * return in every case.
  */
 static void
 send_reject(struct ip_fw_args *args, const ipfw_insn *cmd, int iplen,
     struct ip *ip)
 {
 	int code, mtu;
 
 	code = cmd->arg1;
 	/*
 	 * For "need fragmentation" an MTU is taken from the instruction,
 	 * but only when the instruction is large enough to carry one.
 	 */
 	if (code == ICMP_UNREACH_NEEDFRAG &&
 	    cmd->len == F_INSN_SIZE(ipfw_insn_u16))
 		mtu = ((const ipfw_insn_u16 *)cmd)->ports[0];
 	else
 		mtu = 0;
 
 #if 0
 	/* XXX When ip is not guaranteed to be at mtod() we will
 	 * need to account for this */
 	 * The mbuf will however be thrown away so we can adjust it.
 	 * Remember we did an m_pullup on it already so we
 	 * can make some assumptions about contiguousness.
 	 */
 	if (args->L3offset)
 		m_adj(m, args->L3offset);
 #endif
 	if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) {
 		/* Send an ICMP unreach */
 		/* args->m is passed on to icmp_error() and not freed here. */
 		icmp_error(args->m, ICMP_UNREACH, code, 0L, mtu);
 	} else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *const tcp =
 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
 		/* Never answer a RST with another RST. */
 		if ( (tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m;
 			m = ipfw_send_pkt(args->m, &(args->f_id),
 				ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 				tcp->th_flags | TH_RST);
 			if (m != NULL)
 				ip_output(m, NULL, NULL, 0, NULL, NULL);
 		}
 		FREE_PKT(args->m);
 	} else if (code == ICMP_REJECT_ABORT &&
 	    args->f_id.proto == IPPROTO_SCTP) {
 		struct mbuf *m;
 		struct sctphdr *sctp;
 		struct sctp_chunkhdr *chunk;
 		struct sctp_init *init;
 		u_int32_t v_tag;
 		int reflected;
 
 		sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *));
 		/* Default: reflect the packet's own v-tag (T-bit set). */
 		reflected = 1;
 		v_tag = ntohl(sctp->v_tag);
 		if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr)) {
 			/* Look at the first chunk header if available */
 			chunk = (struct sctp_chunkhdr *)(sctp + 1);
 			switch (chunk->chunk_type) {
 			case SCTP_INITIATION:
 				/*
 				 * Packets containing an INIT chunk MUST have
 				 * a zero v-tag.
 				 */
 				if (v_tag != 0) {
 					/* v_tag == 0 below means "no reply". */
 					v_tag = 0;
 					break;
 				}
 				/* INIT chunk MUST NOT be bundled */
 				if (iplen >
 				    (ip->ip_hl << 2) + sizeof(struct sctphdr) +
 				    ntohs(chunk->chunk_length) + 3) {
 					break;
 				}
 				/* Use the initiate tag if available */
 				if ((iplen >= (ip->ip_hl << 2) +
 				    sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))) {
 					init = (struct sctp_init *)(chunk + 1);
 					v_tag = ntohl(init->initiate_tag);
 					reflected = 0;
 				}
 				break;
 			case SCTP_ABORT_ASSOCIATION:
 				/*
 				 * If the packet contains an ABORT chunk, don't
 				 * reply.
 				 * XXX: We should search through all chunks,
 				 * but do not do that to avoid attacks.
 				 */
 				v_tag = 0;
 				break;
 			}
 		}
 		/* A zero v-tag at this point suppresses the ABORT entirely. */
 		if (v_tag == 0) {
 			m = NULL;
 		} else {
 			m = ipfw_send_abort(args->m, &(args->f_id), v_tag,
 			    reflected);
 		}
 		if (m != NULL)
 			ip_output(m, NULL, NULL, 0, NULL, NULL);
 		FREE_PKT(args->m);
 	} else
 		/* RST/ABORT requested for a non-matching protocol: drop. */
 		FREE_PKT(args->m);
 	args->m = NULL;
 }
 
 /*
  * Support for uid/gid/jail lookup. These tests are expensive
  * (because we may need to look into the list of active sockets)
  * so we cache the results. ugid_lookupp is 0 if we have not
  * yet done a lookup, 1 if we succeeded, and -1 if we tried
  * and failed. The function always returns the match value.
  * We could actually spare the variable and use *uc, setting
  * it to '(void *)check_uidgid if we have no info, NULL if
  * we tried and failed, or any other value if successful.
  *
  * insn selects the test (O_UID, O_GID or O_JAIL) and carries the id to
  * compare against in d[0].  On a successful lookup *uc receives a held
  * reference (crhold()) to the owning credential; this function does not
  * release it.  Only TCP, UDP and UDP-Lite flows can match.
  */
 static int
 check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
     struct ucred **uc)
 {
 #if defined(USERSPACE)
 	return 0;	// not supported in userspace
 #else
 #ifndef __FreeBSD__
 	/* XXX */
 	return cred_check(insn, proto, oif,
 	    dst_ip, dst_port, src_ip, src_port,
 	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
 #else  /* FreeBSD */
 	struct in_addr src_ip, dst_ip;
 	struct inpcbinfo *pi;
 	struct ipfw_flow_id *id;
 	struct inpcb *pcb, *inp;
 	int lookupflags;
 	int match;
 
 	id = &args->f_id;
 	inp = args->inp;
 
 	/*
 	 * Check to see if the UDP or TCP stack supplied us with
 	 * the PCB. If so, rather then holding a lock and looking
 	 * up the PCB, we can use the one that was supplied.
 	 */
 	if (inp && *ugid_lookupp == 0) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_socket != NULL) {
 			*uc = crhold(inp->inp_cred);
 			*ugid_lookupp = 1;
 		} else
 			*ugid_lookupp = -1;
 	}
 	/*
 	 * If we have already been here and the packet has no
 	 * PCB entry associated with it, then we can safely
 	 * assume that this is a no match.
 	 */
 	if (*ugid_lookupp == -1)
 		return (0);
 	/* Pick the PCB table for the protocol; other protocols never match. */
 	if (id->proto == IPPROTO_TCP) {
 		lookupflags = 0;
 		pi = &V_tcbinfo;
 	} else if (id->proto == IPPROTO_UDP) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_udbinfo;
 	} else if (id->proto == IPPROTO_UDPLITE) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_ulitecbinfo;
 	} else
 		return 0;
 	/* Ask the lookup to return the PCB read-locked. */
 	lookupflags |= INPLOOKUP_RLOCKPCB;
 	match = 0;
 	if (*ugid_lookupp == 0) {
 		/*
 		 * No cached result yet: look the PCB up.  For inbound
 		 * packets the flow's source/destination are passed as they
 		 * are; for outbound packets they are swapped and the
 		 * interface is supplied as well.
 		 */
 		if (id->addr_type == 6) {
 #ifdef INET6
 			if (args->flags & IPFW_ARGS_IN)
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->src_ip6, htons(id->src_port),
 				    &id->dst_ip6, htons(id->dst_port),
 				    lookupflags, NULL, args->m);
 			else
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->dst_ip6, htons(id->dst_port),
 				    &id->src_ip6, htons(id->src_port),
 				    lookupflags, args->ifp, args->m);
 #else
 			*ugid_lookupp = -1;
 			return (0);
 #endif
 		} else {
 			src_ip.s_addr = htonl(id->src_ip);
 			dst_ip.s_addr = htonl(id->dst_ip);
 			if (args->flags & IPFW_ARGS_IN)
 				pcb = in_pcblookup_mbuf(pi,
 				    src_ip, htons(id->src_port),
 				    dst_ip, htons(id->dst_port),
 				    lookupflags, NULL, args->m);
 			else
 				pcb = in_pcblookup_mbuf(pi,
 				    dst_ip, htons(id->dst_port),
 				    src_ip, htons(id->src_port),
 				    lookupflags, args->ifp, args->m);
 		}
 		if (pcb != NULL) {
 			/* Take the credential reference, then drop the lock. */
 			INP_RLOCK_ASSERT(pcb);
 			*uc = crhold(pcb->inp_cred);
 			*ugid_lookupp = 1;
 			INP_RUNLOCK(pcb);
 		}
 		if (*ugid_lookupp == 0) {
 			/*
 			 * We tried and failed, set the variable to -1
 			 * so we will not try again on this packet.
 			 */
 			*ugid_lookupp = -1;
 			return (0);
 		}
 	}
 	/* *uc is valid here; apply the test requested by the opcode. */
 	if (insn->o.opcode == O_UID)
 		match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
 	else if (insn->o.opcode == O_GID)
 		match = groupmember((gid_t)insn->d[0], *uc);
 	else if (insn->o.opcode == O_JAIL)
 		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
 	return (match);
 #endif /* __FreeBSD__ */
 #endif /* not supported in userspace */
 }
 
 /*
  * Helper function to set args with info on the rule after the matching
  * one. slot is precise, whereas we guess rule_id as they are
  * assigned sequentially.
  */
 static inline void
 set_match(struct ip_fw_args *args, int slot,
 	struct ip_fw_chain *chain)
 {
 	args->rule.chain_id = chain->id;
 	args->rule.slot = slot + 1; /* we use 0 as a marker */
 	args->rule.rule_id = 1 + chain->map[slot]->id;
 	args->rule.rulenum = chain->map[slot]->rulenum;
 	args->flags |= IPFW_ARGS_REF;
 }
 
 /*
  * Translate the rule number targeted by a jump into a position in the
  * rule map.  Unless jump_backwards is set, a target that is not past
  * the current rule f is bumped to f->rulenum + 1.
  *
  * NOTE(review): tablearg is not referenced by name below; presumably
  * IP_FW_ARG_TABLEARG() expands to an expression that uses it -- confirm
  * before renaming or removing the parameter.
  */
 static int
 jump_lookup_pos(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int target;
 
 	target = IP_FW_ARG_TABLEARG(chain, num, skipto);
 	/* Forward-only jumps must land after the current rule. */
 	if (!jump_backwards && target <= f->rulenum)
 		target = f->rulenum + 1;
 
 #ifdef LINEAR_SKIPTO
 	return (chain->idxmap[target]);
 #else
 	/* Fall back to searching the chain when there is no index map. */
 	if (chain->idxmap == NULL)
 		return (ipfw_find_rule(chain, target, 0));
 	return (chain->idxmap[target]);
 #endif /* LINEAR_SKIPTO */
 }
 
 
 #ifndef LINEAR_SKIPTO
 /*
  * Helper function to enable cached rule lookups using
  * cache.id and cache.pos fields in ipfw rule.
  */
 static int
 jump_cached(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int f_pos;
 
 	/* Can't use cache with IP_FW_TARG */
 	if (num == IP_FW_TARG)
 		return jump_lookup_pos(chain, f, num, tablearg, jump_backwards);
 
 	/*
 	 * If possible use cached f_pos (in f->cache.pos),
 	 * whose version is written in f->cache.id (horrible hacks
 	 * to avoid changing the ABI).
 	 *
 	 * Multiple threads can execute the same rule simultaneously,
 	 * we need to ensure that cache.pos is updated before cache.id.
 	 */
 
 #ifdef __LP64__
 	struct ip_fw_jump_cache cache;
 
 	cache.raw_value = f->cache.raw_value;
 	if (cache.id == chain->id)
 		return (cache.pos);
 
 	f_pos = jump_lookup_pos(chain, f, num, tablearg, jump_backwards);
 
 	cache.pos = f_pos;
 	cache.id = chain->id;
 	f->cache.raw_value = cache.raw_value;
 #else
 	if (f->cache.id == chain->id) {
 		/* Load pos after id */
 		atomic_thread_fence_acq();
 		return (f->cache.pos);
 	}
 
 	f_pos = jump_lookup_pos(chain, f, num, tablearg, jump_backwards);
 
 	f->cache.pos = f_pos;
 	/* Store id after pos */
 	atomic_thread_fence_rel();
 	f->cache.id = chain->id;
 #endif /* !__LP64__ */
 	return (f_pos);
 }
 #endif /* !LINEAR_SKIPTO */
 
 #define	TARG(k, f)	IP_FW_ARG_TABLEARG(chain, k, f)
 /*
  * The main check routine for the firewall.
  *
  * All arguments are in args so we can modify them and return them
  * back to the caller.
  *
  * Parameters:
  *
  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
  *		Starts with the IP header.
  *	args->L3offset	Number of bytes bypassed if we came from L2.
  *			e.g. often sizeof(eh)  ** NOTYET **
  *	args->ifp	Incoming or outgoing interface.
  *	args->divert_rule (in/out)
  *		Skip up to the first rule past this rule number;
  *		upon return, non-zero port number for divert or tee.
  *
  *	args->rule	Pointer to the last matching rule (in/out)
  *	args->next_hop	Socket we are forwarding to (out).
  *	args->next_hop6	IPv6 next hop we are forwarding to (out).
  *	args->f_id	Addresses grabbed from the packet (out)
  * 	args->rule.info	a cookie depending on rule action
  *
  * Return value:
  *
  *	IP_FW_PASS	the packet must be accepted
  *	IP_FW_DENY	the packet must be dropped
  *	IP_FW_DIVERT	divert packet, port in m_tag
  *	IP_FW_TEE	tee packet, port in m_tag
  *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
  *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
  *		args->rule contains the matching rule,
  *		args->rule.info has additional information.
  *
  */
 int
 ipfw_chk(struct ip_fw_args *args)
 {
 
 	/*
 	 * Local variables holding state while processing a packet:
 	 *
 	 * IMPORTANT NOTE: to speed up the processing of rules, there
 	 * are some assumption on the values of the variables, which
 	 * are documented here. Should you change them, please check
 	 * the implementation of the various instructions to make sure
 	 * that they still work.
 	 *
 	 * m | args->m	Pointer to the mbuf, as received from the caller.
 	 *	It may change if ipfw_chk() does an m_pullup, or if it
 	 *	consumes the packet because it calls send_reject().
 	 *	XXX This has to change, so that ipfw_chk() never modifies
 	 *	or consumes the buffer.
 	 *	OR
 	 * args->mem	Pointer to contiguous memory chunk.
 	 * ip	Is the beginning of the ip(4 or 6) header.
 	 * eh	Ethernet header in case if input is Layer2.
 	 */
 	struct mbuf *m;
 	struct ip *ip;
 	struct ether_header *eh;
 
 	/*
 	 * For rules which contain uid/gid or jail constraints, cache
 	 * a copy of the users credentials after the pcb lookup has been
 	 * executed. This will speed up the processing of rules with
 	 * these types of constraints, as well as decrease contention
 	 * on pcb related locks.
 	 */
 #ifndef __FreeBSD__
 	struct bsd_ucred ucred_cache;
 #else
 	struct ucred *ucred_cache = NULL;
 #endif
 	int ucred_lookup = 0;
 	int f_pos = 0;		/* index of current rule in the array */
 	int retval = 0;
 	struct ifnet *oif, *iif;
 
 	/*
 	 * hlen	The length of the IP header.
 	 */
 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
 
 	/*
 	 * offset	The offset of a fragment. offset != 0 means that
 	 *	we have a fragment at this offset of an IPv4 packet.
 	 *	offset == 0 means that (if this is an IPv4 packet)
 	 *	this is the first or only fragment.
 	 *	For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header
 	 *	or there is a single packet fragment (fragment header added
 	 *	without needed).  We will treat a single packet fragment as if
 	 *	there was no fragment header (or log/block depending on the
 	 *	V_fw_permit_single_frag6 sysctl setting).
 	 */
 	u_short offset = 0;
 	u_short ip6f_mf = 0;
 
 	/*
 	 * Local copies of addresses. They are only valid if we have
 	 * an IP packet.
 	 *
 	 * proto	The protocol. Set to 0 for non-ip packets,
 	 *	or to the protocol read from the packet otherwise.
 	 *	proto != 0 means that we have an IPv4 packet.
 	 *
 	 * src_port, dst_port	port numbers, in HOST format. Only
 	 *	valid for TCP and UDP packets.
 	 *
 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
 	 *	Only valid for IPv4 packets.
 	 */
 	uint8_t proto;
 	uint16_t src_port, dst_port;		/* NOTE: host format	*/
 	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
 	int iplen = 0;
 	int pktlen;
 
 	struct ipfw_dyn_info dyn_info;
 	struct ip_fw *q = NULL;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 
 	/*
 	 * We store in ulp a pointer to the upper layer protocol header.
 	 * In the ipv4 case this is easy to determine from the header,
 	 * but for ipv6 we might have some additional headers in the middle.
 	 * ulp is NULL if not found.
 	 */
 	void *ulp = NULL;		/* upper layer protocol pointer. */
 
 	/* XXX ipv6 variables */
 	int is_ipv6 = 0;
 #ifdef INET6
 	uint8_t	icmp6_type = 0;
 #endif
 	uint16_t ext_hd = 0;	/* bits vector for extension header filtering */
 	/* end of ipv6 variables */
 
 	int is_ipv4 = 0;
 
 	int done = 0;		/* flag to exit the outer loop */
 	IPFW_RLOCK_TRACKER;
 	bool mem;
 
 	if ((mem = (args->flags & IPFW_ARGS_LENMASK))) {
 		if (args->flags & IPFW_ARGS_ETHER) {
 			eh = (struct ether_header *)args->mem;
 			if (eh->ether_type == htons(ETHERTYPE_VLAN))
 				ip = (struct ip *)
 				    ((struct ether_vlan_header *)eh + 1);
 			else
 				ip = (struct ip *)(eh + 1);
 		} else {
 			eh = NULL;
 			ip = (struct ip *)args->mem;
 		}
 		pktlen = IPFW_ARGS_LENGTH(args->flags);
 		args->f_id.fib = args->ifp->if_fib;	/* best guess */
 	} else {
 		m = args->m;
 		if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
 			return (IP_FW_PASS);	/* accept */
 		if (args->flags & IPFW_ARGS_ETHER) {
 	                /* We need some amount of data to be contiguous. */
 			if (m->m_len < min(m->m_pkthdr.len, max_protohdr) &&
 			    (args->m = m = m_pullup(m, min(m->m_pkthdr.len,
 			    max_protohdr))) == NULL)
 				goto pullup_failed;
 			eh = mtod(m, struct ether_header *);
 			ip = (struct ip *)(eh + 1);
 		} else {
 			eh = NULL;
 			ip = mtod(m, struct ip *);
 		}
 		pktlen = m->m_pkthdr.len;
 		args->f_id.fib = M_GETFIB(m); /* mbuf not altered */
 	}
 
 	dst_ip.s_addr = 0;		/* make sure it is initialized */
 	src_ip.s_addr = 0;		/* make sure it is initialized */
 	src_port = dst_port = 0;
 
 	DYN_INFO_INIT(&dyn_info);
 /*
  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
  * pointer might become stale after other pullups (but we never use it
  * this way).
  */
 #define	PULLUP_TO(_len, p, T)	PULLUP_LEN(_len, p, sizeof(T))
 #define	EHLEN	(eh != NULL ? ((char *)ip - (char *)eh) : 0)
 #define	_PULLUP_LOCKED(_len, p, T, unlock)			\
 do {								\
 	int x = (_len) + T + EHLEN;				\
 	if (mem) {						\
 		if (__predict_false(pktlen < x)) {		\
 			unlock;					\
 			goto pullup_failed;			\
 		}						\
 		p = (char *)args->mem + (_len) + EHLEN;		\
 	} else {						\
 		if (__predict_false((m)->m_len < x)) {		\
 			args->m = m = m_pullup(m, x);		\
 			if (m == NULL) {			\
 				unlock;				\
 				goto pullup_failed;		\
 			}					\
 		}						\
 		p = mtod(m, char *) + (_len) + EHLEN;		\
 	}							\
 } while (0)
 
 #define	PULLUP_LEN(_len, p, T)	_PULLUP_LOCKED(_len, p, T, )
 #define	PULLUP_LEN_LOCKED(_len, p, T)	\
     _PULLUP_LOCKED(_len, p, T, IPFW_PF_RUNLOCK(chain));	\
     UPDATE_POINTERS()
 /*
  * In case pointers got stale after pullups, update them.
  */
 #define	UPDATE_POINTERS()					\
 do {								\
 	if (!mem) {						\
 		if (eh != NULL) {				\
 			eh = mtod(m, struct ether_header *);	\
 			ip = (struct ip *)(eh + 1);		\
 		} else						\
 			ip = mtod(m, struct ip *);		\
 		args->m = m;					\
 	}							\
 } while (0)
 
 	/* Identify IP packets and fill up variables. */
 	if (pktlen >= sizeof(struct ip6_hdr) &&
 	    (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) &&
 	    ip->ip_v == 6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
 
 		is_ipv6 = 1;
 		args->flags |= IPFW_ARGS_IP6;
 		hlen = sizeof(struct ip6_hdr);
 		proto = ip6->ip6_nxt;
 		/* Search extension headers to find upper layer protocols */
 		while (ulp == NULL && offset == 0) {
 			switch (proto) {
 			case IPPROTO_ICMPV6:
 				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
 #ifdef INET6
 				icmp6_type = ICMP6(ulp)->icmp6_type;
 #endif
 				break;
 
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				if (pktlen >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr) +
 					    offsetof(struct sctp_init, a_rwnd));
 				else if (pktlen >= hlen + sizeof(struct sctphdr))
 					PULLUP_LEN(hlen, ulp, pktlen - hlen);
 				else
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr));
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 			case IPPROTO_UDPLITE:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_HOPOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_HOPOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ROUTING:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
 				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
 				case 0:
 					ext_hd |= EXT_RTHDR0;
 					break;
 				case 2:
 					ext_hd |= EXT_RTHDR2;
 					break;
 				default:
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Unknown "
 						    "Routing Header type(%d)\n",
 						    ((struct ip6_rthdr *)
 						    ulp)->ip6r_type);
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				ext_hd |= EXT_ROUTING;
 				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
 				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_FRAGMENT:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_frag);
 				ext_hd |= EXT_FRAGMENT;
 				hlen += sizeof (struct ip6_frag);
 				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
 				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_OFF_MASK;
 				ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_MORE_FRAG;
 				if (V_fw_permit_single_frag6 == 0 &&
 				    offset == 0 && ip6f_mf == 0) {
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Invalid "
 						    "Fragment Header\n");
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				args->f_id.extra =
 				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
 				ulp = NULL;
 				break;
 
 			case IPPROTO_DSTOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_DSTOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_AH:	/* RFC 2402 */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				ext_hd |= EXT_AH;
 				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
 				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ESP:	/* RFC 2406 */
 				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
 				/* Anything past Seq# is variable length and
 				 * data past this ext. header is encrypted. */
 				ext_hd |= EXT_ESP;
 				break;
 
 			case IPPROTO_NONE:	/* RFC 2460 */
 				/*
 				 * Packet ends here, and IPv6 header has
 				 * already been pulled up. If ip6e_len!=0
 				 * then octets must be ignored.
 				 */
 				ulp = ip; /* non-NULL to get out of loop. */
 				break;
 
 			case IPPROTO_OSPFIGP:
 				/* XXX OSPF header check? */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 
 			case IPPROTO_PIM:
 				/* XXX PIM header check? */
 				PULLUP_TO(hlen, ulp, struct pim);
 				break;
 
 			case IPPROTO_GRE:	/* RFC 1701 */
 				/* XXX GRE header check? */
 				PULLUP_TO(hlen, ulp, struct grehdr);
 				break;
 
 			case IPPROTO_CARP:
 				PULLUP_TO(hlen, ulp, offsetof(
 				    struct carp_header, carp_counter));
 				if (CARP_ADVERTISEMENT !=
 				    ((struct carp_header *)ulp)->carp_type)
 					return (IP_FW_DENY);
 				break;
 
 			case IPPROTO_IPV6:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip6_hdr);
 				break;
 
 			case IPPROTO_IPV4:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip);
 				break;
 
 			default:
 				if (V_fw_verbose)
 					printf("IPFW2: IPV6 - Unknown "
 					    "Extension Header(%d), ext_hd=%x\n",
 					     proto, ext_hd);
 				if (V_fw_deny_unknown_exthdrs)
 				    return (IP_FW_DENY);
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 			} /*switch */
 		}
 		UPDATE_POINTERS();
 		ip6 = (struct ip6_hdr *)ip;
 		args->f_id.addr_type = 6;
 		args->f_id.src_ip6 = ip6->ip6_src;
 		args->f_id.dst_ip6 = ip6->ip6_dst;
 		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
 		iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6);
 	} else if (pktlen >= sizeof(struct ip) &&
 	    (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) &&
 	    ip->ip_v == 4) {
 		is_ipv4 = 1;
 		args->flags |= IPFW_ARGS_IP4;
 		hlen = ip->ip_hl << 2;
 		/*
 		 * Collect parameters into local variables for faster
 		 * matching.
 		 */
 		proto = ip->ip_p;
 		src_ip = ip->ip_src;
 		dst_ip = ip->ip_dst;
 		offset = ntohs(ip->ip_off) & IP_OFFMASK;
 		iplen = ntohs(ip->ip_len);
 
 		if (offset == 0) {
 			switch (proto) {
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				if (pktlen >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr) +
 					    offsetof(struct sctp_init, a_rwnd));
 				else if (pktlen >= hlen + sizeof(struct sctphdr))
 					PULLUP_LEN(hlen, ulp, pktlen - hlen);
 				else
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr));
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 			case IPPROTO_UDPLITE:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_ICMP:
 				PULLUP_TO(hlen, ulp, struct icmphdr);
 				//args->f_id.flags = ICMP(ulp)->icmp_type;
 				break;
 
 			default:
 				break;
 			}
 		} else {
 			if (offset == 1 && proto == IPPROTO_TCP) {
 				/* RFC 3128 */
 				goto pullup_failed;
 			}
 		}
 
 		UPDATE_POINTERS();
 		args->f_id.addr_type = 4;
 		args->f_id.src_ip = ntohl(src_ip.s_addr);
 		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
 	} else {
 		proto = 0;
 		dst_ip.s_addr = src_ip.s_addr = 0;
 
 		args->f_id.addr_type = 1; /* XXX */
 	}
 #undef PULLUP_TO
 	pktlen = iplen < pktlen ? iplen: pktlen;
 
 	/* Properly initialize the rest of f_id */
 	args->f_id.proto = proto;
 	args->f_id.src_port = src_port = ntohs(src_port);
 	args->f_id.dst_port = dst_port = ntohs(dst_port);
 
 	IPFW_PF_RLOCK(chain);
 	if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
 		IPFW_PF_RUNLOCK(chain);
 		return (IP_FW_PASS);	/* accept */
 	}
 	if (args->flags & IPFW_ARGS_REF) {
 		/*
 		 * Packet has already been tagged as a result of a previous
 		 * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
 		 * REASS, NETGRAPH, DIVERT/TEE...)
 		 * Validate the slot and continue from the next one
 		 * if still present, otherwise do a lookup.
 		 */
 		f_pos = (args->rule.chain_id == chain->id) ?
 		    args->rule.slot :
 		    ipfw_find_rule(chain, args->rule.rulenum,
 			args->rule.rule_id);
 	} else {
 		f_pos = 0;
 	}
 
 	if (args->flags & IPFW_ARGS_IN) {
 		iif = args->ifp;
 		oif = NULL;
 	} else {
 		MPASS(args->flags & IPFW_ARGS_OUT);
 		iif = mem ? NULL : m_rcvif(m);
 		oif = args->ifp;
 	}
 
 	/*
 	 * Now scan the rules, and parse microinstructions for each rule.
 	 * We have two nested loops and an inner switch. Sometimes we
 	 * need to break out of one or both loops, or re-enter one of
 	 * the loops with updated variables. Loop variables are:
 	 *
 	 *	f_pos (outer loop) points to the current rule.
 	 *		On output it points to the matching rule.
 	 *	done (outer loop) is used as a flag to break the loop.
 	 *	l (inner loop)	residual length of current rule.
 	 *		cmd points to the current microinstruction.
 	 *
 	 * We break the inner loop by setting l=0 and possibly
 	 * cmdlen=0 if we don't want to advance cmd.
 	 * We break the outer loop by setting done=1
 	 * We can restart the inner loop by setting l>0 and f_pos, f, cmd
 	 * as needed.
 	 */
 	for (; f_pos < chain->n_rules; f_pos++) {
 		ipfw_insn *cmd;
 		uint32_t tablearg = 0;
 		int l, cmdlen, skip_or; /* skip rest of OR block */
 		struct ip_fw *f;
 
 		f = chain->map[f_pos];
 		if (V_set_disable & (1 << f->set) )
 			continue;
 
 		skip_or = 0;
 		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
 		    l -= cmdlen, cmd += cmdlen) {
 			int match;
 
 			/*
 			 * check_body is a jump target used when we find a
 			 * CHECK_STATE, and need to jump to the body of
 			 * the target rule.
 			 */
 
 /* check_body: */
 			cmdlen = F_LEN(cmd);
 			/*
 			 * An OR block (insn_1 || .. || insn_n) has the
 			 * F_OR bit set in all but the last instruction.
 			 * The first match will set "skip_or", and cause
 			 * the following instructions to be skipped until
 			 * past the one with the F_OR bit clear.
 			 */
 			if (skip_or) {		/* skip this instruction */
 				if ((cmd->len & F_OR) == 0)
 					skip_or = 0;	/* next one is good */
 				continue;
 			}
 			match = 0; /* set to 1 if we succeed */
 
 			switch (cmd->opcode) {
 			/*
 			 * The first set of opcodes compares the packet's
 			 * fields with some pattern, setting 'match' if a
 			 * match is found. At the end of the loop there is
 			 * logic to deal with F_NOT and F_OR flags associated
 			 * with the opcode.
 			 */
 			case O_NOP:
 				match = 1;
 				break;
 
 			case O_FORWARD_MAC:
 				printf("ipfw: opcode %d unimplemented\n",
 				    cmd->opcode);
 				break;
 
 			case O_GID:
 			case O_UID:
 			case O_JAIL:
 				/*
 				 * We only check offset == 0 && proto != 0,
 				 * as this ensures that we have a
 				 * packet with the ports info.
 				 */
 				if (offset != 0)
 					break;
 				if (proto == IPPROTO_TCP ||
 				    proto == IPPROTO_UDP ||
 				    proto == IPPROTO_UDPLITE)
 					match = check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 #ifdef __FreeBSD__
 						    &ucred_cache);
 #else
 						    (void *)&ucred_cache);
 #endif
 				break;
 
 			case O_RECV:
 				match = iface_match(iif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_XMIT:
 				match = iface_match(oif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_VIA:
 				match = iface_match(args->ifp,
 				    (ipfw_insn_if *)cmd, chain, &tablearg);
 				break;
 
 			case O_MACADDR2:
 				if (args->flags & IPFW_ARGS_ETHER) {
 					u_int32_t *want = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->addr;
 					u_int32_t *mask = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->mask;
 					u_int32_t *hdr = (u_int32_t *)eh;
 
 					match =
 					    ( want[0] == (hdr[0] & mask[0]) &&
 					      want[1] == (hdr[1] & mask[1]) &&
 					      want[2] == (hdr[2] & mask[2]) );
 				}
 				break;
 
 			case O_MAC_TYPE:
 				if (args->flags & IPFW_ARGS_ETHER) {
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match =
 						    (ntohs(eh->ether_type) >=
 						    p[0] &&
 						    ntohs(eh->ether_type) <=
 						    p[1]);
 				}
 				break;
 
 			case O_FRAG:
 				if (is_ipv4) {
 					/*
 					 * Since flags_match() works with
 					 * uint8_t we pack ip_off into 8 bits.
 					 * For this match offset is a boolean.
 					 */
 					match = flags_match(cmd,
 					    ((ntohs(ip->ip_off) & ~IP_OFFMASK)
 					    >> 8) | (offset != 0));
 				} else {
 					/*
 					 * Compatibility: historically bare
 					 * "frag" would match IPv6 fragments.
 					 */
 					match = (cmd->arg1 == 0x1 &&
 					    (offset != 0));
 				}
 				break;
 
 			case O_IN:	/* "out" is "not in" */
 				match = (oif == NULL);
 				break;
 
 			case O_LAYER2:
 				match = (args->flags & IPFW_ARGS_ETHER);
 				break;
 
 			case O_DIVERTED:
 				if ((args->flags & IPFW_ARGS_REF) == 0)
 					break;
 				/*
 				 * For diverted packets, args->rule.info
 				 * contains the divert port (in host format)
 				 * reason and direction.
 				 */
 				match = ((args->rule.info & IPFW_IS_MASK) ==
 				    IPFW_IS_DIVERT) && (
 				    ((args->rule.info & IPFW_INFO_IN) ?
 					1: 2) & cmd->arg1);
 				break;
 
 			case O_PROTO:
 				/*
 				 * We do not allow an arg of 0 so the
 				 * check of "proto" only suffices.
 				 */
 				match = (proto == cmd->arg1);
 				break;
 
 			case O_IP_SRC:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    src_ip.s_addr);
 				break;
 
 			case O_IP_DST_LOOKUP:
 			{
 				if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
 					void *pkey;
 					uint32_t vidx, key;
 					uint16_t keylen = 0; /* zero if can't match the packet */
 
 					/* Determine lookup key type */
 					vidx = ((ipfw_insn_u32 *)cmd)->d[1];
 					switch (vidx) {
 					case LOOKUP_DST_IP:
 					case LOOKUP_SRC_IP:
 						/* Need IP frame */
 						if (is_ipv6 == 0 && is_ipv4 == 0)
 							break;
 						if (vidx == LOOKUP_DST_IP)
 							pkey = is_ipv6 ?
 								(void *)&args->f_id.dst_ip6:
 								(void *)&dst_ip;
 						else
 							pkey = is_ipv6 ?
 								(void *)&args->f_id.src_ip6:
 								(void *)&src_ip;
 						keylen = is_ipv6 ?
 							sizeof(struct in6_addr):
 							sizeof(in_addr_t);
 						break;
 					case LOOKUP_DST_PORT:
 					case LOOKUP_SRC_PORT:
 						/* Need IP frame */
 						if (is_ipv6 == 0 && is_ipv4 == 0)
 							break;
 						/* Skip fragments */
 						if (offset != 0)
 							break;
 						/* Skip proto without ports */
 						if (proto != IPPROTO_TCP &&
 							proto != IPPROTO_UDP &&
 							proto != IPPROTO_UDPLITE &&
 							proto != IPPROTO_SCTP)
 							break;
 						key = vidx == LOOKUP_DST_PORT ?
 							dst_port:
 							src_port;
 						pkey = &key;
 						keylen = sizeof(key);
 						break;
 					case LOOKUP_UID:
 					case LOOKUP_JAIL:
 						check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 						    &ucred_cache);
 						key = vidx == LOOKUP_UID ?
 							ucred_cache->cr_uid:
 							ucred_cache->cr_prison->pr_id;
 						pkey = &key;
 						keylen = sizeof(key);
 						break;
 					case LOOKUP_DSCP:
 						/* Need IP frame */
 						if (is_ipv6 == 0 && is_ipv4 == 0)
 							break;
 						if (is_ipv6)
 							key = IPV6_DSCP(
 							    (struct ip6_hdr *)ip) >> 2;
 						else
 							key = ip->ip_tos >> 2;
 						pkey = &key;
 						keylen = sizeof(key);
 						break;
 					case LOOKUP_DST_MAC:
 					case LOOKUP_SRC_MAC:
 						/* Need ether frame */
 						if ((args->flags & IPFW_ARGS_ETHER) == 0)
 							break;
 						pkey = vidx == LOOKUP_DST_MAC ?
 							eh->ether_dhost:
 							eh->ether_shost;
 						keylen = ETHER_ADDR_LEN;
 						break;
 					}
 					if (keylen == 0)
 						break;
 					match = ipfw_lookup_table(chain,
 					    cmd->arg1, keylen, pkey, &vidx);
 					if (!match)
 						break;
 					tablearg = vidx;
 					break;
 				}
 				/* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */
 				/* FALLTHROUGH */
 			}
 			case O_IP_SRC_LOOKUP:
 			{
 				void *pkey;
 				uint32_t vidx;
 				uint16_t keylen;
 
 				if (is_ipv4) {
 					keylen = sizeof(in_addr_t);
 					if (cmd->opcode == O_IP_DST_LOOKUP)
 						pkey = &dst_ip;
 					else
 						pkey = &src_ip;
 				} else if (is_ipv6) {
 					keylen = sizeof(struct in6_addr);
 					if (cmd->opcode == O_IP_DST_LOOKUP)
 						pkey = &args->f_id.dst_ip6;
 					else
 						pkey = &args->f_id.src_ip6;
 				} else
 					break;
 				match = ipfw_lookup_table(chain, cmd->arg1,
 				    keylen, pkey, &vidx);
 				if (!match)
 					break;
 				if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) {
 					match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 					    TARG_VAL(chain, vidx, tag);
 					if (!match)
 						break;
 				}
 				tablearg = vidx;
 				break;
 			}
 
 			case O_MAC_SRC_LOOKUP:
 			case O_MAC_DST_LOOKUP:
 			{
 				void *pkey;
 				uint32_t vidx;
 				uint16_t keylen = ETHER_ADDR_LEN;
 
 				/* Need ether frame */
 				if ((args->flags & IPFW_ARGS_ETHER) == 0)
 					break;
 
 				if (cmd->opcode == O_MAC_DST_LOOKUP)
 					pkey = eh->ether_dhost;
 				else
 					pkey = eh->ether_shost;
 
 				match = ipfw_lookup_table(chain, cmd->arg1,
 				    keylen, pkey, &vidx);
 				if (!match)
 					break;
 				if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) {
 					match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 					    TARG_VAL(chain, vidx, tag);
 					if (!match)
 						break;
 				}
 				tablearg = vidx;
 				break;
 			}
 
 			case O_IP_FLOW_LOOKUP:
 				{
 					uint32_t v = 0;
 					match = ipfw_lookup_table(chain,
 					    cmd->arg1, 0, &args->f_id, &v);
 					if (!match)
 						break;
 					if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 						match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 						    TARG_VAL(chain, v, tag);
 					if (match)
 						tablearg = v;
 				}
 				break;
 			case O_IP_SRC_MASK:
 			case O_IP_DST_MASK:
 				if (is_ipv4) {
 				    uint32_t a =
 					(cmd->opcode == O_IP_DST_MASK) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
 				    int i = cmdlen-1;
 
 				    for (; !match && i>0; i-= 2, p+= 2)
 					match = (p[0] == (a & p[1]));
 				}
 				break;
 
 			case O_IP_SRC_ME:
 				if (is_ipv4) {
 					match = in_localip(src_ip);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_SRC_ME:
 				match = is_ipv6 &&
 				    ipfw_localip6(&args->f_id.src_ip6);
 #endif
 				break;
 
 			case O_IP_DST_SET:
 			case O_IP_SRC_SET:
 				if (is_ipv4) {
 					u_int32_t *d = (u_int32_t *)(cmd+1);
 					u_int32_t addr =
 					    cmd->opcode == O_IP_DST_SET ?
 						args->f_id.dst_ip :
 						args->f_id.src_ip;
 
 					    if (addr < d[0])
 						    break;
 					    addr -= d[0]; /* subtract base */
 					    match = (addr < cmd->arg1) &&
 						( d[ 1 + (addr>>5)] &
 						  (1<<(addr & 0x1f)) );
 				}
 				break;
 
 			case O_IP_DST:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    dst_ip.s_addr);
 				break;
 
 			case O_IP_DST_ME:
 				if (is_ipv4) {
 					match = in_localip(dst_ip);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_DST_ME:
 				match = is_ipv6 &&
 				    ipfw_localip6(&args->f_id.dst_ip6);
 #endif
 				break;
 
 			case O_IP_SRCPORT:
 			case O_IP_DSTPORT:
 				/*
 				 * offset == 0 && proto != 0 is enough
 				 * to guarantee that we have a
 				 * packet with port info.
 				 */
 				if ((proto == IPPROTO_UDP ||
 				    proto == IPPROTO_UDPLITE ||
 				    proto == IPPROTO_TCP ||
 				    proto == IPPROTO_SCTP) && offset == 0) {
 					u_int16_t x =
 					    (cmd->opcode == O_IP_SRCPORT) ?
 						src_port : dst_port ;
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (x>=p[0] && x<=p[1]);
 				}
 				break;
 
 			case O_ICMPTYPE:
 				match = (offset == 0 && proto==IPPROTO_ICMP &&
 				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
 				break;
 
 #ifdef INET6
 			case O_ICMP6TYPE:
 				match = is_ipv6 && offset == 0 &&
 				    proto==IPPROTO_ICMPV6 &&
 				    icmp6type_match(
 					ICMP6(ulp)->icmp6_type,
 					(ipfw_insn_u32 *)cmd);
 				break;
 #endif /* INET6 */
 
 			case O_IPOPT:
 				match = (is_ipv4 &&
 				    ipopts_match(ip, cmd) );
 				break;
 
 			case O_IPVER:
 				match = ((is_ipv4 || is_ipv6) &&
 				    cmd->arg1 == ip->ip_v);
 				break;
 
 			case O_IPID:
 			case O_IPTTL:
 				if (!is_ipv4)
 					break;
 			case O_IPLEN:
 				{	/* only for IP packets */
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    if (cmd->opcode == O_IPLEN)
 					x = iplen;
 				    else if (cmd->opcode == O_IPTTL)
 					x = ip->ip_ttl;
 				    else /* must be IPID */
 					x = ntohs(ip->ip_id);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_IPPRECEDENCE:
 				match = (is_ipv4 &&
 				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
 				break;
 
 			case O_IPTOS:
 				match = (is_ipv4 &&
 				    flags_match(cmd, ip->ip_tos));
 				break;
 
 			case O_DSCP:
 			    {
 				uint32_t *p;
 				uint16_t x;
 
 				p = ((ipfw_insn_u32 *)cmd)->d;
 
 				if (is_ipv4)
 					x = ip->ip_tos >> 2;
 				else if (is_ipv6) {
 					x = IPV6_DSCP(
 					    (struct ip6_hdr *)ip) >> 2;
 					x &= 0x3f;
 				} else
 					break;
 
 				/* DSCP bitmask is stored as low_u32 high_u32 */
 				if (x >= 32)
 					match = *(p + 1) & (1 << (x - 32));
 				else
 					match = *p & (1 << x);
 			    }
 				break;
 
 			case O_TCPDATALEN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    struct tcphdr *tcp;
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 #ifdef INET6
 				    if (is_ipv6) {
 					    struct ip6_hdr *ip6;
 
 					    ip6 = (struct ip6_hdr *)ip;
 					    if (ip6->ip6_plen == 0) {
 						    /*
 						     * Jumbo payload is not
 						     * supported by this
 						     * opcode.
 						     */
 						    break;
 					    }
 					    x = iplen - hlen;
 				    } else
 #endif /* INET6 */
 					    x = iplen - (ip->ip_hl << 2);
 				    tcp = TCP(ulp);
 				    x -= tcp->th_off << 2;
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_TCPFLAGS:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    flags_match(cmd, TCP(ulp)->th_flags));
 				break;
 
 			case O_TCPOPTS:
 				if (proto == IPPROTO_TCP && offset == 0 && ulp){
 					PULLUP_LEN_LOCKED(hlen, ulp,
 					    (TCP(ulp)->th_off << 2));
 					match = tcpopts_match(TCP(ulp), cmd);
 				}
 				break;
 
 			case O_TCPSEQ:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_seq);
 				break;
 
 			case O_TCPACK:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_ack);
 				break;
 
 			case O_TCPMSS:
 				if (proto == IPPROTO_TCP &&
 				    (args->f_id._flags & TH_SYN) != 0 &&
 				    ulp != NULL) {
 					uint16_t mss, *p;
 					int i;
 
 					PULLUP_LEN_LOCKED(hlen, ulp,
 					    (TCP(ulp)->th_off << 2));
 					if ((tcpopts_parse(TCP(ulp), &mss) &
 					    IP_FW_TCPOPT_MSS) == 0)
 						break;
 					if (cmdlen == 1) {
 						match = (cmd->arg1 == mss);
 						break;
 					}
 					/* Otherwise we have ranges. */
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for (; !match && i > 0; i--, p += 2)
 						match = (mss >= p[0] &&
 						    mss <= p[1]);
 				}
 				break;
 
 			case O_TCPWIN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    x = ntohs(TCP(ulp)->th_win);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* Otherwise we have ranges. */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i > 0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_ESTAB:
 				/* reject packets which have SYN only */
 				/* XXX should i also check for TH_ACK ? */
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    (TCP(ulp)->th_flags &
 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
 				break;
 
 			case O_ALTQ: {
 				struct pf_mtag *at;
 				struct m_tag *mtag;
 				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 				/*
 				 * ALTQ uses mbuf tags from another
 				 * packet filtering system - pf(4).
 				 * We allocate a tag in its format
 				 * and fill it in, pretending to be pf(4).
 				 */
 				match = 1;
 				at = pf_find_mtag(m);
 				if (at != NULL && at->qid != 0)
 					break;
 				mtag = m_tag_get(PACKET_TAG_PF,
 				    sizeof(struct pf_mtag), M_NOWAIT | M_ZERO);
 				if (mtag == NULL) {
 					/*
 					 * Let the packet fall back to the
 					 * default ALTQ.
 					 */
 					break;
 				}
 				m_tag_prepend(m, mtag);
 				at = (struct pf_mtag *)(mtag + 1);
 				at->qid = altq->qid;
 				at->hdr = ip;
 				break;
 			}
 
 			case O_LOG:
 				ipfw_log(chain, f, hlen, args,
 				    offset | ip6f_mf, tablearg, ip);
 				match = 1;
 				break;
 
 			case O_PROB:
 				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
 				break;
 
 			case O_VERREVPATH:
 				/* Outgoing packets automatically pass/match */
 				match = (args->flags & IPFW_ARGS_OUT ||
 				    (
 #ifdef INET6
 				    is_ipv6 ?
 					verify_path6(&(args->f_id.src_ip6),
 					    iif, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, iif, args->f_id.fib)));
 				break;
 
 			case O_VERSRCREACH:
 				/* Outgoing packets automatically pass/match */
 				match = (hlen > 0 && ((oif != NULL) || (
 #ifdef INET6
 				    is_ipv6 ?
 				        verify_path6(&(args->f_id.src_ip6),
 				            NULL, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, NULL, args->f_id.fib))));
 				break;
 
 			case O_ANTISPOOF:
 				/* Outgoing packets automatically pass/match */
 				if (oif == NULL && hlen > 0 &&
 				    (  (is_ipv4 && in_localaddr(src_ip))
 #ifdef INET6
 				    || (is_ipv6 &&
 				        in6_localaddr(&(args->f_id.src_ip6)))
 #endif
 				    ))
 					match =
 #ifdef INET6
 					    is_ipv6 ? verify_path6(
 					        &(args->f_id.src_ip6), iif,
 						args->f_id.fib) :
 #endif
 					    verify_path(src_ip, iif,
 					        args->f_id.fib);
 				else
 					match = 1;
 				break;
 
 			case O_IPSEC:
 				match = (m_tag_find(m,
 				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
 				/* otherwise no match */
 				break;
 
 #ifdef INET6
 			case O_IP6_SRC:
 				match = is_ipv6 &&
 				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 
 			case O_IP6_DST:
 				match = is_ipv6 &&
 				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 				if (is_ipv6) {
 					int i = cmdlen - 1;
 					struct in6_addr p;
 					struct in6_addr *d =
 					    &((ipfw_insn_ip6 *)cmd)->addr6;
 
 					for (; !match && i > 0; d += 2,
 					    i -= F_INSN_SIZE(struct in6_addr)
 					    * 2) {
 						p = (cmd->opcode ==
 						    O_IP6_SRC_MASK) ?
 						    args->f_id.src_ip6:
 						    args->f_id.dst_ip6;
 						APPLY_MASK(&p, &d[1]);
 						match =
 						    IN6_ARE_ADDR_EQUAL(&d[0],
 						    &p);
 					}
 				}
 				break;
 
 			case O_FLOW6ID:
 				match = is_ipv6 &&
 				    flow6id_match(args->f_id.flow_id6,
 				    (ipfw_insn_u32 *) cmd);
 				break;
 
 			case O_EXT_HDR:
 				match = is_ipv6 &&
 				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
 				break;
 
 			case O_IP6:
 				match = is_ipv6;
 				break;
 #endif
 
 			case O_IP4:
 				match = is_ipv4;
 				break;
 
 			case O_TAG: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				/* Packet is already tagged with this tag? */
 				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
 
 				/* We have `untag' action when F_NOT flag is
 				 * present. And we must remove this mtag from
 				 * mbuf and reset `match' to zero (`match' will
 				 * be inversed later).
 				 * Otherwise we should allocate new mtag and
 				 * push it into mbuf.
 				 */
 				if (cmd->len & F_NOT) { /* `untag' action */
 					if (mtag != NULL)
 						m_tag_delete(m, mtag);
 					match = 0;
 				} else {
 					if (mtag == NULL) {
 						mtag = m_tag_alloc( MTAG_IPFW,
 						    tag, 0, M_NOWAIT);
 						if (mtag != NULL)
 							m_tag_prepend(m, mtag);
 					}
 					match = 1;
 				}
 				break;
 			}
 
 			case O_FIB: /* try match the specified fib */
 				if (args->f_id.fib == cmd->arg1)
 					match = 1;
 				break;
 
 			case O_SOCKARG:	{
 #ifndef USERSPACE	/* not supported in userspace */
 				struct inpcb *inp = args->inp;
 				struct inpcbinfo *pi;
 				bool inp_locked = false;
 
 				if (proto == IPPROTO_TCP)
 					pi = &V_tcbinfo;
 				else if (proto == IPPROTO_UDP)
 					pi = &V_udbinfo;
 				else if (proto == IPPROTO_UDPLITE)
 					pi = &V_ulitecbinfo;
 				else
 					break;
 
 				/*
 				 * XXXRW: so_user_cookie should almost
 				 * certainly be inp_user_cookie?
 				 */
 
 				/*
 				 * For incoming packet lookup the inpcb
 				 * using the src/dest ip/port tuple.
 				 */
 				if (is_ipv4 && inp == NULL) {
 					inp = in_pcblookup(pi,
 					    src_ip, htons(src_port),
 					    dst_ip, htons(dst_port),
 					    INPLOOKUP_RLOCKPCB, NULL);
 					inp_locked = true;
 				}
 #ifdef INET6
 				if (is_ipv6 && inp == NULL) {
 					inp = in6_pcblookup(pi,
 					    &args->f_id.src_ip6,
 					    htons(src_port),
 					    &args->f_id.dst_ip6,
 					    htons(dst_port),
 					    INPLOOKUP_RLOCKPCB, NULL);
 					inp_locked = true;
 				}
 #endif /* INET6 */
 				if (inp != NULL) {
 					if (inp->inp_socket) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 					}
 					if (inp_locked)
 						INP_RUNLOCK(inp);
 				}
 #endif /* !USERSPACE */
 				break;
 			}
 
 			case O_TAGGED: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				if (cmdlen == 1) {
 					match = m_tag_locate(m, MTAG_IPFW,
 					    tag, NULL) != NULL;
 					break;
 				}
 
 				/* we have ranges */
 				for (mtag = m_tag_first(m);
 				    mtag != NULL && !match;
 				    mtag = m_tag_next(m, mtag)) {
 					uint16_t *p;
 					int i;
 
 					if (mtag->m_tag_cookie != MTAG_IPFW)
 						continue;
 
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for(; !match && i > 0; i--, p += 2)
 						match =
 						    mtag->m_tag_id >= p[0] &&
 						    mtag->m_tag_id <= p[1];
 				}
 				break;
 			}
 				
 			/*
 			 * The second set of opcodes represents 'actions',
 			 * i.e. the terminal part of a rule once the packet
 			 * matches all previous patterns.
 			 * Typically there is only one action for each rule,
 			 * and the opcode is stored at the end of the rule
 			 * (but there are exceptions -- see below).
 			 *
 			 * In general, here we set retval and terminate the
 			 * outer loop (would be a 'break 3' in some language,
 			 * but we need to set l=0, done=1)
 			 *
 			 * Exceptions:
 			 * O_COUNT and O_SKIPTO actions:
 			 *   instead of terminating, we jump to the next rule
 			 *   (setting l=0), or to the SKIPTO target (setting
 			 *   f/f_len, cmd and l as needed), respectively.
 			 *
 			 * O_TAG, O_LOG and O_ALTQ action parameters:
 			 *   perform some action and set match = 1;
 			 *
 			 * O_LIMIT and O_KEEP_STATE: these opcodes are
 			 *   not real 'actions', and are stored right
 			 *   before the 'action' part of the rule (one
 			 *   exception is O_SKIP_ACTION which could be
 			 *   between these opcodes and 'action' one).
 			 *   These opcodes try to install an entry in the
 			 *   state tables; if successful, we continue with
 			 *   the next opcode (match=1; break;), otherwise
 			 *   the packet must be dropped (set retval,
 			 *   break loops with l=0, done=1)
 			 *
 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
 			 *   cause a lookup of the state table, and a jump
 			 *   to the 'action' part of the parent rule
 			 *   if an entry is found, or
 			 *   (CHECK_STATE only) a jump to the next rule if
 			 *   the entry is not found.
 			 *   The result of the lookup is cached so that
 			 *   further instances of these opcodes become NOPs.
 			 *   The jump to the next rule is done by setting
 			 *   l=0, cmdlen=0.
 			 *
 			 * O_SKIP_ACTION: this opcode is not a real 'action'
 			 *  either, and is stored right before the 'action'
 			 *  part of the rule, right after the O_KEEP_STATE
 			 *  opcode. It causes match failure so the real
 			 *  'action' could be executed only if the rule
 			 *  is checked via dynamic rule from the state
 			 *  table, as in such case execution starts
 			 *  from the true 'action' opcode directly.
 			 *   
 			 */
 			case O_LIMIT:
 			case O_KEEP_STATE:
 				if (ipfw_dyn_install_state(chain, f,
 				    (ipfw_insn_limit *)cmd, args, ulp,
 				    pktlen, &dyn_info, tablearg)) {
 					/* error or limit violation */
 					retval = IP_FW_DENY;
 					l = 0;	/* exit inner loop */
 					done = 1; /* exit outer loop */
 				}
 				match = 1;
 				break;
 
 			case O_PROBE_STATE:
 			case O_CHECK_STATE:
 				/*
 				 * dynamic rules are checked at the first
 				 * keep-state or check-state occurrence,
 				 * with the result being stored in dyn_info.
 				 * The compiler introduces a PROBE_STATE
 				 * instruction for us when we have a
 				 * KEEP_STATE (because PROBE_STATE needs
 				 * to be run first).
 				 */
 				if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) &&
 				    (q = ipfw_dyn_lookup_state(args, ulp,
 				    pktlen, cmd, &dyn_info)) != NULL) {
 					/*
 					 * Found dynamic entry, jump to the
 					 * 'action' part of the parent rule
 					 * by setting f, cmd, l and clearing
 					 * cmdlen.
 					 */
 					f = q;
 					f_pos = dyn_info.f_pos;
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
 					cmdlen = 0;
 					match = 1;
 					break;
 				}
 				/*
 				 * Dynamic entry not found. If CHECK_STATE,
 				 * skip to next rule, if PROBE_STATE just
 				 * ignore and continue with next opcode.
 				 */
 				if (cmd->opcode == O_CHECK_STATE)
 					l = 0;	/* exit inner loop */
 				match = 1;
 				break;
 
 			case O_SKIP_ACTION:
 				match = 0;	/* skip to the next rule */
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_ACCEPT:
 				retval = 0;	/* accept */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_PIPE:
 			case O_QUEUE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, pipe);
 				if (cmd->opcode == O_PIPE)
 					args->rule.info |= IPFW_IS_PIPE;
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = IP_FW_DUMMYNET;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_DIVERT:
 			case O_TEE:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not on layer 2 */
 				/* otherwise this is terminal */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				retval = (cmd->opcode == O_DIVERT) ?
 					IP_FW_DIVERT : IP_FW_TEE;
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, divert);
 				break;
 
 			case O_COUNT:
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_SKIPTO:
 			    IPFW_INC_RULE_COUNTER(f, pktlen);
 			    f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0);
 			    /*
 			     * Skip disabled rules, and re-enter
 			     * the inner loop with the correct
 			     * f_pos, f, l and cmd.
 			     * Also clear cmdlen and skip_or
 			     */
 			    for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				     (1 << chain->map[f_pos]->set));
 				    f_pos++)
 				;
 			    /* Re-enter the inner loop at the skipto rule. */
 			    f = chain->map[f_pos];
 			    l = f->cmd_len;
 			    cmd = f->cmd;
 			    match = 1;
 			    cmdlen = 0;
 			    skip_or = 0;
 			    continue;
 			    break;	/* not reached */
 
 			case O_CALLRETURN: {
 				/*
 				 * Implementation of `subroutine' call/return,
 				 * in the stack carried in an mbuf tag. This
 				 * is different from `skipto' in that any call
 				 * address is possible (`skipto' must prevent
 				 * backward jumps to avoid endless loops).
 				 * We have `return' action when F_NOT flag is
 				 * present. The `m_tag_id' field is used as
 				 * stack pointer.
 				 */
 				struct m_tag *mtag;
 				uint16_t jmpto, *stack;
 
 #define	IS_CALL		((cmd->len & F_NOT) == 0)
 #define	IS_RETURN	((cmd->len & F_NOT) != 0)
 				/*
 				 * Hand-rolled version of m_tag_locate() with
 				 * wildcard `type'.
 				 * If not already tagged, allocate new tag.
 				 */
 				mtag = m_tag_first(m);
 				while (mtag != NULL) {
 					if (mtag->m_tag_cookie ==
 					    MTAG_IPFW_CALL)
 						break;
 					mtag = m_tag_next(m, mtag);
 				}
 				if (mtag == NULL && IS_CALL) {
 					mtag = m_tag_alloc(MTAG_IPFW_CALL, 0,
 					    IPFW_CALLSTACK_SIZE *
 					    sizeof(uint16_t), M_NOWAIT);
 					if (mtag != NULL)
 						m_tag_prepend(m, mtag);
 				}
 
 				/*
 				 * On error both `call' and `return' just
 				 * continue with next rule.
 				 */
 				if (IS_RETURN && (mtag == NULL ||
 				    mtag->m_tag_id == 0)) {
 					l = 0;		/* exit inner loop */
 					break;
 				}
 				if (IS_CALL && (mtag == NULL ||
 				    mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) {
 					printf("ipfw: call stack error, "
 					    "go to next rule\n");
 					l = 0;		/* exit inner loop */
 					break;
 				}
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				stack = (uint16_t *)(mtag + 1);
 
 				/*
 				 * The `call' action may use cached f_pos
 				 * (in f->next_rule), whose version is written
 				 * in f->next_rule.
 				 * The `return' action, however, doesn't have
 				 * fixed jump address in cmd->arg1 and can't use
 				 * cache.
 				 */
 				if (IS_CALL) {
 					stack[mtag->m_tag_id] = f->rulenum;
 					mtag->m_tag_id++;
 			    		f_pos = JUMP(chain, f, cmd->arg1,
 					    tablearg, 1);
 				} else {	/* `return' action */
 					mtag->m_tag_id--;
 					jmpto = stack[mtag->m_tag_id] + 1;
 					f_pos = ipfw_find_rule(chain, jmpto, 0);
 				}
 
 				/*
 				 * Skip disabled rules, and re-enter
 				 * the inner loop with the correct
 				 * f_pos, f, l and cmd.
 				 * Also clear cmdlen and skip_or
 				 */
 				for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				    (1 << chain->map[f_pos]->set)); f_pos++)
 					;
 				/* Re-enter the inner loop at the dest rule. */
 				f = chain->map[f_pos];
 				l = f->cmd_len;
 				cmd = f->cmd;
 				cmdlen = 0;
 				skip_or = 0;
 				continue;
 				break;	/* NOTREACHED */
 			}
 #undef IS_CALL
 #undef IS_RETURN
 
 			case O_REJECT:
 				/*
 				 * Drop the packet and send a reject notice
 				 * if the packet is not ICMP (or is an ICMP
 				 * query), and it is not multicast/broadcast.
 				 */
 				if (hlen > 0 && is_ipv4 && offset == 0 &&
 				    (proto != IPPROTO_ICMP ||
 				     is_icmp_query(ICMP(ulp))) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
 					send_reject(args, cmd, iplen, ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #ifdef INET6
 			case O_UNREACH6:
 				if (hlen > 0 && is_ipv6 &&
 				    ((offset & IP6F_OFF_MASK) == 0) &&
 				    (proto != IPPROTO_ICMPV6 ||
 				     (is_icmp6_query(icmp6_type) == 1)) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN6_IS_ADDR_MULTICAST(
 					&args->f_id.dst_ip6)) {
 					send_reject6(args,
 					    cmd->opcode == O_REJECT ?
 					    map_icmp_unreach(cmd->arg1):
 					    cmd->arg1, hlen,
 					    (struct ip6_hdr *)ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #endif
 			case O_DENY:
 				retval = IP_FW_DENY;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_FORWARD_IP:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not valid on layer2 pkts */
 				if (q != f ||
 				    dyn_info.direction == MATCH_FORWARD) {
 				    struct sockaddr_in *sa;
 
 				    sa = &(((ipfw_insn_sa *)cmd)->sa);
 				    if (sa->sin_addr.s_addr == INADDR_ANY) {
 #ifdef INET6
 					/*
 					 * We use O_FORWARD_IP opcode for
 					 * fwd rule with tablearg, but tables
 					 * now support IPv6 addresses. And
 					 * when we are inspecting IPv6 packet,
 					 * we can use nh6 field from
 					 * table_value as next_hop6 address.
 					 */
 					if (is_ipv6) {
 						struct ip_fw_nh6 *nh6;
 
 						args->flags |= IPFW_ARGS_NH6;
 						nh6 = &args->hopstore6;
 						nh6->sin6_addr = TARG_VAL(
 						    chain, tablearg, nh6);
 						nh6->sin6_port = sa->sin_port;
 						nh6->sin6_scope_id = TARG_VAL(
 						    chain, tablearg, zoneid);
 					} else
 #endif
 					{
 						args->flags |= IPFW_ARGS_NH4;
 						args->hopstore.sin_port =
 						    sa->sin_port;
 						sa = &args->hopstore;
 						sa->sin_family = AF_INET;
 						sa->sin_len = sizeof(*sa);
 						sa->sin_addr.s_addr = htonl(
 						    TARG_VAL(chain, tablearg,
 						    nh4));
 					}
 				    } else {
 					    args->flags |= IPFW_ARGS_NH4PTR;
 					    args->next_hop = sa;
 				    }
 				}
 				retval = IP_FW_PASS;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 #ifdef INET6
 			case O_FORWARD_IP6:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not valid on layer2 pkts */
 				if (q != f ||
 				    dyn_info.direction == MATCH_FORWARD) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = &(((ipfw_insn_sa6 *)cmd)->sa);
 					args->flags |= IPFW_ARGS_NH6PTR;
 					args->next_hop6 = sin6;
 				}
 				retval = IP_FW_PASS;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 #endif
 
 			case O_NETGRAPH:
 			case O_NGTEE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, netgraph);
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = (cmd->opcode == O_NETGRAPH) ?
 				    IP_FW_NETGRAPH : IP_FW_NGTEE;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_SETFIB: {
 				uint32_t fib;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				fib = TARG(cmd->arg1, fib) & 0x7FFF;
 				if (fib >= rt_numfibs)
 					fib = 0;
 				M_SETFIB(m, fib);
 				args->f_id.fib = fib; /* XXX */
 				l = 0;		/* exit inner loop */
 				break;
 		        }
 
 			case O_SETDSCP: {
 				uint16_t code;
 
 				code = TARG(cmd->arg1, dscp) & 0x3F;
 				l = 0;		/* exit inner loop */
 				if (is_ipv4) {
 					uint16_t old;
 
 					old = *(uint16_t *)ip;
 					ip->ip_tos = (code << 2) |
 					    (ip->ip_tos & 0x03);
 					ip->ip_sum = cksum_adjust(ip->ip_sum,
 					    old, *(uint16_t *)ip);
 				} else if (is_ipv6) {
 					/* update cached value */
 					args->f_id.flow_id6 =
 					    ntohl(*(uint32_t *)ip) & ~0x0FC00000;
 					args->f_id.flow_id6 |= code << 22;
 
 					*((uint32_t *)ip) =
 					    htonl(args->f_id.flow_id6);
 				} else
 					break;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				break;
 			}
 
 			case O_NAT:
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				/*
 				 * Ensure that we do not invoke NAT handler for
 				 * non IPv4 packets. Libalias expects only IPv4.
 				 */
 				if (!is_ipv4 || !IPFW_NAT_LOADED) {
 				    retval = IP_FW_DENY;
 				    break;
 				}
 
 				struct cfg_nat *t;
 				int nat_id;
 
 				args->rule.info = 0;
 				set_match(args, f_pos, chain);
 				/* Check if this is 'global' nat rule */
 				if (cmd->arg1 == IP_FW_NAT44_GLOBAL) {
 					retval = ipfw_nat_ptr(args, NULL, m);
 					break;
 				}
 				t = ((ipfw_insn_nat *)cmd)->nat;
 				if (t == NULL) {
 					nat_id = TARG(cmd->arg1, nat);
 					t = (*lookup_nat_ptr)(&chain->nat, nat_id);
 
 					if (t == NULL) {
 					    retval = IP_FW_DENY;
 					    break;
 					}
 					if (cmd->arg1 != IP_FW_TARG)
 					    ((ipfw_insn_nat *)cmd)->nat = t;
 				}
 				retval = ipfw_nat_ptr(args, t, m);
 				break;
 
 			case O_REASS: {
 				int ip_off;
 
 				l = 0;	/* in any case exit inner loop */
 				if (is_ipv6) /* IPv6 is not supported yet */
 					break;
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				ip_off = ntohs(ip->ip_off);
 
 				/* if not fragmented, go to next rule */
 				if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
 				    break;
 
 				args->m = m = ip_reass(m);
 
 				/*
 				 * do IP header checksum fixup.
 				 */
 				if (m == NULL) { /* fragment got swallowed */
 				    retval = IP_FW_DENY;
 				} else { /* good, packet complete */
 				    int hlen;
 
 				    ip = mtod(m, struct ip *);
 				    hlen = ip->ip_hl << 2;
 				    ip->ip_sum = 0;
 				    if (hlen == sizeof(struct ip))
 					ip->ip_sum = in_cksum_hdr(ip);
 				    else
 					ip->ip_sum = in_cksum(m, hlen);
 				    retval = IP_FW_REASS;
 				    args->rule.info = 0;
 				    set_match(args, f_pos, chain);
 				}
 				done = 1;	/* exit outer loop */
 				break;
 			}
 			case O_EXTERNAL_ACTION:
 				l = 0; /* in any case exit inner loop */
 				retval = ipfw_run_eaction(chain, args,
 				    cmd, &done);
 				/*
 				 * If both @retval and @done are zero,
 				 * consider this as rule matching and
 				 * update counters.
 				 */
 				if (retval == 0 && done == 0) {
 					IPFW_INC_RULE_COUNTER(f, pktlen);
 					/*
 					 * Reset the result of the last
 					 * dynamic state lookup.
 					 * External action can change
 					 * @args content, and it may be
 					 * used for new state lookup later.
 					 */
 					DYN_INFO_INIT(&dyn_info);
 				}
 				break;
 
 			default:
 				panic("-- unknown opcode %d\n", cmd->opcode);
 			} /* end of switch() on opcodes */
 			/*
 			 * if we get here with l=0, then match is irrelevant.
 			 */
 
 			if (cmd->len & F_NOT)
 				match = !match;
 
 			if (match) {
 				if (cmd->len & F_OR)
 					skip_or = 1;
 			} else {
 				if (!(cmd->len & F_OR)) /* not an OR block, */
 					break;		/* try next rule    */
 			}
 
 		}	/* end of inner loop, scan opcodes */
 #undef PULLUP_LEN
 #undef PULLUP_LEN_LOCKED
 
 		if (done)
 			break;
 
 /* next_rule:; */	/* try next rule		*/
 
 	}		/* end of outer for, scan rules */
 
 	if (done) {
 		struct ip_fw *rule = chain->map[f_pos];
 		/* Update statistics */
 		IPFW_INC_RULE_COUNTER(rule, pktlen);
 		IPFW_PROBE(rule__matched, retval,
 		    is_ipv4 ? AF_INET : AF_INET6,
 		    is_ipv4 ? (uintptr_t)&src_ip :
 		        (uintptr_t)&args->f_id.src_ip6,
 		    is_ipv4 ? (uintptr_t)&dst_ip :
 		        (uintptr_t)&args->f_id.dst_ip6,
 		    args, rule);
 	} else {
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	}
 	IPFW_PF_RUNLOCK(chain);
 #ifdef __FreeBSD__
 	if (ucred_cache != NULL)
 		crfree(ucred_cache);
 #endif
 	return (retval);
 
 pullup_failed:
 	if (V_fw_verbose)
 		printf("ipfw: pullup failed\n");
 	return (IP_FW_DENY);
 }
 
 /*
  * Set maximum number of tables that can be used in given VNET ipfw instance.
  */
 #ifdef SYSCTL_NODE
 static int
 sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS)
 {
 	unsigned int wanted;
 	int rc;
 
 	/* Start from the current limit and let the integer handler run. */
 	wanted = V_fw_tables_max;
 	rc = sysctl_handle_int(oidp, &wanted, 0, req);
 	if (rc != 0)
 		return (rc);	/* the handler reported an error */
 	if (req->newptr == NULL)
 		return (0);	/* read operation, nothing to apply */
 
 	/* A new value was supplied: resize the tables of the layer3 chain. */
 	return (ipfw_resize_tables(&V_layer3_chain, wanted));
 }
 
 /*
  * Sysctl handler that switches the table namespace between global
  * and per-set.
  */
 static int
 sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
 {
 	unsigned int mode;
 	int rc;
 
 	/* Start from the current setting and let the integer handler run. */
 	mode = V_fw_tables_sets;
 	rc = sysctl_handle_int(oidp, &mode, 0, req);
 	if (rc != 0)
 		return (rc);	/* the handler reported an error */
 	if (req->newptr == NULL)
 		return (0);	/* read operation, nothing to apply */
 
 	/* A new value was supplied: switch the namespace of the layer3 chain. */
 	return (ipfw_switch_tables_namespace(&V_layer3_chain, mode));
 }
 #endif
 
 /*
  * Module and VNET glue
  */
 
 /*
  * One-time setup performed at boot or module load: print the
  * configuration banner, clamp the default table count and call
  * ipfw_init_sopt_handler(), ipfw_init_obj_rewriter() and
  * ipfw_iface_init().  Always returns 0.
  */
 static int
 ipfw_init(void)
 {
 #ifdef IPDIVERT
 	const char *divert_mode = "enabled";
 #else
 	const char *divert_mode = "loadable";
 #endif
 #ifdef IPFIREWALL_NAT
 	const char *nat_mode = "enabled";
 #else
 	const char *nat_mode = "loadable";
 #endif
 	const char *policy = default_to_accept ? "accept" : "deny";
 
 	/*
 	 * First half of the banner.  This is only printed the first time
 	 * around, when called from the sysinit code.
 	 */
 	printf("ipfw2 "
 #ifdef INET6
 		"(+ipv6) "
 #endif
 		"initialized, divert %s, nat %s, "
 		"default to %s, logging ",
 		divert_mode, nat_mode, policy);
 
 	/*
 	 * Second half of the banner: the logging mode.
 	 * V_xxx variables can be read here, although in the VIMAGE case
 	 * the vnet specific initializer may not have run yet.  Tuneables
 	 * have been processed, so the values shown are those of the
 	 * default vnet.
 	 * XXX This should all be rationalized AFTER 8.0
 	 */
 	if (V_fw_verbose == 0) {
 		printf("disabled\n");
 	} else if (V_verbose_limit == 0) {
 		printf("unlimited\n");
 	} else {
 		printf("limited to %d packets/entry by default\n",
 		    V_verbose_limit);
 	}
 
 	/* Clamp the user-supplied table count to the supported maximum. */
 	if (default_fw_tables > IPFW_TABLES_MAX)
 		default_fw_tables = IPFW_TABLES_MAX;
 
 	ipfw_init_sopt_handler();
 	ipfw_init_obj_rewriter();
 	ipfw_iface_init();
 
 	return (0);
 }
 
 /*
  * Called for the removal of the last instance only on module unload.
  *
  * Counterpart of ipfw_init(): calls the destroy routine matching each
  * of the three init routines invoked there, then logs the unload on
  * the console.  Takes no arguments and returns nothing.
  */
 static void
 ipfw_destroy(void)
 {
 
 	ipfw_iface_destroy();		/* pairs with ipfw_iface_init() */
 	ipfw_destroy_sopt_handler();	/* pairs with ipfw_init_sopt_handler() */
 	ipfw_destroy_obj_rewriter();	/* pairs with ipfw_init_obj_rewriter() */
 	printf("IP firewall unloaded\n");
 }
 
 /*
  * Stuff that must be initialized for every instance
  * (including the first of course).
  *
  * Sets the per-vnet defaults, initialises the services table, counters
  * and tables of the layer3 chain, installs the default rule and finally
  * attaches the hooks.  The argument is unused.
  *
  * Returns ENOSPC when ipfw_init_tables() fails, otherwise the result
  * of ipfw_attach_hooks().
  */
 static int
 vnet_ipfw_init(const void *unused)
 {
 	int error, first;
 	struct ip_fw *rule = NULL;
 	struct ip_fw_chain *chain;
 
 	chain = &V_layer3_chain;
 
 	/* 1 when running in the default vnet, 0 for any other vnet. */
 	first = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	/* First set up some values that are compile time options */
 	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
 	V_fw_deny_unknown_exthdrs = 1;
 #ifdef IPFIREWALL_VERBOSE
 	V_fw_verbose = 1;
 #endif
 #ifdef IPFIREWALL_VERBOSE_LIMIT
 	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
 #endif
 #ifdef IPFIREWALL_NAT
 	LIST_INIT(&chain->nat);
 #endif
 
 	/* Init shared services hash table */
 	ipfw_init_srv(chain);
 
 	ipfw_init_counters();
 	/* Set initial number of tables */
 	V_fw_tables_max = default_fw_tables;
 	error = ipfw_init_tables(chain, first);
 	if (error) {
 		/*
 		 * NOTE(review): `rule' is still NULL here and chain->map is
 		 * not assigned anywhere in this function, so both free()
 		 * calls presumably act on NULL -- confirm.  The value of
 		 * `error' is discarded; ENOSPC is returned unconditionally.
 		 */
 		printf("ipfw2: setting up tables failed\n");
 		free(chain->map, M_IPFW);
 		free(rule, M_IPFW);
 		return (ENOSPC);
 	}
 
 	IPFW_LOCK_INIT(chain);
 
 	/*
 	 * Fill and insert the default rule: a single one-word instruction,
 	 * O_ACCEPT or O_DENY depending on default_to_accept.
 	 */
 	rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw));
 	rule->flags |= IPFW_RULE_NOOPT;
 	rule->cmd_len = 1;
 	rule->cmd[0].len = 1;
 	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
 	chain->default_rule = rule;
 	ipfw_add_protected_rule(chain, rule, 0);
 
 	ipfw_dyn_init(chain);
 	ipfw_eaction_init(chain, first);
 #ifdef LINEAR_SKIPTO
 	ipfw_init_skipto_cache(chain);
 #endif
 	ipfw_bpf_init(first);
 
 	/* Everything above is set up: mark this instance as ready. */
 	V_ipfw_vnet_ready = 1;		/* Open for business */
 
 	/*
 	 * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
 	 * Even if the latter two fail we still keep the module alive
 	 * because the sockopt and layer2 paths are still useful.
 	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
 	 * so we can ignore the exact return value and just set a flag.
 	 *
 	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
 	 * changes in the underlying (per-vnet) variables trigger
 	 * immediate hook()/unhook() calls.
 	 * In layer2 we have the same behaviour, except that V_ether_ipfw
 	 * is checked on each packet because there are no pfil hooks.
 	 *
 	 * NOTE(review): despite the text above, the result of
 	 * ipfw_attach_hooks() is returned to the caller unchanged --
 	 * confirm which behaviour is intended.
 	 */
 	V_ip_fw_ctl_ptr = ipfw_ctl3;
 	error = ipfw_attach_hooks();
 	return (error);
 }
 
 /*
  * Called for the removal of each instance.
  *
  * Marks the instance as not ready, detaches the hooks, collects every
  * rule of the layer3 chain for reaping and then tears down the
  * remaining per-instance state.  The argument is unused.
  * Always returns 0.
  */
 static int
 vnet_ipfw_uninit(const void *unused)
 {
 	struct ip_fw *reap;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	int i, last;
 
 	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
 	/*
 	 * disconnect from ipv4, ipv6, layer2 and sockopt.
 	 * Then grab, release and grab again the WLOCK so we make
 	 * sure the update is propagated and nobody will be in.
 	 */
 	ipfw_detach_hooks();
 	V_ip_fw_ctl_ptr = NULL;
 
 	/* 1 when running in the default vnet, 0 for any other vnet. */
 	last = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	/* First grab/release cycle of the UH write lock (see above). */
 	IPFW_UH_WLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 
 	ipfw_dyn_uninit(0);	/* run the callout_drain */
 
 	/* Second grab; held together with the WLOCK while rules are reaped. */
 	IPFW_UH_WLOCK(chain);
 
 	reap = NULL;
 	IPFW_WLOCK(chain);
 	/* Queue every rule in the map on the reap list, then drop the map. */
 	for (i = 0; i < chain->n_rules; i++)
 		ipfw_reap_add(chain, &reap, chain->map[i]);
 	free(chain->map, M_IPFW);
 #ifdef LINEAR_SKIPTO
 	ipfw_destroy_skipto_cache(chain);
 #endif
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	/* Both locks are released; the remaining teardown runs unlocked. */
 	ipfw_destroy_tables(chain, last);
 	ipfw_eaction_uninit(chain, last);
 	/* Release the rules collected above, if there were any. */
 	if (reap != NULL)
 		ipfw_reap_rules(reap);
 	vnet_ipfw_iface_destroy(chain);
 	ipfw_destroy_srv(chain);
 	IPFW_LOCK_DESTROY(chain);
 	ipfw_dyn_uninit(1);	/* free the remaining parts */
 	ipfw_destroy_counters();
 	ipfw_bpf_uninit(last);
 	return (0);
 }
 
 /*
  * Module event handler.
  * In general we have the choice of handling most of these events by the
  * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
  * use the SYSINIT handlers as they are more capable of expressing the
  * flow of control during module and vnet operations, so this is just
  * a skeleton. Note there is no SYSINIT equivalent of the module
  * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
  */
 static int
 ipfw_modevent(module_t mod, int type, void *unused)
 {
 	/*
 	 * Skeleton only: the actual work is left to the (VNET_)SYS(UN)INIT
 	 * handlers, so every recognised event simply reports success and
 	 * anything else is rejected with EOPNOTSUPP.
 	 */
 	switch (type) {
 	case MOD_LOAD:		/* once at module load, or boot if compiled in */
 	case MOD_QUIESCE:	/* before unload; may veto unloading */
 	case MOD_UNLOAD:	/* during unload */
 	case MOD_SHUTDOWN:	/* during system shutdown */
 		return (0);
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /*
  * Module description handed to DECLARE_MODULE() below: the module
  * name and its event handler.
  */
 static moduledata_t ipfwmod = {
 	"ipfw",			/* module name */
 	ipfw_modevent,		/* module event handler */
 	0			/* presumably private data, unused -- confirm */
 };
 
 /*
  * Define startup order.  All pieces share one subsystem; within it the
  * module event handler comes first, then the global ipfw_init(), then
  * the per-vnet vnet_ipfw_init().
  */
 #define	IPFW_SI_SUB_FIREWALL	SI_SUB_PROTO_FIREWALL
 #define	IPFW_MODEVENT_ORDER	(SI_ORDER_ANY - 255) /* On boot slot in here. */
 #define	IPFW_MODULE_ORDER	(IPFW_MODEVENT_ORDER + 1) /* A little later. */
 #define	IPFW_VNET_ORDER		(IPFW_MODEVENT_ORDER + 2) /* Later still. */
 
 DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
 FEATURE(ipfw_ctl3, "ipfw new sockopt calls");
 MODULE_VERSION(ipfw, 3);
 /* should declare some dependencies here */
 
 /*
  * Starting up. Done in order after the module event handler
  * (ipfw_modevent(), registered through ipfwmod) has been called.
  * VNET_SYSINIT is also called for each existing vnet and each new vnet.
  */
 SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_init, NULL);
 VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_init, NULL);
 
 /*
  * Closing up shop. These are done in REVERSE ORDER, but still
  * after the module event handler has been called. Not called on reboot.
  * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
  * or when the module is unloaded.
  */
 SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_destroy, NULL);
 VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_uninit, NULL);
 /* end of file */
 /* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_bpf.c b/sys/netpfil/ipfw/ip_fw_bpf.c
index a6a6be95e507..29f988b7525a 100644
--- a/sys/netpfil/ipfw/ip_fw_bpf.c
+++ b/sys/netpfil/ipfw/ip_fw_bpf.c
@@ -1,208 +1,209 @@
 /*-
  * Copyright (c) 2016 Yandex LLC
  * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/socket.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_pflog.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 #include <net/bpf.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_var.h>
 #include <netpfil/ipfw/ip_fw_private.h>
 
 /*
  * Per-vnet state of the ipfw bpf tap devices.
  *
  * log_if   - the "ipfw" clone (DLT_EN10MB), set by ipfw_clone_create().
  * pflog_if - the "ipfwlog" clone (DLT_PFLOG), set by ipfwlog_clone_create().
  * Both pointers are reset to NULL by ipfw_clone_destroy().
  */
 VNET_DEFINE_STATIC(struct ifnet *, log_if);
 VNET_DEFINE_STATIC(struct ifnet *, pflog_if);
 /* Cloners registered in ipfw_bpf_init() and detached in ipfw_bpf_uninit(). */
 VNET_DEFINE_STATIC(struct if_clone *, ipfw_cloner);
 VNET_DEFINE_STATIC(struct if_clone *, ipfwlog_cloner);
 #define	V_ipfw_cloner		VNET(ipfw_cloner)
 #define	V_ipfwlog_cloner	VNET(ipfwlog_cloner)
 #define	V_log_if		VNET(log_if)
 #define	V_pflog_if		VNET(pflog_if)
 
 /* Interface name prefixes used for the two cloners. */
 static const char ipfwname[] = "ipfw";
 static const char ipfwlogname[] = "ipfwlog";
 
 /*
  * ioctl handler installed on the ipfw tap interfaces.
  * No request is supported; every call is rejected with EINVAL.
  */
 static int
 ipfw_bpf_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr)
 {
 	return (EINVAL);
 }
 
 /*
  * Output handler installed on the ipfw tap interfaces.
  * Nothing is ever transmitted: a non-NULL mbuf is released with
  * FREE_PKT() and success (0) is reported to the caller.
  */
 static int
 ipfw_bpf_output(struct ifnet *ifp, struct mbuf *m,
 	const struct sockaddr *dst, struct route *ro)
 {
 	if (m != NULL) {
 		FREE_PKT(m);
 	}
 
 	return (0);
 }
 
 /*
  * Destroy handler shared by both cloners.
  *
  * The interface header length tells which per-vnet pointer refers to
  * @ifp: ETHER_HDR_LEN selects V_log_if, anything else V_pflog_if.
  * The pointer is cleared first, then NET_EPOCH_WAIT() is called before
  * the interface is detached from bpf and the stack and finally freed.
  */
 static void
 ipfw_clone_destroy(struct ifnet *ifp)
 {
 	if (ifp->if_hdrlen != ETHER_HDR_LEN) {
 		V_pflog_if = NULL;
 	} else {
 		V_log_if = NULL;
 	}
 
 	NET_EPOCH_WAIT();
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 }
 
 /*
  * Create handler for the "ipfw" cloner.
  *
  * Allocates an IFT_PFLOG interface named ipfw<unit>, attaches it to the
  * stack and to bpf as DLT_EN10MB, and publishes it in V_log_if.
  *
  * Returns 0 on success, ENOSPC if if_alloc() fails, or EEXIST (after
  * undoing the attach) if V_log_if is already set.
  */
 static int
 ipfw_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct ifnet *tap;
 
 	tap = if_alloc(IFT_PFLOG);
 	if (tap == NULL)
 		return (ENOSPC);
 
 	if_initname(tap, ipfwname, unit);
 	tap->if_mtu = 65536;
 	tap->if_hdrlen = ETHER_HDR_LEN;
 	tap->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
 	tap->if_output = ipfw_bpf_output;
 	tap->if_ioctl = ipfw_bpf_ioctl;
 
 	if_attach(tap);
 	bpfattach(tap, DLT_EN10MB, ETHER_HDR_LEN);
 
 	if (V_log_if == NULL) {
 		V_log_if = tap;
 		return (0);
 	}
 
 	/* A tap interface is already registered: roll everything back. */
 	bpfdetach(tap);
 	if_detach(tap);
 	if_free(tap);
 	return (EEXIST);
 }
 
 /*
  * Create handler for the "ipfwlog" cloner.
  *
  * Allocates an IFT_PFLOG interface named ipfwlog<unit>, attaches it to
  * the stack and to bpf as DLT_PFLOG, and publishes it in V_pflog_if.
  *
  * Returns 0 on success, ENOSPC if if_alloc() fails, or EEXIST (after
  * undoing the attach) if V_pflog_if is already set.
  */
 static int
 ipfwlog_clone_create(struct if_clone *ifc, int unit, caddr_t params)
 {
 	struct ifnet *tap;
 
 	tap = if_alloc(IFT_PFLOG);
 	if (tap == NULL)
 		return (ENOSPC);
 
 	if_initname(tap, ipfwlogname, unit);
 	tap->if_mtu = 65536;
 	tap->if_hdrlen = PFLOG_HDRLEN;
 	tap->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST;
 	tap->if_output = ipfw_bpf_output;
 	tap->if_ioctl = ipfw_bpf_ioctl;
 
 	if_attach(tap);
 	bpfattach(tap, DLT_PFLOG, PFLOG_HDRLEN);
 
 	if (V_pflog_if == NULL) {
 		V_pflog_if = tap;
 		return (0);
 	}
 
 	/* A pflog-style tap is already registered: roll everything back. */
 	bpfdetach(tap);
 	if_detach(tap);
 	if_free(tap);
 	return (EEXIST);
 }
 
 /*
  * Hand a flat packet buffer of @pktlen bytes to bpf listeners on the
  * "ipfw" tap interface, if one exists.
  * Asserts that the caller is inside the network epoch.
  */
 void
 ipfw_bpf_tap(u_char *pkt, u_int pktlen)
 {
 	struct ifnet *tapif;
 
 	NET_EPOCH_ASSERT();
 
 	/* Read the pointer once; it may be cleared by ipfw_clone_destroy(). */
 	tapif = V_log_if;
 	if (tapif == NULL)
 		return;
 	BPF_TAP(tapif, pkt, pktlen);
 }
 
 /*
  * Hand mbuf chain @m to bpf listeners on the "ipfw" tap interface,
  * if one exists.
  * Asserts that the caller is inside the network epoch.
  */
 void
 ipfw_bpf_mtap(struct mbuf *m)
 {
 	struct ifnet *tapif;
 
 	NET_EPOCH_ASSERT();
 
 	/* Read the pointer once; it may be cleared by ipfw_clone_destroy(). */
 	tapif = V_log_if;
 	if (tapif == NULL)
 		return;
 	BPF_MTAP(tapif, m);
 }
 
 /*
  * Hand mbuf chain @m, preceded by the @dlen-byte header at @data, to
  * bpf listeners.
  *
  * The header length selects the tap interface: ETHER_HDR_LEN goes to
  * the "ipfw" interface (V_log_if), PFLOG_HDRLEN to the "ipfwlog"
  * interface (V_pflog_if).  Any other length panics on INVARIANTS
  * kernels and is silently ignored otherwise.
  * Asserts that the caller is inside the network epoch.
  */
 void
 ipfw_bpf_mtap2(void *data, u_int dlen, struct mbuf *m)
 {
 	struct ifnet *logif;
 
 	NET_EPOCH_ASSERT();
 	switch (dlen) {
 	case (ETHER_HDR_LEN):
 		logif = V_log_if;
 		break;
 	case (PFLOG_HDRLEN):
 		logif = V_pflog_if;
 		break;
 	default:
 #ifdef INVARIANTS
 		/* dlen is unsigned, so it is formatted with %u. */
 		panic("%s: unsupported len %u", __func__, dlen);
 #endif
 		logif = NULL;
 		break;
 	}
 
 	if (logif != NULL)
 		BPF_MTAP2(logif, data, dlen, m);
 }
 
 /*
  * Per-vnet setup of the bpf tap support: clears both tap interface
  * pointers and registers the "ipfw" and "ipfwlog" cloners.
  * The @first argument is unused.
  */
 void
 ipfw_bpf_init(int first __unused)
 {
 	V_pflog_if = NULL;
 	V_log_if = NULL;
 
 	V_ipfw_cloner = if_clone_simple(ipfwname,
 	    ipfw_clone_create, ipfw_clone_destroy, 0);
 	V_ipfwlog_cloner = if_clone_simple(ipfwlogname,
 	    ipfwlog_clone_create, ipfw_clone_destroy, 0);
 }
 
 /*
  * Per-vnet teardown of the bpf tap support: detaches the two cloners
  * registered by ipfw_bpf_init().  The @last argument is unused.
  */
 void
 ipfw_bpf_uninit(int last __unused)
 {
 	if_clone_detach(V_ipfw_cloner);
 	if_clone_detach(V_ipfwlog_cloner);
 }
diff --git a/sys/netpfil/ipfw/ip_fw_iface.c b/sys/netpfil/ipfw/ip_fw_iface.c
index 3e366db3af9a..83dc54e918b3 100644
--- a/sys/netpfil/ipfw/ip_fw_iface.c
+++ b/sys/netpfil/ipfw/ip_fw_iface.c
@@ -1,537 +1,538 @@
 /*-
  * Copyright (c) 2014 Yandex LLC.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Kernel interface tracking API.
  *
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 #include <sys/eventhandler.h>
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
 #include <netinet/ip_fw.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 /*
  * Interface-tracking name hash of a chain.  The argument is used
  * unparenthesized, so pass a plain identifier.
  */
 #define	CHAIN_TO_II(ch)		((struct namedobj_instance *)ch->ifcfg)
 
 /* Size passed to ipfw_objhash_create() for the interface name hash. */
 #define	DEFAULT_IFACES	128
 
 static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
     uint16_t ifindex);
 static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
     uint16_t ifindex);
 static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 
 /* Sockopt handlers added in ipfw_iface_init(), removed in ipfw_iface_destroy(). */
 static struct ipfw_sopt_handler	scodes[] = {
 	{ IP_FW_XIFLIST,	0,	HDIR_GET,	list_ifaces },
 };
 
 /*
  * FreeBSD Kernel interface.
  */
 static void ipfw_kifhandler(void *arg, struct ifnet *ifp);
 static int ipfw_kiflookup(char *name);
 static void iface_khandler_register(void);
 static void iface_khandler_deregister(void);
 
 /* Tags of the ifnet departure/arrival event handlers. */
 static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event;
 /* Number of vnets using the tracker; protected by vnet_mtx. */
 static int num_vnets = 0;
 static struct mtx vnet_mtx;
 
 /*
  * Checks if kernel interface is contained in our tracked
  * interface list and calls attach/detach handler.
  */
 static void
 ipfw_kifhandler(void *arg, struct ifnet *ifp)
 {
 	struct ip_fw_chain *ch;
 	struct ipfw_iface *iif;
 	struct namedobj_instance *ii;
 	uintptr_t htype;
 
 	if (V_ipfw_vnet_ready == 0)
 		return;
 
 	ch = &V_layer3_chain;
 	htype = (uintptr_t)arg;
 
 	IPFW_UH_WLOCK(ch);
 	ii = CHAIN_TO_II(ch);
 	if (ii == NULL) {
 		IPFW_UH_WUNLOCK(ch);
 		return;
 	}
 	iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0,
 	    if_name(ifp));
 	if (iif != NULL) {
 		if (htype == 1)
 			handle_ifattach(ch, iif, ifp->if_index);
 		else
 			handle_ifdetach(ch, iif, ifp->if_index);
 	}
 	IPFW_UH_WUNLOCK(ch);
 }
 
 /*
  * Count the current VNET as a user of the interface tracking API.
  * The first user also registers the ifnet arrival/departure event
  * handlers; later users only bump the counter.
  */
 static void
 iface_khandler_register(void)
 {
 	int first;
 
 	mtx_lock(&vnet_mtx);
 	first = (num_vnets == 0);
 	num_vnets++;
 	mtx_unlock(&vnet_mtx);
 
 	if (!first)
 		return;
 
 	printf("IPFW: starting up interface tracker\n");
 
 	/* Departure passes NULL, arrival passes 1; see ipfw_kifhandler(). */
 	ipfw_ifdetach_event = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    ipfw_kifhandler, NULL, EVENTHANDLER_PRI_ANY);
 	ipfw_ifattach_event = EVENTHANDLER_REGISTER(ifnet_arrival_event,
 	    ipfw_kifhandler, (void*)((uintptr_t)1), EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Drop the current VNET as a user of the interface tracking API.
  * When the last user goes away the ifnet arrival/departure event
  * handlers are deregistered.
  */
 static void
 iface_khandler_deregister(void)
 {
 	int last;
 
 	mtx_lock(&vnet_mtx);
 	last = (num_vnets == 1);
 	num_vnets--;
 	mtx_unlock(&vnet_mtx);
 
 	if (!last)
 		return;
 
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ipfw_ifattach_event);
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, ipfw_ifdetach_event);
 }
 
 /*
  * Look up the kernel interface called @name.
  *
  * Returns its if_index, or 0 when no such interface is found.  The
  * reference taken by ifunit_ref() is released before returning.
  */
 static int
 ipfw_kiflookup(char *name)
 {
 	struct ifnet *ifp;
 	int idx;
 
 	ifp = ifunit_ref(name);
 	if (ifp == NULL)
 		return (0);
 
 	idx = ifp->if_index;
 	if_rele(ifp);
 	return (idx);
 }
 
 /*
  * Global ipfw startup hook for the interface tracker.
  * Initialization of the tracker itself is lazy, so only the vnet
  * counter mutex is set up and the sockopt handlers are added here.
  * Always returns 0.
  */
 int
 ipfw_iface_init(void)
 {
 	mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF);
 	IPFW_ADD_SOPT_HANDLER(1, scodes);
 
 	return (0);
 }
 
 /*
  * Global ipfw destroy hook for the interface tracker.
  * Removes the sockopt handlers and destroys the vnet counter mutex,
  * undoing ipfw_iface_init().
  */
 void
 ipfw_iface_destroy(void)
 {
 	IPFW_DEL_SOPT_HANDLER(1, scodes);
 	mtx_destroy(&vnet_mtx);
 }
 
 /*
  * Lazy per-chain initialization of the interface tracker.
  *
  * A name hash is created outside the lock and installed into
  * ch->ifcfg only if the chain does not have one yet.  The caller that
  * installs it also registers the kernel event handlers; a caller that
  * lost the race frees its unused hash.
  */
 static void
 vnet_ipfw_iface_init(struct ip_fw_chain *ch)
 {
 	struct namedobj_instance *spare;
 
 	spare = ipfw_objhash_create(DEFAULT_IFACES);
 
 	IPFW_UH_WLOCK(ch);
 	if (ch->ifcfg == NULL) {
 		/* Ownership moves to the chain. */
 		ch->ifcfg = spare;
 		spare = NULL;
 	}
 	IPFW_UH_WUNLOCK(ch);
 
 	if (spare == NULL) {
 		/* We're the first ones. Init kernel hooks. */
 		iface_khandler_register();
 		return;
 	}
 
 	/* Already initialized. Free namehash. */
 	ipfw_objhash_destroy(spare);
 }
 
 /*
  * ipfw_objhash_foreach() callback used on teardown: frees one tracked
  * interface object.  All consumers are assumed to have been detached
  * already.  Always returns 0.
  */
 static int
 destroy_iface(struct namedobj_instance *ii, struct named_object *no,
     void *arg)
 {
 	free(no, M_IPFW);
 
 	return (0);
 }
 
 /*
  * Per-VNET ipfw detach hook for the interface tracker.
  *
  * Unhooks the name hash from the chain under the UH write lock.  If a
  * hash was installed, every tracked interface is freed, the hash is
  * destroyed and this vnet is dropped from the event handler users.
  */
 void
 vnet_ipfw_iface_destroy(struct ip_fw_chain *ch)
 {
 	struct namedobj_instance *ifhash;
 
 	IPFW_UH_WLOCK(ch);
 	ifhash = CHAIN_TO_II(ch);
 	ch->ifcfg = NULL;
 	IPFW_UH_WUNLOCK(ch);
 
 	if (ifhash == NULL)
 		return;
 
 	ipfw_objhash_foreach(ifhash, destroy_iface, ch);
 	ipfw_objhash_destroy(ifhash);
 	iface_khandler_deregister();
 }
 
 /*
  * Notify the subsystem that we are interested in tracking
  * interface @name. This function has to be called without
  * holding any locks to permit allocating the necessary states
  * for proper interface tracking.
  *
  * On success @ic->iface points to the (possibly newly created)
  * tracked interface object, whose reference count has been bumped.
  *
  * Returns 0 on success, EINVAL if @name does not fit into the
  * tracked object's ifname buffer.
  */
 int
 ipfw_iface_ref(struct ip_fw_chain *ch, char *name,
     struct ipfw_ifc *ic)
 {
 	struct namedobj_instance *ii;
 	struct ipfw_iface *iif, *tmp;
 
 	/* sizeof() does not evaluate iif, so using it uninitialized is fine. */
 	if (strlen(name) >= sizeof(iif->ifname))
 		return (EINVAL);
 
 	IPFW_UH_WLOCK(ch);
 
 	ii = CHAIN_TO_II(ch);
 	if (ii == NULL) {
 		/*
 		 * First request to subsystem.
 		 * Let's perform init.
 		 * The lock is dropped around the init call and the hash
 		 * pointer is re-read afterwards.
 		 */
 		IPFW_UH_WUNLOCK(ch);
 		vnet_ipfw_iface_init(ch);
 		IPFW_UH_WLOCK(ch);
 		ii = CHAIN_TO_II(ch);
 	}
 
 	iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name);
 
 	if (iif != NULL) {
 		/* Already tracked: just take another reference. */
 		iif->no.refcnt++;
 		ic->iface = iif;
 		IPFW_UH_WUNLOCK(ch);
 		return (0);
 	}
 
 	/* Drop the lock: the M_WAITOK allocation below is done unlocked. */
 	IPFW_UH_WUNLOCK(ch);
 
 	/* Not found. Let's create one */
 	iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&iif->consumers);
 	iif->no.name = iif->ifname;
 	strlcpy(iif->ifname, name, sizeof(iif->ifname));
 
 	/*
 	 * Ref & link to the list.
 	 *
 	 * We assume  ifnet_arrival_event / ifnet_departure_event
 	 * are not holding any locks.
 	 */
 	iif->no.refcnt = 1;
 	IPFW_UH_WLOCK(ch);
 
 	/* Re-check: the lock was dropped while allocating. */
 	tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name);
 	if (tmp != NULL) {
 		/* Interface has been created since unlock. Ref and return */
 		tmp->no.refcnt++;
 		ic->iface = tmp;
 		IPFW_UH_WUNLOCK(ch);
 		free(iif, M_IPFW);
 		return (0);
 	}
 
 	/* A non-zero ifindex means the kernel interface exists right now. */
 	iif->ifindex = ipfw_kiflookup(name);
 	if (iif->ifindex != 0)
 		iif->resolved = 1;
 
 	ipfw_objhash_add(ii, &iif->no);
 	ic->iface = iif;
 
 	IPFW_UH_WUNLOCK(ch);
 
 	return (0);
 }
 
 /*
  * Append @ic to the consumer list of the interface it references.
  * Both the UH write lock and the chain write lock must be held.
  * If the interface is currently resolved, the consumer callback is
  * invoked right away with the current ifindex.
  */
 void
 ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic)
 {
 	struct ipfw_iface *tracked;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 	IPFW_WLOCK_ASSERT(ch);
 
 	tracked = ic->iface;
 	TAILQ_INSERT_TAIL(&tracked->consumers, ic, next);
 
 	if (tracked->resolved == 0)
 		return;
 	ic->cb(ch, ic->cbdata, tracked->ifindex);
 }
 
 /*
  * Remove interface tracker object @ic from the consumer list of the
  * interface it references.  The UH write lock must be held.
  */
 void
 ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic)
 {
 	struct ipfw_iface *tracked;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	tracked = ic->iface;
 	TAILQ_REMOVE(&tracked->consumers, ic, next);
 }
 
 /*
  * Drop the reference @ic holds on its tracked interface and clear
  * @ic->iface.  The UH write lock must be held.
  * The interface object itself is never freed here.
  */
 void
 ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic)
 {
 	struct ipfw_iface *tracked;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	tracked = ic->iface;
 	tracked->no.refcnt--;
 	ic->iface = NULL;
 	/* TODO: check for references & delete */
 }
 
 /*
  * Interface arrival handler.
  *
  * Marks @iif as resolved with the new @ifindex, bumps its generation
  * counter and then, under the chain write lock, tells every consumer
  * the new index.  The UH write lock must be held by the caller.
  */
 static void
 handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
     uint16_t ifindex)
 {
 	struct ipfw_ifc *consumer;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	iif->ifindex = ifindex;
 	iif->resolved = 1;
 	iif->gencnt++;
 
 	IPFW_WLOCK(ch);
 	TAILQ_FOREACH(consumer, &iif->consumers, next) {
 		consumer->cb(ch, consumer->cbdata, iif->ifindex);
 	}
 	IPFW_WUNLOCK(ch);
 }
 
 /*
  * Interface departure handler.
  *
  * Under the chain write lock every consumer is called with index 0;
  * afterwards @iif is marked unresolved, its ifindex is cleared and its
  * generation counter is bumped.  The UH write lock must be held by
  * the caller.  The @ifindex argument is not used.
  */
 static void
 handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif,
     uint16_t ifindex)
 {
 	struct ipfw_ifc *consumer;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	IPFW_WLOCK(ch);
 	TAILQ_FOREACH(consumer, &iif->consumers, next) {
 		consumer->cb(ch, consumer->cbdata, 0);
 	}
 	IPFW_WUNLOCK(ch);
 
 	iif->ifindex = 0;
 	iif->resolved = 0;
 	iif->gencnt++;
 }
 
 /*
  * Argument bundle passed by list_ifaces() to export_iface_internal()
  * through ipfw_objhash_foreach().
  */
 struct dump_iface_args {
 	struct ip_fw_chain *ch;		/* chain being listed */
 	struct sockopt_data *sd;	/* reply buffer the entries go into */
 };
 
 /*
  * ipfw_objhash_foreach() callback: exports one tracked interface as
  * an ipfw_iface_info record into the sockopt reply buffer.
  *
  * @arg is a struct dump_iface_args.  The caller has already checked
  * that the buffer is large enough, hence the KASSERT.  Always returns 0.
  */
 static int
 export_iface_internal(struct namedobj_instance *ii, struct named_object *no,
     void *arg)
 {
 	struct dump_iface_args *args;
 	struct ipfw_iface *tracked;
 	ipfw_iface_info *info;
 
 	args = (struct dump_iface_args *)arg;
 	tracked = (struct ipfw_iface *)no;
 
 	info = (ipfw_iface_info *)ipfw_get_sopt_space(args->sd, sizeof(*info));
 	KASSERT(info != NULL, ("previously checked buffer is not enough"));
 
 	strlcpy(info->ifname, tracked->ifname, sizeof(info->ifname));
 	info->ifindex = tracked->ifindex;
 	info->refcnt = tracked->no.refcnt;
 	info->gencnt = tracked->gencnt;
 	if (tracked->resolved)
 		info->flags |= IPFW_IFFLAG_RESOLVED;
 
 	return (0);
 }
 
 /*
  * Lists all interfaces currently tracked by ipfw.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
  * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ]
  *
  * Returns 0 on success, EINVAL on a malformed request, or ENOMEM when
  * the caller's buffer is too small (the header then carries the
  * required size).
  */
 static int
 list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct namedobj_instance *ifhash;
 	struct _ipfw_obj_lheader *hdr;
 	struct dump_iface_args da;
 	uint32_t count, needed;
 	int fits;
 
 	hdr = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,
 	    sizeof(*hdr));
 	if (hdr == NULL || sd->valsize < hdr->size)
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(ch);
 	ifhash = CHAIN_TO_II(ch);
 	count = (ifhash != NULL) ? ipfw_objhash_count(ifhash) : 0;
 	needed = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader);
 
 	/* Fill in header regardless of buffer size */
 	hdr->count = count;
 	hdr->objsize = sizeof(ipfw_iface_info);
 
 	/* Compare against the caller's size before overwriting it. */
 	fits = (needed <= hdr->size);
 	hdr->size = needed;
 	if (!fits) {
 		IPFW_UH_RUNLOCK(ch);
 		return (ENOMEM);
 	}
 
 	da.sd = sd;
 	da.ch = ch;
 	if (ifhash != NULL)
 		ipfw_objhash_foreach(ifhash, export_iface_internal, &da);
 	IPFW_UH_RUNLOCK(ch);
 
 	return (0);
 }
diff --git a/sys/netpfil/ipfw/ip_fw_log.c b/sys/netpfil/ipfw/ip_fw_log.c
index d230d243a0f7..b5d2f998adc8 100644
--- a/sys/netpfil/ipfw/ip_fw_log.c
+++ b/sys/netpfil/ipfw/ip_fw_log.c
@@ -1,426 +1,427 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Logging support for ipfw
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/udp.h>
 #include <netinet/tcp.h>
 
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #ifdef INET6
 #include <netinet6/in6_var.h>	/* ip6_sprintf() */
 #endif
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
  * Other macros just cast void * into the appropriate type
  *
  * L3HDR skips ip_hl 32-bit words from the start of the IPv4 header.
  */
 #define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 #define	ICMP(p)		((struct icmphdr *)(p))
 #define	ICMP6(p)	((struct icmp6_hdr *)(p))
 
 /*
  * Argument helpers for snprintf().  Outside __APPLE__ they expand to
  * TWO arguments: the write position and the remaining space, which
  * is 0 once len reaches sizeof(buf).  buf must be a real array, since
  * sizeof(buf) is used.
  */
 #ifdef __APPLE__
 #undef snprintf
 #define snprintf	sprintf
 #define SNPARGS(buf, len) buf + len
 #define SNP(buf) buf
 #else	/* !__APPLE__ */
 #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0
 #define SNP(buf) buf, sizeof(buf)
 #endif /* !__APPLE__ */
 
 /* Relies on a variable named `chain' being in scope at the use site. */
 #define	TARG(k, f)	IP_FW_ARG_TABLEARG(chain, k, f)
 /*
  * We enter here when we have a rule with O_LOG.
  * XXX this function alone takes about 2Kbytes of code!
  *
  * chain    - rule chain; used by TARG() and SRV_OBJECT().
  * f        - matching rule, or NULL for a packet logged without a rule.
  * hlen     - header length; 0 means the packet is treated as non-IP.
  * args     - packet arguments (mbuf or flat buffer, flow id, flags, ifp).
  * offset   - fragment offset; in the INET6 build the IP6F_MORE_FRAG bit
  *            is extracted from it before it is masked.
  * tablearg - table argument used to resolve TARG() values.
  * ip       - pointer to the IP header (cast to ip6_hdr for IPv6 flows).
  *
  * When V_fw_verbose is 0 the packet is only handed to the bpf tap;
  * otherwise a textual record is written through log().
  */
 void
 ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen,
     struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip)
 {
 	char *action;
 	int limit_reached = 0;
 	char action2[92], proto[128], fragment[32];
 
 	/* Non-verbose mode: feed bpf listeners and return without logging. */
 	if (V_fw_verbose == 0) {
 		if (args->flags & IPFW_ARGS_LENMASK)
 			ipfw_bpf_tap(args->mem, IPFW_ARGS_LENGTH(args->flags));
 		else if (args->flags & IPFW_ARGS_ETHER)
 			/* layer2, use orig hdr */
 			ipfw_bpf_mtap(args->m);
 		else {
 			/* Add fake header. Later we will store
 			 * more info in the header.
 			 */
 			if (ip->ip_v == 4)
 				ipfw_bpf_mtap2("DDDDDDSSSSSS\x08\x00",
 				    ETHER_HDR_LEN, args->m);
 			else if (ip->ip_v == 6)
 				ipfw_bpf_mtap2("DDDDDDSSSSSS\x86\xdd",
 				    ETHER_HDR_LEN, args->m);
 			else
 				/* Obviously bogus EtherType. */
 				ipfw_bpf_mtap2("DDDDDDSSSSSS\xff\xff",
 				    ETHER_HDR_LEN, args->m);
 		}
 		return;
 	}
 	/* the old 'log' function */
 	fragment[0] = '\0';
 	proto[0] = '\0';
 
 	if (f == NULL) {	/* bogus pkt */
 		/* Rate limit rule-less messages with the global counter. */
 		if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit)
 			return;
 		V_norule_counter++;
 		if (V_norule_counter == V_verbose_limit)
 			limit_reached = V_verbose_limit;
 		action = "Refuse";
 	} else {	/* O_LOG is the first action, find the real one */
 		ipfw_insn *cmd = ACTION_PTR(f);
 		ipfw_insn_log *l = (ipfw_insn_log *)cmd;
 
 		/* Per-rule limit: stop once a limited rule has no logs left. */
 		if (l->max_log != 0 && l->log_left == 0)
 			return;
 		l->log_left--;
 		if (l->log_left == 0)
 			limit_reached = l->max_log;
 		cmd += F_LEN(cmd);	/* point to first action */
 		if (cmd->opcode == O_ALTQ) {
 			ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 			snprintf(SNPARGS(action2, 0), "Altq %d",
 				altq->qid);
 			cmd += F_LEN(cmd);
 		}
 		/* Skip one O_PROB or O_TAG instruction, if present. */
 		if (cmd->opcode == O_PROB || cmd->opcode == O_TAG)
 			cmd += F_LEN(cmd);
 
 		/*
 		 * Default to the scratch buffer; cases with a fixed name
 		 * point action at a string literal instead.
 		 */
 		action = action2;
 		switch (cmd->opcode) {
 		case O_DENY:
 			action = "Deny";
 			break;
 
 		case O_REJECT:
 			if (cmd->arg1==ICMP_REJECT_RST)
 				action = "Reset";
 			else if (cmd->arg1==ICMP_REJECT_ABORT)
 				action = "Abort";
 			else if (cmd->arg1==ICMP_UNREACH_HOST)
 				action = "Reject";
 			else
 				snprintf(SNPARGS(action2, 0), "Unreach %d",
 					cmd->arg1);
 			break;
 
 		case O_UNREACH6:
 			if (cmd->arg1==ICMP6_UNREACH_RST)
 				action = "Reset";
 			else if (cmd->arg1==ICMP6_UNREACH_ABORT)
 				action = "Abort";
 			else
 				snprintf(SNPARGS(action2, 0), "Unreach %d",
 					cmd->arg1);
 			break;
 
 		case O_ACCEPT:
 			action = "Accept";
 			break;
 		case O_COUNT:
 			action = "Count";
 			break;
 		case O_DIVERT:
 			snprintf(SNPARGS(action2, 0), "Divert %d",
 				TARG(cmd->arg1, divert));
 			break;
 		case O_TEE:
 			snprintf(SNPARGS(action2, 0), "Tee %d",
 				TARG(cmd->arg1, divert));
 			break;
 		case O_SETDSCP:
 			snprintf(SNPARGS(action2, 0), "SetDscp %d",
 				TARG(cmd->arg1, dscp) & 0x3F);
 			break;
 		case O_SETFIB:
 			snprintf(SNPARGS(action2, 0), "SetFib %d",
 				TARG(cmd->arg1, fib) & 0x7FFF);
 			break;
 		case O_SKIPTO:
 			snprintf(SNPARGS(action2, 0), "SkipTo %d",
 				TARG(cmd->arg1, skipto));
 			break;
 		case O_PIPE:
 			snprintf(SNPARGS(action2, 0), "Pipe %d",
 				TARG(cmd->arg1, pipe));
 			break;
 		case O_QUEUE:
 			snprintf(SNPARGS(action2, 0), "Queue %d",
 				TARG(cmd->arg1, pipe));
 			break;
 		case O_FORWARD_IP: {
 			char buf[INET_ADDRSTRLEN];
 			ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd;
 			int len;
 			struct in_addr dummyaddr;
 			/* INADDR_ANY means the address comes from tablearg. */
 			if (sa->sa.sin_addr.s_addr == INADDR_ANY)
 				dummyaddr.s_addr = htonl(tablearg);
 			else
 				dummyaddr.s_addr = sa->sa.sin_addr.s_addr;
 
 			len = snprintf(SNPARGS(action2, 0), "Forward to %s",
 				inet_ntoa_r(dummyaddr, buf));
 
 			if (sa->sa.sin_port)
 				snprintf(SNPARGS(action2, len), ":%d",
 				    sa->sa.sin_port);
 			}
 			break;
 #ifdef INET6
 		case O_FORWARD_IP6: {
 			char buf[INET6_ADDRSTRLEN];
 			ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd;
 			int len;
 
 			len = snprintf(SNPARGS(action2, 0), "Forward to [%s]",
 			    ip6_sprintf(buf, &sa->sa.sin6_addr));
 
 			if (sa->sa.sin6_port)
 				snprintf(SNPARGS(action2, len), ":%u",
 				    sa->sa.sin6_port);
 			}
 			break;
 #endif
 		case O_NETGRAPH:
 			snprintf(SNPARGS(action2, 0), "Netgraph %d",
 				cmd->arg1);
 			break;
 		case O_NGTEE:
 			snprintf(SNPARGS(action2, 0), "Ngtee %d",
 				cmd->arg1);
 			break;
 		case O_NAT:
 			action = "Nat";
  			break;
 		case O_REASS:
 			action = "Reass";
 			break;
 		case O_CALLRETURN:
 			/* F_NOT on this opcode distinguishes return from call. */
 			if (cmd->len & F_NOT)
 				action = "Return";
 			else
 				snprintf(SNPARGS(action2, 0), "Call %d",
 				    cmd->arg1);
 			break;
 		case O_EXTERNAL_ACTION:
 			snprintf(SNPARGS(action2, 0), "Eaction %s",
 			    ((struct named_object *)SRV_OBJECT(chain,
 			    cmd->arg1))->name);
 			break;
 		default:
 			action = "UNKNOWN";
 			break;
 		}
 	}
 
 	/* Build the protocol/address part of the message. */
 	if (hlen == 0) {	/* non-ip */
 		snprintf(SNPARGS(proto, 0), "MAC");
 
 	} else {
 		int len;
 #ifdef INET6
 		/* Two extra bytes for the enclosing brackets. */
 		char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2];
 #else
 		char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
 #endif
 		struct icmphdr *icmp;
 		struct tcphdr *tcp;
 		struct udphdr *udp;
 #ifdef INET6
 		struct ip6_hdr *ip6 = NULL;
 		struct icmp6_hdr *icmp6;
 		u_short ip6f_mf;
 #endif
 		src[0] = '\0';
 		dst[0] = '\0';
 #ifdef INET6
 		/* Split the more-fragments bit from the offset bits. */
 		ip6f_mf = offset & IP6F_MORE_FRAG;
 		offset &= IP6F_OFF_MASK;
 
 		if (IS_IP6_FLOW_ID(&(args->f_id))) {
 			char ip6buf[INET6_ADDRSTRLEN];
 			snprintf(src, sizeof(src), "[%s]",
 			    ip6_sprintf(ip6buf, &args->f_id.src_ip6));
 			snprintf(dst, sizeof(dst), "[%s]",
 			    ip6_sprintf(ip6buf, &args->f_id.dst_ip6));
 
 			ip6 = (struct ip6_hdr *)ip;
 			/* For IPv6 the upper-layer header is hlen bytes in. */
 			tcp = (struct tcphdr *)(((char *)ip) + hlen);
 			udp = (struct udphdr *)(((char *)ip) + hlen);
 		} else
 #endif
 		{
 			tcp = L3HDR(struct tcphdr, ip);
 			udp = L3HDR(struct udphdr, ip);
 
 			inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src));
 			inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst));
 		}
 
 		/* Ports and ICMP type/code are only printed when offset is 0. */
 		switch (args->f_id.proto) {
 		case IPPROTO_TCP:
 			len = snprintf(SNPARGS(proto, 0), "TCP %s", src);
 			if (offset == 0)
 				snprintf(SNPARGS(proto, len), ":%d %s:%d",
 				    ntohs(tcp->th_sport),
 				    dst,
 				    ntohs(tcp->th_dport));
 			else
 				snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 
 		case IPPROTO_UDP:
 		case IPPROTO_UDPLITE:
 			len = snprintf(SNPARGS(proto, 0), "UDP%s%s",
 			    args->f_id.proto == IPPROTO_UDP ? " ": "Lite ",
 			    src);
 			if (offset == 0)
 				snprintf(SNPARGS(proto, len), ":%d %s:%d",
 				    ntohs(udp->uh_sport),
 				    dst,
 				    ntohs(udp->uh_dport));
 			else
 				snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 
 		case IPPROTO_ICMP:
 			icmp = L3HDR(struct icmphdr, ip);
 			if (offset == 0)
 				len = snprintf(SNPARGS(proto, 0),
 				    "ICMP:%u.%u ",
 				    icmp->icmp_type, icmp->icmp_code);
 			else
 				len = snprintf(SNPARGS(proto, 0), "ICMP ");
 			len += snprintf(SNPARGS(proto, len), "%s", src);
 			snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 #ifdef INET6
 		case IPPROTO_ICMPV6:
 			icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen);
 			if (offset == 0)
 				len = snprintf(SNPARGS(proto, 0),
 				    "ICMPv6:%u.%u ",
 				    icmp6->icmp6_type, icmp6->icmp6_code);
 			else
 				len = snprintf(SNPARGS(proto, 0), "ICMPv6 ");
 			len += snprintf(SNPARGS(proto, len), "%s", src);
 			snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 #endif
 		default:
 			len = snprintf(SNPARGS(proto, 0), "P:%d %s",
 			    args->f_id.proto, src);
 			snprintf(SNPARGS(proto, len), " %s", dst);
 			break;
 		}
 
 		/* Describe fragmentation, if any, in the fragment buffer. */
 #ifdef INET6
 		if (IS_IP6_FLOW_ID(&(args->f_id))) {
 			if (offset || ip6f_mf)
 				snprintf(SNPARGS(fragment, 0),
 				    " (frag %08x:%d@%d%s)",
 				    args->f_id.extra,
 				    ntohs(ip6->ip6_plen) - hlen,
 				    ntohs(offset) << 3, ip6f_mf ? "+" : "");
 		} else
 #endif
 		{
 			int ipoff, iplen;
 			ipoff = ntohs(ip->ip_off);
 			iplen = ntohs(ip->ip_len);
 			if (ipoff & (IP_MF | IP_OFFMASK))
 				snprintf(SNPARGS(fragment, 0),
 				    " (frag %d:%d@%d%s)",
 				    ntohs(ip->ip_id), iplen - (ip->ip_hl << 2),
 				    offset << 3,
 				    (ipoff & IP_MF) ? "+" : "");
 		}
 	}
 	/* Emit the record; rule number -1 stands for "no rule". */
 #ifdef __FreeBSD__
 	log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n",
 	    f ? f->rulenum : -1, action, proto,
 	    args->flags & IPFW_ARGS_OUT ? "out" : "in", args->ifp->if_xname,
 	    fragment);
 #else
 	log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n",
 	    f ? f->rulenum : -1, action, proto, fragment);
 #endif
 	/* Announce once that the logging limit has just been hit. */
 	if (limit_reached)
 		log(LOG_SECURITY | LOG_NOTICE,
 		    "ipfw: limit %d reached on entry %d\n",
 		    limit_reached, f ? f->rulenum : -1);
 }
 /* end of file */
diff --git a/sys/netpfil/ipfw/ip_fw_nat.c b/sys/netpfil/ipfw/ip_fw_nat.c
index 4dfe45494e2c..e802eb497bbb 100644
--- a/sys/netpfil/ipfw/ip_fw_nat.c
+++ b/sys/netpfil/ipfw/ip_fw_nat.c
@@ -1,1248 +1,1249 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 Paolo Pisati
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 
 #include <netinet/libalias/alias.h>
 #include <netinet/libalias/alias_local.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 /*
  * One LSNAT server-pool entry hanging off a redirect.  Each entry is handed
  * to LibAliasAddServer() when the owning redirect is configured.
  */
 struct cfg_spool {
 	LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
 	struct in_addr          addr;		/* server address */
 	uint16_t		port;		/* server port; htons() is applied on use */
 };
 
 /*
  * Nat redirect configuration.
  *
  * Kernel-side copy of one redirect rule.  Port numbers are run through
  * htons() when they are passed to libalias, so they are kept unconverted
  * here.
  */
 struct cfg_redir {
 	LIST_ENTRY(cfg_redir)	_next;	/* chain of redir instances */
 	uint16_t		mode;	/* type of redirect mode */
 	uint16_t		proto;	/* protocol: tcp/udp */
 	struct in_addr		laddr;	/* local ip address */
 	struct in_addr		paddr;	/* public ip address */
 	struct in_addr		raddr;	/* remote ip address */
 	uint16_t		lport;	/* local port */
 	uint16_t		pport;	/* public port */
 	uint16_t		rport;	/* remote port	*/
 	uint16_t		pport_cnt;	/* number of public ports */
 	uint16_t		rport_cnt;	/* number of remote ports */
 	struct alias_link	**alink;	/* libalias links created for this redir */
 	u_int16_t		spool_cnt; /* num of entry in spool chain */
 	/* chain of spool instances */
 	LIST_HEAD(spool_chain, cfg_spool) spool_chain;
 };
 
 /*
  * Nat configuration data struct.
  *
  * One instance per configured nat; instances are linked on the chain's
  * nat list and each owns its own libalias handle.
  */
 struct cfg_nat {
 	/* chain of nat instances */
 	LIST_ENTRY(cfg_nat)	_next;
 	int			id;		/* nat id  */
 	struct in_addr		ip;		/* nat ip address */
 	struct libalias		*lib;		/* libalias instance */
 	int			mode;		/* aliasing mode */
 	int			redir_cnt; /* number of entries in redir chain */
 	/* chain of redir instances */
 	LIST_HEAD(redir_chain, cfg_redir) redir_chain;  
 	char			if_name[IF_NAMESIZE];	/* interface name */
 	u_short			alias_port_lo;	/* low range for port aliasing */
 	u_short			alias_port_hi;	/* high range for port aliasing */
 };
 
 /*
  * Handle of the ifaddr_event registration; stored by ipfw_nat_init() and
  * used by ipfw_nat_destroy() to deregister the handler.
  */
 static eventhandler_tag ifaddr_event_tag;
 
 /*
  * ifaddr_event handler: refresh the alias address of every nat instance
  * that is bound (by interface name) to the interface whose addresses
  * changed.
  *
  * For each AF_INET address found on @ifp the instance's ip is overwritten
  * and pushed to libalias, so the address visited last is the one that
  * remains in effect.  Does nothing until both ipfw and ipfw_nat are marked
  * ready for the current vnet.
  */
 static void
 ifaddr_change(void *arg __unused, struct ifnet *ifp)
 {
 	struct cfg_nat *ptr;
 	struct ifaddr *ifa;
 	struct ip_fw_chain *chain;
 
 	KASSERT(curvnet == ifp->if_vnet,
 	    ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet));
 
 	if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0)
 		return;
 
 	chain = &V_layer3_chain;
 	/* The UH write lock is held across the whole walk of the nat list. */
 	IPFW_UH_WLOCK(chain);
 	/* Check every nat entry... */
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		struct epoch_tracker et;
 
 		/* ...using nic 'ifp->if_xname' as dynamic alias address. */
 		if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
 			continue;
 		/* The address list is walked inside a net epoch section. */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr == NULL)
 				continue;
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			/* The chain write lock is taken only around the update. */
 			IPFW_WLOCK(chain);
 			ptr->ip = ((struct sockaddr_in *)
 			    (ifa->ifa_addr))->sin_addr;
 			LibAliasSetAddress(ptr->lib, ptr->ip);
 			IPFW_WUNLOCK(chain);
 		}
 		NET_EPOCH_EXIT(et);
 	}
 	IPFW_UH_WUNLOCK(chain);
 }
 
 /*
  * Forget the nat instance pointers cached in O_NAT rule actions.
  *
  * With ix >= 0 only actions currently pointing at nat instance 'ix' are
  * reset; with ix < 0 every cached pointer is reset.  The chain write lock
  * must be held by the caller.
  */
 static void
 flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
 {
 	int idx;
 
 	IPFW_WLOCK_ASSERT(chain);
 	for (idx = 0; idx < chain->n_rules; idx++) {
 		ipfw_insn_nat *act;
 
 		act = (ipfw_insn_nat *)ipfw_get_action(chain->map[idx]);
 		/* Only nat actions with a cached instance are of interest. */
 		if (act->o.opcode != O_NAT || act->nat == NULL)
 			continue;
 		/* A specific id was requested and this is a different one. */
 		if (ix >= 0 && act->nat->id != ix)
 			continue;
 		act->nat = NULL;
 	}
 }
 
 /*
  * Tear down the redirect entries on @head, which belong to nat instance @n.
  *
  * For every entry with a known mode the libalias links are deleted, the
  * attached spool entries and the link array are freed, and the entry itself
  * is unlinked and freed.  An entry with an unknown mode is only reported
  * with printf() and is left on the list untouched.
  */
 static void
 del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
 {
 	struct cfg_redir *r, *tmp_r;
 	struct cfg_spool *s, *tmp_s;
 	int i, num;
 
 	/* _SAFE variant: entries are removed while the list is walked. */
 	LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
 		num = 1; /* Number of alias_link to delete. */
 		switch (r->mode) {
 		case NAT44_REDIR_PORT:
 			/* Port redirects hold one link per public port. */
 			num = r->pport_cnt;
 			/* FALLTHROUGH */
 		case NAT44_REDIR_ADDR:
 		case NAT44_REDIR_PROTO:
 			/* Delete all libalias redirect entry. */
 			for (i = 0; i < num; i++)
 				LibAliasRedirectDelete(n->lib, r->alink[i]);
 			/* Del spool cfg if any. */
 			LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
 				LIST_REMOVE(s, _next);
 				free(s, M_IPFW);
 			}
 			free(r->alink, M_IPFW);
 			LIST_REMOVE(r, _next);
 			free(r, M_IPFW);
 			break;
 		default:
 			printf("unknown redirect mode: %u\n", r->mode);
 			/* XXX - panic?!?!? */
 			break;
 		}
 	}
 }
 
 /*
  * Build the redirect and spool lists of nat instance @ptr from a serialized
  * configuration.
  *
  * @buf holds ptr->redir_cnt records back to back; each record is a
  * struct nat44_cfg_redir immediately followed by its spool_cnt
  * struct nat44_cfg_spool entries.  The buffer length is not passed in, so
  * the caller is responsible for having validated it.
  *
  * Returns 0 when every record was installed.  Returns EINVAL as soon as
  * libalias refuses a redirect (or the mode is unknown); records installed
  * before the failing one stay hooked, nothing of the failing record is left
  * behind in libalias, and ptr->redir_cnt is trimmed to the number of
  * records actually on the chain.
  */
 static int
 add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
 {
 	struct cfg_redir *r;
 	struct cfg_spool *s;
 	struct nat44_cfg_redir *ser_r;
 	struct nat44_cfg_spool *ser_s;
 	int cnt, off, i, nlinks;
 
 	for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
 		ser_r = (struct nat44_cfg_redir *)&buf[off];
 		r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO);
 		r->mode = ser_r->mode;
 		r->laddr = ser_r->laddr;
 		r->paddr = ser_r->paddr;
 		r->raddr = ser_r->raddr;
 		r->lport = ser_r->lport;
 		r->pport = ser_r->pport;
 		r->rport = ser_r->rport;
 		r->pport_cnt = ser_r->pport_cnt;
 		r->rport_cnt = ser_r->rport_cnt;
 		r->proto = ser_r->proto;
 		r->spool_cnt = ser_r->spool_cnt;
 		LIST_INIT(&r->spool_chain);
 		off += sizeof(struct nat44_cfg_redir);
 		/*
 		 * alink[0] is used for every mode, so always provide at
 		 * least one slot even when pport_cnt is 0.
 		 */
 		nlinks = (r->pport_cnt > 0) ? r->pport_cnt : 1;
 		r->alink = malloc(sizeof(struct alias_link *) * nlinks,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		switch (r->mode) {
 		case NAT44_REDIR_ADDR:
 			r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
 			    r->paddr);
 			break;
 		case NAT44_REDIR_PORT:
 			for (i = 0 ; i < r->pport_cnt; i++) {
 				/* If remotePort is all ports, set it to 0. */
 				u_short remotePortCopy = r->rport + i;
 				if (r->rport_cnt == 1 && r->rport == 0)
 					remotePortCopy = 0;
 				r->alink[i] = LibAliasRedirectPort(ptr->lib,
 				    r->laddr, htons(r->lport + i), r->raddr,
 				    htons(remotePortCopy), r->paddr,
 				    htons(r->pport + i), r->proto);
 				if (r->alink[i] == NULL) {
 					/*
 					 * Undo the links already created for
 					 * this range so they do not stay
 					 * active in libalias; this also
 					 * leaves alink[0] NULL for the check
 					 * below.
 					 */
 					while (i-- > 0) {
 						LibAliasRedirectDelete(
 						    ptr->lib, r->alink[i]);
 						r->alink[i] = NULL;
 					}
 					break;
 				}
 			}
 			break;
 		case NAT44_REDIR_PROTO:
 			r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
 			    r->raddr, r->paddr, r->proto);
 			break;
 		default:
 			printf("unknown redirect mode: %u\n", r->mode);
 			break;
 		}
 		if (r->alink[0] == NULL) {
 			printf("LibAliasRedirect* returned NULL\n");
 			free(r->alink, M_IPFW);
 			free(r, M_IPFW);
 			/* Keep the counter in sync with the chain contents. */
 			ptr->redir_cnt = cnt;
 			return (EINVAL);
 		}
 		/* LSNAT handling. */
 		for (i = 0; i < r->spool_cnt; i++) {
 			ser_s = (struct nat44_cfg_spool *)&buf[off];
 			s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO);
 			s->addr = ser_s->addr;
 			s->port = ser_s->port;
 			LibAliasAddServer(ptr->lib, r->alink[0],
 			    s->addr, htons(s->port));
 			off += sizeof(struct nat44_cfg_spool);
 			/* Hook spool entry. */
 			LIST_INSERT_HEAD(&r->spool_chain, s, _next);
 		}
 		/* And finally hook this redir entry. */
 		LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
 	}
 
 	return (0);
 }
 
 /*
  * Release a nat instance that is no longer reachable from the chain:
  * its redirects and spools, its libalias handle and the instance itself.
  */
 static void
 free_nat_instance(struct cfg_nat *ptr)
 {
 	struct libalias *la;
 
 	/*
 	 * del_redir_spool_cfg() still talks to ptr->lib, so it has to run
 	 * before the libalias handle is torn down.
 	 */
 	del_redir_spool_cfg(ptr, &ptr->redir_chain);
 	la = ptr->lib;
 	LibAliasUninit(la);
 	free(ptr, M_IPFW);
 }
 
 /*
  * ipfw_nat - perform mbuf header translation.
  *
  * Note V_layer3_chain has to be locked while calling ipfw_nat() in
  * 'global' operation mode (t == NULL).
  *
  * The packet is first pulled up into a single mbuf and then handed to
  * libalias.  On return args->m holds the (possibly translated) packet and
  * the result is IP_FW_NAT, or args->m is NULL and the result is IP_FW_DENY
  * when the pullup failed or the packet had to be dropped.
  */
 static int
 ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
 {
 	struct mbuf *mcl;
 	struct ip *ip;
 	/* XXX - libalias duct tape */
 	int ldt, retval, found;
 	struct ip_fw_chain *chain;
 	char *c;
 
 	ldt = 0;
 	retval = 0;
 	mcl = m_megapullup(m, m->m_pkthdr.len);
 	if (mcl == NULL) {
 		args->m = NULL;
 		return (IP_FW_DENY);
 	}
 	M_ASSERTMAPPED(mcl);
 	ip = mtod(mcl, struct ip *);
 
 	/*
 	 * XXX - Libalias checksum offload 'duct tape':
 	 *
 	 * locally generated packets have only pseudo-header checksum
 	 * calculated and libalias will break it[1], so mark them for
 	 * later fix.  Moreover there are cases when libalias modifies
 	 * tcp packet data[2], mark them for later fix too.
 	 *
 	 * [1] libalias was never meant to run in kernel, so it does
 	 * not have any knowledge about checksum offloading, and
 	 * expects a packet with a full internet checksum.
 	 * Unfortunately, packets generated locally will have just the
 	 * pseudo header calculated, and when libalias tries to adjust
 	 * the checksum it will actually compute a wrong value.
 	 *
 	 * [2] when libalias modifies tcp's data content, full TCP
 	 * checksum has to be recomputed: the problem is that
 	 * libalias does not have any idea about checksum offloading.
 	 * To work around this, we do not do checksumming in LibAlias,
 	 * but only mark the packets in th_x2 field. If we receive a
 	 * marked packet, we calculate correct checksum for it
 	 * aware of offloading.  Why such a terrible hack instead of
 	 * recalculating checksum for each packet?
 	 * Because the previous checksum was not checked!
 	 * Recalculating checksums for EVERY packet will hide ALL
 	 * transmission errors. Yes, marked packets still suffer from
 	 * this problem. But, sigh, natd(8) has this problem, too.
 	 *
 	 * TODO: -make libalias mbuf aware (so
 	 * it can handle delayed checksum and tso)
 	 */
 
 	if (mcl->m_pkthdr.rcvif == NULL &&
 	    mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
 		ldt = 1;
 
 	c = mtod(mcl, char *);
 
 	/* Check if this is 'global' instance */
 	if (t == NULL) {
 		if (args->flags & IPFW_ARGS_IN) {
 			/* Wrong direction, skip processing */
 			args->m = mcl;
 			return (IP_FW_NAT);
 		}
 
 		found = 0;
 		chain = &V_layer3_chain;
 		IPFW_RLOCK_ASSERT(chain);
 		/* Check every nat entry... */
 		LIST_FOREACH(t, &chain->nat, _next) {
 			if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0)
 				continue;
 			retval = LibAliasOutTry(t->lib, c,
 			    mcl->m_len + M_TRAILINGSPACE(mcl), 0);
 			if (retval == PKT_ALIAS_OK) {
 				/* Nat instance recognises state */
 				found = 1;
 				break;
 			}
 		}
 		if (found != 1) {
 			/* No instance found, return ignore */
 			args->m = mcl;
 			return (IP_FW_NAT);
 		}
 	} else {
 		if (args->flags & IPFW_ARGS_IN)
 			retval = LibAliasIn(t->lib, c,
 				mcl->m_len + M_TRAILINGSPACE(mcl));
 		else
 			retval = LibAliasOut(t->lib, c,
 				mcl->m_len + M_TRAILINGSPACE(mcl));
 	}
 
 	/*
 	 * We drop packet when:
 	 * 1. libalias returns PKT_ALIAS_ERROR;
 	 * 2. For incoming packets:
 	 *	a) for unresolved fragments;
 	 *	b) libalias returns PKT_ALIAS_IGNORED and
 	 *		PKT_ALIAS_DENY_INCOMING flag is set.
 	 */
 	if (retval == PKT_ALIAS_ERROR ||
 	    ((args->flags & IPFW_ARGS_IN) &&
 	    (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT ||
 	    (retval == PKT_ALIAS_IGNORED &&
 	    (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) {
 		/* XXX - should i add some logging? */
 		m_free(mcl);
 		args->m = NULL;
 		return (IP_FW_DENY);
 	}
 
 	if (retval == PKT_ALIAS_RESPOND)
 		mcl->m_flags |= M_SKIP_FIREWALL;
 	/* libalias may have changed the packet size; resync the mbuf. */
 	mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
 
 	/*
 	 * XXX - libalias checksum offload
 	 * 'duct tape' (see above)
 	 *
 	 * The transport header is located through ip_hl rather than
 	 * 'ip + 1' so that packets carrying IP options are handled
 	 * correctly; the same header length is used for the pseudo
 	 * header below.
 	 */
 
 	if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
 	    ip->ip_p == IPPROTO_TCP) {
 		struct tcphdr 	*th;
 
 		th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
 		if (th->th_x2 & (TH_RES1 >> 8))
 			ldt = 1;
 	}
 
 	if (ldt) {
 		struct tcphdr 	*th;
 		struct udphdr 	*uh;
 		uint16_t ip_len, cksum;
 
 		ip_len = ntohs(ip->ip_len);
 		cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(ip->ip_p + ip_len - (ip->ip_hl << 2)));
 
 		switch (ip->ip_p) {
 		case IPPROTO_TCP:
 			th = (struct tcphdr *)((char *)ip +
 			    (ip->ip_hl << 2));
 			/*
 			 * Maybe it was set in
 			 * libalias...
 			 */
 			th->th_x2 &= ~(TH_RES1 >> 8);
 			th->th_sum = cksum;
 			mcl->m_pkthdr.csum_data =
 			    offsetof(struct tcphdr, th_sum);
 			break;
 		case IPPROTO_UDP:
 			uh = (struct udphdr *)((char *)ip +
 			    (ip->ip_hl << 2));
 			uh->uh_sum = cksum;
 			mcl->m_pkthdr.csum_data =
 			    offsetof(struct udphdr, uh_sum);
 			break;
 		}
 		/* No hw checksum offloading: do it ourselves */
 		if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
 			in_delayed_cksum(mcl);
 			mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		}
 	}
 	args->m = mcl;
 	return (IP_FW_NAT);
 }
 
 static struct cfg_nat *
 lookup_nat(struct nat_list *l, int nat_id)
 {
 	struct cfg_nat *res;
 
 	LIST_FOREACH(res, l, _next) {
 		if (res->id == nat_id)
 			break;
 	}
 	return res;
 }
 
 static struct cfg_nat *
 lookup_nat_name(struct nat_list *l, char *name)
 {
 	struct cfg_nat *res;
 	int id;
 	char *errptr;
 
 	id = strtol(name, &errptr, 10);
 	if (id == 0 || *errptr != '\0')
 		return (NULL);
 
 	LIST_FOREACH(res, l, _next) {
 		if (res->id == id)
 			break;
 	}
 	return (res);
 }
 
 /* IP_FW3 configuration routines */
 
 /*
  * Create or reconfigure the nat instance described by @ucfg.
  *
  * An existing instance is unhooked from the chain while it is being
  * reconfigured; a missing one is allocated.  The UH lock is dropped for the
  * (sleeping) configuration work, so another configuration request for the
  * same name may slip in meanwhile.  That case is detected through the
  * chain generation counter: the competing instance is unhooked, rule
  * pointers to it are flushed, and it is freed once the locks are released.
  *
  * The redirect records are expected to follow @ucfg in memory and to have
  * been length-checked by the caller.
  */
 static void
 nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg)
 {
 	struct cfg_nat *ptr, *tcfg;
 	int gencnt;
 
 	/*
 	 * Find/create nat rule.
 	 */
 	IPFW_UH_WLOCK(chain);
 	gencnt = chain->gencnt;
 	ptr = lookup_nat_name(&chain->nat, ucfg->name);
 	if (ptr == NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		/* New rule: allocate and init new instance. */
 		ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO);
 		ptr->lib = LibAliasInit(NULL);
 		LIST_INIT(&ptr->redir_chain);
 	} else {
 		/* Entry already present: temporarily unhook it. */
 		IPFW_WLOCK(chain);
 		LIST_REMOVE(ptr, _next);
 		flush_nat_ptrs(chain, ptr->id);
 		IPFW_WUNLOCK(chain);
 		IPFW_UH_WUNLOCK(chain);
 	}
 
 	/*
 	 * Basic nat (re)configuration.
 	 */
 	ptr->id = strtol(ucfg->name, NULL, 10);
 	/*
 	 * XXX - what if this rule doesn't nat any ip and just
 	 * redirect?
 	 * do we set aliasaddress to 0.0.0.0?
 	 */
 	ptr->ip = ucfg->ip;
 	ptr->redir_cnt = ucfg->redir_cnt;
 	ptr->mode = ucfg->mode;
 	ptr->alias_port_lo = ucfg->alias_port_lo;
 	ptr->alias_port_hi = ucfg->alias_port_hi;
 	strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name));
 	LibAliasSetMode(ptr->lib, ptr->mode, ~0);
 	LibAliasSetAddress(ptr->lib, ptr->ip);
 	LibAliasSetAliasPortRange(ptr->lib, ptr->alias_port_lo, ptr->alias_port_hi);
 
 	/*
 	 * Redir and LSNAT configuration.
 	 */
 	/* Delete old cfgs. */
 	del_redir_spool_cfg(ptr, &ptr->redir_chain);
 	/*
 	 * Add new entries.  This function returns void, so a failure
 	 * reported by add_redir_spool_cfg() cannot be passed on; the
 	 * instance is installed with whatever redirects were accepted.
 	 */
 	add_redir_spool_cfg((char *)(ucfg + 1), ptr);
 	IPFW_UH_WLOCK(chain);
 
 	/* Extra check to avoid race with another ipfw_nat_cfg() */
 	tcfg = NULL;
 	if (gencnt != chain->gencnt)
 	    tcfg = lookup_nat_name(&chain->nat, ucfg->name);
 	IPFW_WLOCK(chain);
 	if (tcfg != NULL) {
 		/*
 		 * Somebody installed an instance with the same name while
 		 * the lock was dropped: replace it, and make sure no rule
 		 * keeps a cached pointer to it.
 		 */
 		LIST_REMOVE(tcfg, _next);
 		flush_nat_ptrs(chain, tcfg->id);
 	}
 	LIST_INSERT_HEAD(&chain->nat, ptr, _next);
 	IPFW_WUNLOCK(chain);
 	chain->gencnt++;
 
 	IPFW_UH_WUNLOCK(chain);
 
 	/*
 	 * Free the instance that was replaced, not 'ptr': 'ptr' has just
 	 * been linked into the chain and is live.
 	 */
 	if (tcfg != NULL)
 		free_nat_instance(tcfg);
 }
 
 /*
  * Creates/configure nat44 instance
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
  *
  * The trailing part of the request is redir_cnt records, each one a
  * struct nat44_cfg_redir followed by its spool_cnt struct nat44_cfg_spool
  * entries.  The whole sequence is bounds-checked against the request size
  * here because the routine that parses it is not given a length.
  *
  * Returns 0 on success, EINVAL on a malformed or truncated request.
  */
 static int
 nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	struct nat44_cfg_nat *ucfg;
 	struct nat44_cfg_redir *urdir;
 	int id;
 	size_t avail, need, i;
 	u_int16_t spool_cnt;
 	char *errptr;
 
 	/* Check minimum header size */
 	if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg)))
 		return (EINVAL);
 
 	oh = (ipfw_obj_header *)sd->kbuf;
 
 	/* Basic length checks for TLVs */
 	if (oh->ntlv.head.length != sizeof(oh->ntlv))
 		return (EINVAL);
 
 	ucfg = (struct nat44_cfg_nat *)(oh + 1);
 
 	/* Check if name is properly terminated and looks like number */
 	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
 		return (EINVAL);
 	id = strtol(ucfg->name, &errptr, 10);
 	if (id == 0 || *errptr != '\0')
 		return (EINVAL);
 
 	/*
 	 * Walk the redir records exactly the way they will be parsed,
 	 * making sure every redir header and every spool entry lies inside
 	 * the supplied buffer.
 	 */
 	avail = sd->valsize - (sizeof(*oh) + sizeof(*ucfg));
 	urdir = (struct nat44_cfg_redir *)(ucfg + 1);
 	for (i = 0; i < ucfg->redir_cnt; i++) {
 		if (avail < sizeof(*urdir))
 			return (EINVAL);
 		/* Same 16-bit width the kernel copy of spool_cnt uses. */
 		spool_cnt = urdir->spool_cnt;
 		need = sizeof(*urdir) +
 		    (size_t)spool_cnt * sizeof(struct nat44_cfg_spool);
 		if (avail < need)
 			return (EINVAL);
 		avail -= need;
 		urdir = (struct nat44_cfg_redir *)((char *)urdir + need);
 	}
 
 	nat44_config(chain, ucfg);
 	return (0);
 }
 
 /*
  * Destroys given nat instances.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ]
  *
  * The instance is selected by the name carried in the header's ntlv.
  *
  * Returns 0 on success, EINVAL for a malformed request and ESRCH when no
  * instance has that name.
  */
 static int
 nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *hdr;
 	ipfw_obj_ntlv *tlv;
 	struct cfg_nat *victim;
 
 	/* The request must at least hold the object header. */
 	if (sd->valsize < sizeof(*hdr))
 		return (EINVAL);
 
 	hdr = (ipfw_obj_header *)sd->kbuf;
 	tlv = &hdr->ntlv;
 
 	/* TLV length must match and the name must be NUL-terminated. */
 	if (tlv->head.length != sizeof(*tlv) ||
 	    strnlen(tlv->name, sizeof(tlv->name)) == sizeof(tlv->name))
 		return (EINVAL);
 
 	IPFW_UH_WLOCK(chain);
 	victim = lookup_nat_name(&chain->nat, tlv->name);
 	if (victim != NULL) {
 		/* Unhook it and drop rule references under the chain lock. */
 		IPFW_WLOCK(chain);
 		LIST_REMOVE(victim, _next);
 		flush_nat_ptrs(chain, victim->id);
 		IPFW_WUNLOCK(chain);
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	if (victim == NULL)
 		return (ESRCH);
 
 	/* Nothing can reach the instance any more; release it unlocked. */
 	free_nat_instance(victim);
 
 	return (0);
 }
 
 /*
  * Fill the userland-visible description @ucfg from kernel instance @ptr.
  * Only the per-instance fields are written; redirect data is not touched.
  */
 static void
 export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg)
 {
 
 	/* Plain value fields. */
 	ucfg->mode = ptr->mode;
 	ucfg->ip = ptr->ip;
 	ucfg->redir_cnt = ptr->redir_cnt;
 	ucfg->alias_port_hi = ptr->alias_port_hi;
 	ucfg->alias_port_lo = ptr->alias_port_lo;
 
 	/* String fields: interface name, and the id rendered as decimal. */
 	strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name));
 	snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id);
 }
 
 /*
  * Gets config for given nat instance
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
  *
  * On success the header and nat44_cfg_nat are followed in the reply by one
  * nat44_cfg_redir per redirect, each trailed by its nat44_cfg_spool
  * entries.
  *
  * Returns 0 on success, EINVAL for a malformed request, ESRCH when the
  * instance does not exist and ENOMEM when the supplied buffer is too small
  * (the required size is then available in the returned nat44_cfg_nat).
  */
 static int
 nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_nat *ptr;
 	struct cfg_redir *r;
 	struct cfg_spool *s;
 	struct nat44_cfg_redir *ser_r;
 	struct nat44_cfg_spool *ser_s;
 	size_t sz;
 
 	sz = sizeof(*oh) + sizeof(*ucfg);
 	/* Check minimum header size */
 	if (sd->valsize < sz)
 		return (EINVAL);
 
 	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
 
 	/* Basic length checks for TLVs */
 	if (oh->ntlv.head.length != sizeof(oh->ntlv))
 		return (EINVAL);
 
 	ucfg = (struct nat44_cfg_nat *)(oh + 1);
 
 	/* Check if name is properly terminated */
 	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(chain);
 	ptr = lookup_nat_name(&chain->nat, ucfg->name);
 	if (ptr == NULL) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ESRCH);
 	}
 
 	export_nat_cfg(ptr, ucfg);
 
 	/* Estimate memory amount */
 	sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat);
 	LIST_FOREACH(r, &ptr->redir_chain, _next) {
 		sz += sizeof(struct nat44_cfg_redir);
 		LIST_FOREACH(s, &r->spool_chain, _next)
 			sz += sizeof(struct nat44_cfg_spool);
 	}
 
 	/* The reported size includes the object header. */
 	ucfg->size = sz;
 	if (sd->valsize < sz) {
 		/*
 		 * Submitted buffer size is not enough.
 		 * We've already filled in @ucfg structure with
 		 * relevant info including size, so we
 		 * can return. Buffer will be flushed automatically.
 		 */
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	/* Size OK, let's copy data */
 	LIST_FOREACH(r, &ptr->redir_chain, _next) {
 		ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd,
 		    sizeof(*ser_r));
 		ser_r->mode = r->mode;
 		ser_r->laddr = r->laddr;
 		ser_r->paddr = r->paddr;
 		ser_r->raddr = r->raddr;
 		ser_r->lport = r->lport;
 		ser_r->pport = r->pport;
 		ser_r->rport = r->rport;
 		ser_r->pport_cnt = r->pport_cnt;
 		ser_r->rport_cnt = r->rport_cnt;
 		ser_r->proto = r->proto;
 		ser_r->spool_cnt = r->spool_cnt;
 
 		/* Spool entries go right behind the redirect they belong to. */
 		LIST_FOREACH(s, &r->spool_chain, _next) {
 			ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space(
 			    sd, sizeof(*ser_s));
 
 			ser_s->addr = s->addr;
 			ser_s->port = s->port;
 		}
 	}
 
 	IPFW_UH_RUNLOCK(chain);
 
 	return (0);
 }
 
 /*
  * Lists all nat44 instances currently available in kernel.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ]
  * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ]
  *
  * Only the per-instance description is exported; redirects are not part of
  * the listing.
  *
  * Returns 0 on success, EINVAL when the request is shorter than the list
  * header and ENOMEM when the buffer cannot hold all instances (count,
  * objsize and size in the list header are filled in before returning).
  */
 static int
 nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_lheader *olh;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_nat *ptr;
 	int nat_count;
 
 	/* Check minimum header size */
 	if (sd->valsize < sizeof(ipfw_obj_lheader))
 		return (EINVAL);
 
 	olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
 	IPFW_UH_RLOCK(chain);
 	/* First pass: count the instances to size the reply. */
 	nat_count = 0;
 	LIST_FOREACH(ptr, &chain->nat, _next)
 		nat_count++;
 
 	olh->count = nat_count;
 	olh->objsize = sizeof(struct nat44_cfg_nat);
 	olh->size = sizeof(*olh) + olh->count * olh->objsize;
 
 	if (sd->valsize < olh->size) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	/* Second pass, still under the same lock: emit one record each. */
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd,
 		    sizeof(*ucfg));
 		export_nat_cfg(ptr, ucfg);
 	}
 
 	IPFW_UH_RUNLOCK(chain);
 
 	return (0);
 }
 
 /*
  * Gets log for given nat instance
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header nat44_cfg_nat ]
  * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ]
  *
  * Returns 0 on success, EINVAL for a malformed request, ESRCH when the
  * instance does not exist, ENOENT when the instance has no log buffer and
  * ENOMEM when the reply does not fit into the supplied buffer.
  */
 static int
 nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_nat *ptr;
 	void *pbuf;
 	size_t sz;
 
 	sz = sizeof(*oh) + sizeof(*ucfg);
 	/* Check minimum header size */
 	if (sd->valsize < sz)
 		return (EINVAL);
 
 	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
 
 	/* Basic length checks for TLVs */
 	if (oh->ntlv.head.length != sizeof(oh->ntlv))
 		return (EINVAL);
 
 	ucfg = (struct nat44_cfg_nat *)(oh + 1);
 
 	/* Check if name is properly terminated */
 	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(chain);
 	ptr = lookup_nat_name(&chain->nat, ucfg->name);
 	if (ptr == NULL) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ESRCH);
 	}
 
 	if (ptr->lib->logDesc == NULL) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOENT);
 	}
 
 	export_nat_cfg(ptr, ucfg);
 
 	/*
 	 * Estimate memory amount.
 	 * NOTE(review): unlike nat44_get_cfg(), the size reported here does
 	 * not include the object header, and the threshold below does not
 	 * cover LIBALIAS_BUF_SIZE.  Both are kept as they are because
 	 * userland may depend on the reported value - confirm before
 	 * tightening.
 	 */
 	ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE;
 	if (sd->valsize < sz + sizeof(*oh)) {
 		/*
 		 * Submitted buffer size is not enough.
 		 * We've already filled in @ucfg structure with
 		 * relevant info including size, so we
 		 * can return. Buffer will be flushed automatically.
 		 */
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE);
 	if (pbuf == NULL) {
 		/*
 		 * The precheck above does not guarantee room for the log
 		 * itself, so never copy through an unchecked pointer.
 		 */
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 	memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE);
 
 	IPFW_UH_RUNLOCK(chain);
 
 	return (0);
 }
 
 /*
  * Sockopt handlers provided by this module.  The table is registered by
  * ipfw_nat_init() and removed by ipfw_nat_destroy().  Configure and destroy
  * are HDIR_SET requests, the three query handlers are HDIR_GET requests.
  */
 static struct ipfw_sopt_handler	scodes[] = {
 	{ IP_FW_NAT44_XCONFIG,	0,	HDIR_SET,	nat44_cfg },
 	{ IP_FW_NAT44_DESTROY,	0,	HDIR_SET,	nat44_destroy },
 	{ IP_FW_NAT44_XGETCONFIG,	0,	HDIR_GET,	nat44_get_cfg },
 	{ IP_FW_NAT44_LIST_NAT,	0,	HDIR_GET,	nat44_list_nat },
 	{ IP_FW_NAT44_XGETLOG,	0,	HDIR_GET,	nat44_get_log },
 };
 
 /*
  * Legacy configuration routines
  */
 
 /*
  * Spool record as laid out in the legacy configuration buffer.  Only addr
  * and port are filled in when the configuration is exported; the list
  * linkage is there for layout only.
  */
 struct cfg_spool_legacy {
 	LIST_ENTRY(cfg_spool_legacy)	_next;
 	struct in_addr			addr;
 	u_short				port;
 };
 
 /*
  * Redirect record as laid out in the legacy configuration buffer.  The
  * legacy routines copy the scalar fields (mode, addresses, ports, counts,
  * proto, spool_cnt) to and from it; the list linkage, alink and
  * spool_chain members are never read or written by them.
  */
 struct cfg_redir_legacy {
 	LIST_ENTRY(cfg_redir)   _next;
 	u_int16_t               mode;
 	struct in_addr	        laddr;
 	struct in_addr	        paddr;
 	struct in_addr	        raddr;
 	u_short                 lport;
 	u_short                 pport;
 	u_short                 rport;
 	u_short                 pport_cnt;
 	u_short                 rport_cnt;
 	int                     proto;
 	struct alias_link       **alink;
 	u_int16_t               spool_cnt;
 	LIST_HEAD(, cfg_spool_legacy) spool_chain;
 };
 
 /*
  * Nat instance record as laid out in the legacy configuration buffer.  The
  * legacy routines use id, ip, if_name, mode and redir_cnt; the list
  * linkage, lib and redir_chain members are not interpreted.
  */
 struct cfg_nat_legacy {
 	LIST_ENTRY(cfg_nat_legacy)	_next;
 	int				id;
 	struct in_addr			ip;
 	char				if_name[IF_NAMESIZE];
 	int				mode;
 	struct libalias			*lib;
 	int				redir_cnt;
 	LIST_HEAD(, cfg_redir_legacy)	redir_chain;
 };
 
 static int
 ipfw_nat_cfg(struct sockopt *sopt)
 {
 	struct cfg_nat_legacy *cfg;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_redir_legacy *rdir;
 	struct nat44_cfg_redir *urdir;
 	char *buf;
 	size_t len, len2;
 	int error, i;
 
 	len = sopt->sopt_valsize;
 	len2 = len + 128;
 
 	/*
 	 * Allocate 2x buffer to store converted structures.
 	 * new redir_cfg has shrunk, so we're sure that
 	 * new buffer size is enough.
 	 */
 	buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO);
 	error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy));
 	if (error != 0)
 		goto out;
 
 	cfg = (struct cfg_nat_legacy *)buf;
 	if (cfg->id < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)];
 	snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id);
 	strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name));
 	ucfg->ip = cfg->ip;
 	ucfg->mode = cfg->mode;
 	ucfg->redir_cnt = cfg->redir_cnt;
 
 	if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) {
 		error = EINVAL;
 		goto out;
 	}
 
 	urdir = (struct nat44_cfg_redir *)(ucfg + 1);
 	rdir = (struct cfg_redir_legacy *)(cfg + 1);
 	for (i = 0; i < cfg->redir_cnt; i++) {
 		urdir->mode = rdir->mode;
 		urdir->laddr = rdir->laddr;
 		urdir->paddr = rdir->paddr;
 		urdir->raddr = rdir->raddr;
 		urdir->lport = rdir->lport;
 		urdir->pport = rdir->pport;
 		urdir->rport = rdir->rport;
 		urdir->pport_cnt = rdir->pport_cnt;
 		urdir->rport_cnt = rdir->rport_cnt;
 		urdir->proto = rdir->proto;
 		urdir->spool_cnt = rdir->spool_cnt;
 
 		urdir++;
 		rdir++;
 	}
 
 	nat44_config(&V_layer3_chain, ucfg);
 
 out:
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 ipfw_nat_del(struct sockopt *sopt)
 {
 	struct cfg_nat *ptr;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	int i;
 
 	sooptcopyin(sopt, &i, sizeof i, sizeof i);
 	/* XXX validate i */
 	IPFW_UH_WLOCK(chain);
 	ptr = lookup_nat(&chain->nat, i);
 	if (ptr == NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		return (EINVAL);
 	}
 	IPFW_WLOCK(chain);
 	LIST_REMOVE(ptr, _next);
 	flush_nat_ptrs(chain, i);
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	free_nat_instance(ptr);
 	return (0);
 }
 
 /*
  * Legacy export of the whole nat configuration.
  *
  * Reply layout: an int with the number of instances, then for every
  * instance a cfg_nat_legacy followed by its cfg_redir_legacy records, each
  * of which is followed by its cfg_spool_legacy records.
  *
  * The buffer is sized under the UH read lock, allocated with the lock
  * dropped, and filled after re-taking the lock; if the chain generation
  * counter changed in between, the buffer is discarded and the whole
  * procedure starts over.
  *
  * Returns the result of sooptcopyout().
  */
 static int
 ipfw_nat_get_cfg(struct sockopt *sopt)
 {
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	struct cfg_nat *n;
 	struct cfg_nat_legacy *ucfg;
 	struct cfg_redir *r;
 	struct cfg_spool *s;
 	struct cfg_redir_legacy *ser_r;
 	struct cfg_spool_legacy *ser_s;
 	char *data;
 	int gencnt, nat_cnt, len, error;
 
 	nat_cnt = 0;
 	len = sizeof(nat_cnt);
 
 	IPFW_UH_RLOCK(chain);
 retry:
 	/* Entered with the UH read lock held, nat_cnt == 0 and len reset. */
 	gencnt = chain->gencnt;
 	/* Estimate memory amount */
 	LIST_FOREACH(n, &chain->nat, _next) {
 		nat_cnt++;
 		len += sizeof(struct cfg_nat_legacy);
 		LIST_FOREACH(r, &n->redir_chain, _next) {
 			len += sizeof(struct cfg_redir_legacy);
 			LIST_FOREACH(s, &r->spool_chain, _next)
 				len += sizeof(struct cfg_spool_legacy);
 		}
 	}
 	IPFW_UH_RUNLOCK(chain);
 
 	/* M_WAITOK allocation: done with the lock released. */
 	data = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
 	bcopy(&nat_cnt, data, sizeof(nat_cnt));
 
 	/* From here on len is the write offset into data. */
 	nat_cnt = 0;
 	len = sizeof(nat_cnt);
 
 	IPFW_UH_RLOCK(chain);
 	if (gencnt != chain->gencnt) {
 		/* Configuration changed while unlocked: size may be stale. */
 		free(data, M_TEMP);
 		goto retry;
 	}
 	/* Serialize all the data. */
 	LIST_FOREACH(n, &chain->nat, _next) {
 		ucfg = (struct cfg_nat_legacy *)&data[len];
 		ucfg->id = n->id;
 		ucfg->ip = n->ip;
 		ucfg->redir_cnt = n->redir_cnt;
 		ucfg->mode = n->mode;
 		strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name));
 		len += sizeof(struct cfg_nat_legacy);
 		LIST_FOREACH(r, &n->redir_chain, _next) {
 			ser_r = (struct cfg_redir_legacy *)&data[len];
 			ser_r->mode = r->mode;
 			ser_r->laddr = r->laddr;
 			ser_r->paddr = r->paddr;
 			ser_r->raddr = r->raddr;
 			ser_r->lport = r->lport;
 			ser_r->pport = r->pport;
 			ser_r->rport = r->rport;
 			ser_r->pport_cnt = r->pport_cnt;
 			ser_r->rport_cnt = r->rport_cnt;
 			ser_r->proto = r->proto;
 			ser_r->spool_cnt = r->spool_cnt;
 			len += sizeof(struct cfg_redir_legacy);
 			LIST_FOREACH(s, &r->spool_chain, _next) {
 				ser_s = (struct cfg_spool_legacy *)&data[len];
 				ser_s->addr = s->addr;
 				ser_s->port = s->port;
 				len += sizeof(struct cfg_spool_legacy);
 			}
 		}
 	}
 	IPFW_UH_RUNLOCK(chain);
 
 	error = sooptcopyout(sopt, data, len);
 	free(data, M_TEMP);
 
 	return (error);
 }
 
 static int
 ipfw_nat_get_log(struct sockopt *sopt)
 {
 	uint8_t *data;
 	struct cfg_nat *ptr;
 	int i, size;
 	struct ip_fw_chain *chain;
 	IPFW_RLOCK_TRACKER;
 
 	chain = &V_layer3_chain;
 
 	IPFW_RLOCK(chain);
 	/* one pass to count, one to copy the data */
 	i = 0;
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		if (ptr->lib->logDesc == NULL)
 			continue;
 		i++;
 	}
 	size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
 	data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
 	if (data == NULL) {
 		IPFW_RUNLOCK(chain);
 		return (ENOSPC);
 	}
 	i = 0;
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		if (ptr->lib->logDesc == NULL)
 			continue;
 		bcopy(&ptr->id, &data[i], sizeof(int));
 		i += sizeof(int);
 		bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
 		i += LIBALIAS_BUF_SIZE;
 	}
 	IPFW_RUNLOCK(chain);
 	sooptcopyout(sopt, data, size);
 	free(data, M_IPFW);
 	return(0);
 }
 
 /*
  * Per-vnet initialisation: mark ipfw_nat as ready for the current vnet.
  * ifaddr_change() ignores address events until this flag is set.
  */
 static int
 vnet_ipfw_nat_init(const void *arg __unused)
 {
 
 	V_ipfw_nat_ready = 1;
 	return (0);
 }
 
 /*
  * Per-vnet teardown: clear the ready flag, free every nat instance of the
  * current vnet and reset all nat pointers cached in rules.  Everything is
  * done under the chain write lock.
  */
 static int
 vnet_ipfw_nat_uninit(const void *arg __unused)
 {
 	struct cfg_nat *ptr, *ptr_temp;
 	struct ip_fw_chain *chain;
 
 	chain = &V_layer3_chain;
 	IPFW_WLOCK(chain);
 	V_ipfw_nat_ready = 0;
 	/* _SAFE variant: each instance is freed while walking the list. */
 	LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) {
 		LIST_REMOVE(ptr, _next);
 		free_nat_instance(ptr);
 	}
 	flush_nat_ptrs(chain, -1 /* flush all */);
 	IPFW_WUNLOCK(chain);
 	return (0);
 }
 
 /*
  * Module-wide initialisation: publish the nat entry points through the
  * ipfw hook pointers, register the sockopt handler table and subscribe to
  * interface address change events.
  */
 static void
 ipfw_nat_init(void)
 {
 
 	/* init ipfw hooks */
 	ipfw_nat_ptr = ipfw_nat;
 	lookup_nat_ptr = lookup_nat;
 	ipfw_nat_cfg_ptr = ipfw_nat_cfg;
 	ipfw_nat_del_ptr = ipfw_nat_del;
 	ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
 	ipfw_nat_get_log_ptr = ipfw_nat_get_log;
 	IPFW_ADD_SOPT_HANDLER(1, scodes);
 
 	/* The tag is kept so ipfw_nat_destroy() can deregister the handler. */
 	ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change,
 	    NULL, EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Module-wide teardown, the reverse of ipfw_nat_init(): stop receiving
  * address events, remove the sockopt handlers and clear the hook pointers.
  */
 static void
 ipfw_nat_destroy(void)
 {
 
 	EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag);
 	/* deregister ipfw_nat */
 	IPFW_DEL_SOPT_HANDLER(1, scodes);
 	ipfw_nat_ptr = NULL;
 	lookup_nat_ptr = NULL;
 	ipfw_nat_cfg_ptr = NULL;
 	ipfw_nat_del_ptr = NULL;
 	ipfw_nat_get_cfg_ptr = NULL;
 	ipfw_nat_get_log_ptr = NULL;
 }
 
 /*
  * Module event handler.  Load and unload need no work here (set-up and
  * teardown are done by the SYSINIT/SYSUNINIT hooks registered at the end
  * of this file), so both simply succeed; every other event is rejected
  * with EOPNOTSUPP.
  */
 static int
 ipfw_nat_modevent(module_t mod, int type, void *unused)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 	return (EOPNOTSUPP);
 }
 
 /* Module description: name, event handler, and no extra argument. */
 static moduledata_t ipfw_nat_mod = {
 	"ipfw_nat",
 	ipfw_nat_modevent,
 	0
 };
 
 /*
  * Define startup order.
  *
  * Everything runs in the SI_SUB_PROTO_FIREWALL subsystem.  The module-wide
  * init/destroy pair is registered one step after IPFW_NAT_MODEVENT_ORDER
  * and the per-vnet init/uninit pair one step after that; the same order
  * values are used for the matching SYSUNINIT/VNET_SYSUNINIT entries.
  */
 #define	IPFW_NAT_SI_SUB_FIREWALL	SI_SUB_PROTO_FIREWALL
 #define	IPFW_NAT_MODEVENT_ORDER		(SI_ORDER_ANY - 128) /* after ipfw */
 #define	IPFW_NAT_MODULE_ORDER		(IPFW_NAT_MODEVENT_ORDER + 1)
 #define	IPFW_NAT_VNET_ORDER		(IPFW_NAT_MODEVENT_ORDER + 2)
 
 DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY);
 MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
 MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3);
 MODULE_VERSION(ipfw_nat, 1);
 
 SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER,
     ipfw_nat_init, NULL);
 VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER,
     vnet_ipfw_nat_init, NULL);
 
 SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER,
     ipfw_nat_destroy, NULL);
 VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL,
     IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL);
 
 /* end of file */
diff --git a/sys/netpfil/ipfw/nat64/nat64_translate.c b/sys/netpfil/ipfw/nat64/nat64_translate.c
index fecc8ff334d2..4b4fc16a5ba6 100644
--- a/sys/netpfil/ipfw/nat64/nat64_translate.c
+++ b/sys/netpfil/ipfw/nat64/nat64_translate.c
@@ -1,1722 +1,1723 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2015-2019 Yandex LLC
  * Copyright (c) 2015-2019 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipstealth.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_pflog.h>
 #include <net/pfil.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/ip_fw_nat64.h>
 
 #include <netpfil/pf/pf.h>
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <machine/in_cksum.h>
 
 #include "ip_fw_nat64.h"
 #include "nat64_translate.h"
 
 /*
  * Output method signatures.  The "output" flavour is handed the interface
  * and destination by its caller; the "output_one" flavour receives only
  * the packet, the counters and the optional log data.
  */
 typedef int (*nat64_output_t)(struct ifnet *, struct mbuf *,
     struct sockaddr *, struct nat64_counters *, void *);
 typedef int (*nat64_output_one_t)(struct mbuf *, struct nat64_counters *,
     void *);
 
 static struct nhop_object *nat64_find_route4(struct sockaddr_in *,
     struct mbuf *);
 static struct nhop_object *nat64_find_route6(struct sockaddr_in6 *,
     struct mbuf *);
 static int nat64_output_one(struct mbuf *, struct nat64_counters *, void *);
 static int nat64_output(struct ifnet *, struct mbuf *, struct sockaddr *,
     struct nat64_counters *, void *);
 static int nat64_direct_output_one(struct mbuf *, struct nat64_counters *,
     void *);
 static int nat64_direct_output(struct ifnet *, struct mbuf *,
     struct sockaddr *, struct nat64_counters *, void *);
 
 /* A pair of output routines that together form one output method. */
 struct nat64_methods {
 	nat64_output_t		output;
 	nat64_output_one_t	output_one;
 };
 /* Method that re-queues translated packets through netisr. */
 static const struct nat64_methods nat64_netisr = {
 	.output = nat64_output,
 	.output_one = nat64_output_one
 };
 /* Method that calls the interface's if_output routine directly. */
 static const struct nat64_methods nat64_direct = {
 	.output = nat64_direct_output,
 	.output_one = nat64_direct_output_one
 };
 
 /*
  * These variables should be initialized explicitly on module loading.
  * nat64_set_output_method() is the routine in this file that assigns them.
  */
 VNET_DEFINE_STATIC(const struct nat64_methods *, nat64out);
 VNET_DEFINE_STATIC(const int *, nat64ipstealth);
 VNET_DEFINE_STATIC(const int *, nat64ip6stealth);
 #define	V_nat64out		VNET(nat64out)
 #define	V_nat64ipstealth	VNET(nat64ipstealth)
 #define	V_nat64ip6stealth	VNET(nat64ip6stealth)
 
 /*
  * Constant targets for the two stealth pointers above.  stealth_off is
  * only referenced when the kernel is built without IPSTEALTH.
  */
 static const int stealth_on = 1;
 #ifndef IPSTEALTH
 static const int stealth_off = 0;
 #endif
 
 /*
  * Select the output method for the current vnet.
  * A non-zero @direct picks the direct if_output method, zero picks the
  * netisr method.  The stealth pointers are set up to match.
  */
 void
 nat64_set_output_method(int direct)
 {
 
 	if (direct == 0) {
 		V_nat64out = &nat64_netisr;
 		/* Leave TTL/HLIM decrementing to forwarding code */
 		V_nat64ipstealth = V_nat64ip6stealth = &stealth_on;
 		return;
 	}
 
 	V_nat64out = &nat64_direct;
 #ifdef IPSTEALTH
 	/* Honor corresponding variables, if IPSTEALTH is defined */
 	V_nat64ipstealth = &V_ipstealth;
 	V_nat64ip6stealth = &V_ip6stealth;
 #else
 	/* otherwise we need to decrement HLIM/TTL for direct case */
 	V_nat64ipstealth = V_nat64ip6stealth = &stealth_off;
 #endif
 }
 
 /*
  * Report the current output method: 1 when the direct method table is
  * selected, 0 otherwise.
  */
 int
 nat64_get_output_method(void)
 {
 
 	if (V_nat64out == &nat64_direct)
 		return (1);
 	return (0);
 }
 
 /*
  * Stamp the pflog header with the address family and the outgoing
  * direction, then pass header and packet to ipfw_bpf_mtap2().
  */
 static void
 nat64_log(struct pfloghdr *logdata, struct mbuf *m, sa_family_t family)
 {
 
 	logdata->af = family;
 	logdata->dir = PF_OUT;
 	ipfw_bpf_mtap2(logdata, PFLOG_HDRLEN, m);
 }
 
 static int
 nat64_direct_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
     struct nat64_counters *stats, void *logdata)
 {
 	int error;
 
 	if (logdata != NULL)
 		nat64_log(logdata, m, dst->sa_family);
 	error = (*ifp->if_output)(ifp, m, dst, NULL);
 	if (error != 0)
 		NAT64STAT_INC(stats, oerrors);
 	return (error);
 }
 
 static int
 nat64_direct_output_one(struct mbuf *m, struct nat64_counters *stats,
     void *logdata)
 {
 	struct nhop_object *nh4 = NULL;
 	struct nhop_object *nh6 = NULL;
 	struct sockaddr_in6 dst6;
 	struct sockaddr_in dst4;
 	struct sockaddr *dst;
 	struct ip6_hdr *ip6;
 	struct ip *ip4;
 	struct ifnet *ifp;
 	int error;
 
 	ip4 = mtod(m, struct ip *);
 	error = 0;
 	switch (ip4->ip_v) {
 	case IPVERSION:
 		dst4.sin_addr = ip4->ip_dst;
 		nh4 = nat64_find_route4(&dst4, m);
 		if (nh4 == NULL) {
 			NAT64STAT_INC(stats, noroute4);
 			error = EHOSTUNREACH;
 		} else {
 			ifp = nh4->nh_ifp;
 			dst = (struct sockaddr *)&dst4;
 		}
 		break;
 	case (IPV6_VERSION >> 4):
 		ip6 = mtod(m, struct ip6_hdr *);
 		dst6.sin6_addr = ip6->ip6_dst;
 		nh6 = nat64_find_route6(&dst6, m);
 		if (nh6 == NULL) {
 			NAT64STAT_INC(stats, noroute6);
 			error = EHOSTUNREACH;
 		} else {
 			ifp = nh6->nh_ifp;
 			dst = (struct sockaddr *)&dst6;
 		}
 		break;
 	default:
 		m_freem(m);
 		NAT64STAT_INC(stats, dropped);
 		DPRINTF(DP_DROPS, "dropped due to unknown IP version");
 		return (EAFNOSUPPORT);
 	}
 	if (error != 0) {
 		m_freem(m);
 		return (EHOSTUNREACH);
 	}
 	if (logdata != NULL)
 		nat64_log(logdata, m, dst->sa_family);
 	error = (*ifp->if_output)(ifp, m, dst, NULL);
 	if (error != 0)
 		NAT64STAT_INC(stats, oerrors);
 	return (error);
 }
 
 static int
 nat64_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst,
     struct nat64_counters *stats, void *logdata)
 {
 	struct ip *ip4;
 	int ret, af;
 
 	ip4 = mtod(m, struct ip *);
 	switch (ip4->ip_v) {
 	case IPVERSION:
 		af = AF_INET;
 		ret = NETISR_IP;
 		break;
 	case (IPV6_VERSION >> 4):
 		af = AF_INET6;
 		ret = NETISR_IPV6;
 		break;
 	default:
 		m_freem(m);
 		NAT64STAT_INC(stats, dropped);
 		DPRINTF(DP_DROPS, "unknown IP version");
 		return (EAFNOSUPPORT);
 	}
 	if (logdata != NULL)
 		nat64_log(logdata, m, af);
 	if (m->m_pkthdr.rcvif == NULL)
 		m->m_pkthdr.rcvif = V_loif;
 	ret = netisr_queue(ret, m);
 	if (ret != 0)
 		NAT64STAT_INC(stats, oerrors);
 	return (ret);
 }
 
 /*
  * "output_one" flavour of the netisr method: nat64_output() needs neither
  * an interface nor a destination, so both are passed as NULL.
  */
 static int
 nat64_output_one(struct mbuf *m, struct nat64_counters *stats, void *logdata)
 {
 
 	return (nat64_output(NULL, m, NULL, stats, logdata));
 }
 
 /*
  * Check the given IPv6 prefix length according to RFC6052:
  *   The prefixes can only have one of the following lengths:
  *   32, 40, 48, 56, 64, or 96 (The Well-Known Prefix is 96 bits long).
  * Returns zero on success, otherwise EINVAL.
  */
 int
 nat64_check_prefixlen(int length)
 {
 
 	/* 96 stands alone; the rest are the multiples of 8 from 32 to 64. */
 	if (length == 96)
 		return (0);
 	if (length >= 32 && length <= 64 && length % 8 == 0)
 		return (0);
 	return (EINVAL);
 }
 
 /*
  * Validate an IPv6 translation prefix together with its length.
  * Returns zero when the pair is acceptable, otherwise EINVAL.
  */
 int
 nat64_check_prefix6(const struct in6_addr *prefix, int length)
 {
 
 	if (
 	    /* The length itself must be acceptable */
 	    nat64_check_prefixlen(length) != 0 ||
 	    /* Well-known prefix has 96 prefix length */
 	    (IN6_IS_ADDR_WKPFX(prefix) && length != 96) ||
 	    /* Bits 64 to 71 must be set to zero */
 	    prefix->__u6_addr.__u6_addr8[8] != 0 ||
 	    /* Some extra checks */
 	    IN6_IS_ADDR_MULTICAST(prefix) ||
 	    IN6_IS_ADDR_UNSPECIFIED(prefix) ||
 	    IN6_IS_ADDR_LOOPBACK(prefix))
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Decide whether the IPv4 address @ia (network byte order) must not be
  * translated under configuration @cfg.
  *
  * Returns 1 when the address is rejected, 0 when it may be used.  Nothing
  * is rejected when NAT64_ALLOW_PRIVATE is set, and the checks only apply
  * when the well-known prefix (NAT64_WKPFX) is in use.
  */
 int
 nat64_check_private_ip4(const struct nat64_config *cfg, in_addr_t ia)
 {
 
 	if (cfg->flags & NAT64_ALLOW_PRIVATE)
 		return (0);
 
 	/* WKPFX must not be used to represent non-global IPv4 addresses */
 	if (cfg->flags & NAT64_WKPFX) {
 		/* IN_PRIVATE: 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 */
 		if ((ia & htonl(0xff000000)) == htonl(0x0a000000) ||
 		    (ia & htonl(0xfff00000)) == htonl(0xac100000) ||
 		    (ia & htonl(0xffff0000)) == htonl(0xc0a80000))
 			return (1);
 		/*
 		 * RFC 5735:
 		 *  192.0.0.0/24 - reserved for IETF protocol assignments
 		 *  192.88.99.0/24 - for use as 6to4 relay anycast addresses
 		 *  198.18.0.0/15 - for use in benchmark tests
 		 *  192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 - for use
 		 *   in documentation and example code
 		 *
 		 * Each mask below matches the prefix length listed above:
 		 * /15 is 0xfffe0000 and /24 is 0xffffff00.
 		 */
 		if ((ia & htonl(0xffffff00)) == htonl(0xc0000000) ||
 		    (ia & htonl(0xffffff00)) == htonl(0xc0586300) ||
 		    (ia & htonl(0xfffe0000)) == htonl(0xc6120000) ||
 		    (ia & htonl(0xffffff00)) == htonl(0xc0000200) ||
 		    (ia & htonl(0xffffff00)) == htonl(0xc6336400) ||
 		    (ia & htonl(0xffffff00)) == htonl(0xcb007100))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Embed the IPv4 address @ia (network byte order) into the IPv6 address
  * @ip6.  The position of the embedded address is determined by the prefix
  * length @plen; an unsupported @plen panics.
  * The prefix bits already present in @ip6 are preserved.
  */
 void
 nat64_embed_ip4(struct in6_addr *ip6, int plen, in_addr_t ia)
 {
 
 	switch (plen) {
 	case 32:
 	case 96:
 		/* The address occupies exactly one 32-bit word. */
 		ip6->s6_addr32[plen / 32] = ia;
 		break;
 	case 40:
 	case 48:
 	case 56:
 		/*
 		 * Preserve prefix bits.
 		 * Since suffix bits should be zero and reserved for future
 		 * use, we just overwrite the whole word, where they are.
 		 *
 		 * The word holds the address in network byte order, so the
 		 * mask that keeps its leading (plen % 32) bits has to be
 		 * converted with htonl(); a host-order mask would keep the
 		 * wrong bytes on little-endian machines.
 		 */
 		ip6->s6_addr32[1] &= htonl(0xffffffff << (32 - plen % 32));
 #if BYTE_ORDER == BIG_ENDIAN
 		ip6->s6_addr32[1] |= ia >> (plen % 32);
 		ip6->s6_addr32[2] = ia << (24 - plen % 32);
 #elif BYTE_ORDER == LITTLE_ENDIAN
 		ip6->s6_addr32[1] |= ia << (plen % 32);
 		ip6->s6_addr32[2] = ia >> (24 - plen % 32);
 #endif
 		break;
 	case 64:
 		/* Byte 8 is skipped, the address starts at byte 9. */
 #if BYTE_ORDER == BIG_ENDIAN
 		ip6->s6_addr32[2] = ia >> 8;
 		ip6->s6_addr32[3] = ia << 24;
 #elif BYTE_ORDER == LITTLE_ENDIAN
 		ip6->s6_addr32[2] = ia << 8;
 		ip6->s6_addr32[3] = ia >> 24;
 #endif
 		break;
 	default:
 		panic("Wrong plen: %d", plen);
 	}
 	/*
 	 * Bits 64 to 71 of the address are reserved for compatibility
 	 * with the host identifier format defined in the IPv6 addressing
 	 * architecture [RFC4291]. These bits MUST be set to zero.
 	 */
 	ip6->s6_addr8[8] = 0;
 }
 
 /*
  * Extract the IPv4 address embedded in @ip6 for prefix length @plen.
  * Returns the address in the same byte order in which it is stored, or 0
  * when the suffix bits are not zero, @plen is not supported, or the
  * address is refused by nat64_check_ip4().
  */
 in_addr_t
 nat64_extract_ip4(const struct in6_addr *ip6, int plen)
 {
 	in_addr_t addr;
 	int badsuffix;
 
 	/*
 	 * According to RFC 6052 p2.2:
 	 * IPv4-embedded IPv6 addresses are composed of a variable-length
 	 * prefix, the embedded IPv4 address, and a variable length suffix.
 	 * The suffix bits are reserved for future extensions and SHOULD
 	 * be set to zero.
 	 */
 	badsuffix = 0;
 	switch (plen) {
 	case 32:
 		badsuffix = (ip6->s6_addr32[3] != 0 ||
 		    ip6->s6_addr32[2] != 0);
 		break;
 	case 40:
 		badsuffix = (ip6->s6_addr32[3] != 0 ||
 		    (ip6->s6_addr32[2] & htonl(0xff00ffff)) != 0);
 		break;
 	case 48:
 		badsuffix = (ip6->s6_addr32[3] != 0 ||
 		    (ip6->s6_addr32[2] & htonl(0xff0000ff)) != 0);
 		break;
 	case 56:
 		badsuffix = (ip6->s6_addr32[3] != 0 ||
 		    ip6->s6_addr8[8] != 0);
 		break;
 	case 64:
 		badsuffix = (ip6->s6_addr8[8] != 0 ||
 		    (ip6->s6_addr32[3] & htonl(0x00ffffff)) != 0);
 		break;
 	}
 	if (badsuffix) {
 		DPRINTF(DP_GENERIC | DP_DROPS,
 		    "invalid IPv4-embedded IPv6 address");
 		return (0);
 	}
 
 	/* Reassemble the address from the word(s) it is spread over. */
 	switch (plen) {
 	case 32:
 	case 96:
 		addr = ip6->s6_addr32[plen / 32];
 		break;
 	case 40:
 	case 48:
 	case 56:
 #if BYTE_ORDER == BIG_ENDIAN
 		addr = (ip6->s6_addr32[1] << (plen % 32)) |
 		    (ip6->s6_addr32[2] >> (24 - plen % 32));
 #elif BYTE_ORDER == LITTLE_ENDIAN
 		addr = (ip6->s6_addr32[1] >> (plen % 32)) |
 		    (ip6->s6_addr32[2] << (24 - plen % 32));
 #endif
 		break;
 	case 64:
 #if BYTE_ORDER == BIG_ENDIAN
 		addr = (ip6->s6_addr32[2] << 8) | (ip6->s6_addr32[3] >> 24);
 #elif BYTE_ORDER == LITTLE_ENDIAN
 		addr = (ip6->s6_addr32[2] >> 8) | (ip6->s6_addr32[3] << 24);
 #endif
 		break;
 	default:
 		return (0);
 	}
 
 	if (nat64_check_ip4(addr) != 0) {
 		DPRINTF(DP_GENERIC | DP_DROPS,
 		    "invalid destination address: %08x", addr);
 		return (0);
 	}
 	return (addr);
 }
 
 /*
  * According to RFC 1624 the equation for incremental checksum update is:
  *	HC' = ~(~HC + ~m + m')	--	[Eqn. 3]
  *	HC' = HC - ~m - m'	--	[Eqn. 4]
  * So, when we are replacing IPv4 addresses with IPv6 ones, we can
  * assume that the new bytes were previously zeros, and vice versa:
  * when we are replacing IPv6 addresses with IPv4 ones, the now unused
  * bytes become zeros. The payload length in the pseudo header is wider,
  * but one half of it should be zero. Using equation 4 we get:
  *	HC' = HC - (~m0 + m0')	-- m0 is first changed word
  *	HC' = (HC - (~m0 + m0')) - (~m1 + m1')	-- m1 is second changed word
  *	HC' = HC - ~m0 - m0' - ~m1 - m1' - ... =
  *	  = HC - sum(~m[i] + m'[i])
  *
  * The function result should be used as follows:
  *	IPv6 to IPv4:	HC' = cksum_add(HC, result)
  *	IPv4 to IPv6:	HC' = cksum_add(HC, ~result)
  */
 static uint16_t
 nat64_cksum_convert(struct ip6_hdr *ip6, struct ip *ip)
 {
 	const uint16_t *word, *end;
 	uint32_t acc;
 
 	/* Complemented IPv4 addresses, added 16 bits at a time */
 	acc = (~ip->ip_src.s_addr >> 16) + (~ip->ip_src.s_addr & 0xffff);
 	acc += (~ip->ip_dst.s_addr >> 16) + (~ip->ip_dst.s_addr & 0xffff);
 
 	/*
 	 * Add the 16-bit words of two in6_addr's worth of data starting at
 	 * ip6_src (presumably ip6_dst follows it directly in the header).
 	 */
 	word = (uint16_t *)&ip6->ip6_src;
 	end = (uint16_t *)(&ip6->ip6_src + 2);
 	while (word < end)
 		acc += *word++;
 
 	/* Fold the carries back into the low 16 bits */
 	while ((acc >> 16) != 0)
 		acc = (acc >> 16) + (acc & 0xffff);
 	return (acc);
 }
 
 /*
  * Fill in the IPv4 header @ip from the IPv6 header @ip6.
  *
  * @frag is the IPv6 fragment header or NULL; when given, its offset and
  * "more fragments" flag are carried over, otherwise DF is set.
  * @plen is the payload length that follows the IPv4 header and @proto the
  * upper protocol (ICMPv6 is mapped to ICMP).
  * The source and destination addresses are not touched here.
  */
 static void
 nat64_init_ip4hdr(const struct ip6_hdr *ip6, const struct ip6_frag *frag,
     uint16_t plen, uint8_t proto, struct ip *ip)
 {
 
 	/* assume addresses are already initialized */
 	ip->ip_v = IPVERSION;
 	ip->ip_hl = sizeof(*ip) >> 2;
 	/* Bits 20..27 of the flow word become the TOS byte */
 	ip->ip_tos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 	ip->ip_len = htons(sizeof(*ip) + plen);
 	ip->ip_ttl = ip6->ip6_hlim;
 	/* Decrement the TTL unless the stealth setting is non-zero */
 	if (*V_nat64ip6stealth == 0)
 		ip->ip_ttl -= IPV6_HLIMDEC;
 	ip->ip_sum = 0;
 	ip->ip_p = (proto == IPPROTO_ICMPV6) ? IPPROTO_ICMP: proto;
 	/*
 	 * NOTE(review): ip_fillid() runs before ip_off is assigned below;
 	 * if it inspects ip_off this order matters - confirm.
 	 */
 	ip_fillid(ip);
 	if (frag != NULL) {
 		ip->ip_off = htons(ntohs(frag->ip6f_offlg) >> 3);
 		if (frag->ip6f_offlg & IP6F_MORE_FRAG)
 			ip->ip_off |= htons(IP_MF);
 	} else {
 		ip->ip_off = htons(IP_DF);
 	}
 	/* The header checksum is computed last, with ip_sum still zero */
 	ip->ip_sum = in_cksum_hdr(ip);
 }
 
 /*
  * Payload bytes available in one fragment for the given MTU: the MTU minus
  * the IPv6 header and the fragment header.
  */
 #define	FRAGSZ(mtu) ((mtu) - sizeof(struct ip6_hdr) - sizeof(struct ip6_frag))
 /*
  * Prepend the IPv6 header @ip6 to the payload in @m and queue the result
  * on @mq, splitting the payload into fragments when needed.
  *
  * @ip_id and @ip_off are taken from the original IPv4 header; a non-zero
  * @ip_off means the original packet was itself a fragment.
  * Returns 0 on success.  On failure ENOMEM or ENOBUFS is returned, the
  * mbufs still held here are freed and, on the fragmenting path, @mq is
  * drained.
  */
 static NAT64NOINLINE int
 nat64_fragment6(struct nat64_counters *stats, struct ip6_hdr *ip6,
     struct mbufq *mq, struct mbuf *m, uint32_t mtu, uint16_t ip_id,
     uint16_t ip_off)
 {
 	struct ip6_frag ip6f;
 	struct mbuf *n;
 	uint16_t hlen, len, offset;
 	int plen;
 
 	plen = ntohs(ip6->ip6_plen);
 	hlen = sizeof(struct ip6_hdr);
 
 	/* Fragmentation isn't needed */
 	if (ip_off == 0 && plen <= mtu - hlen) {
 		M_PREPEND(m, hlen, M_NOWAIT);
 		if (m == NULL) {
 			NAT64STAT_INC(stats, nomem);
 			return (ENOMEM);
 		}
 		bcopy(ip6, mtod(m, void *), hlen);
 		if (mbufq_enqueue(mq, m) != 0) {
 			m_freem(m);
 			NAT64STAT_INC(stats, dropped);
 			DPRINTF(DP_DROPS, "dropped due to mbufq overflow");
 			return (ENOBUFS);
 		}
 		return (0);
 	}
 
 	/* From here on every packet carries a fragment header as well */
 	hlen += sizeof(struct ip6_frag);
 	ip6f.ip6f_reserved = 0;
 	ip6f.ip6f_nxt = ip6->ip6_nxt;
 	ip6->ip6_nxt = IPPROTO_FRAGMENT;
 	if (ip_off != 0) {
 		/*
 		 * We have got an IPv4 fragment.
 		 * Use offset value and ip_id from original fragment.
 		 */
 		ip6f.ip6f_ident = htonl(ntohs(ip_id));
 		offset = (ntohs(ip_off) & IP_OFFMASK) << 3;
 		NAT64STAT_INC(stats, ifrags);
 	} else {
 		/* The packet size exceeds interface MTU */
 		ip6f.ip6f_ident = htonl(ip6_randomid());
 		offset = 0; /* First fragment*/
 	}
 	/* Each pass queues one fragment; n holds the not yet queued rest */
 	while (plen > 0 && m != NULL) {
 		n = NULL;
 		/* Fragment payload is a multiple of 8, except for the last */
 		len = FRAGSZ(mtu) & ~7;
 		if (len > plen)
 			len = plen;
 		ip6->ip6_plen = htons(len + sizeof(ip6f));
 		ip6f.ip6f_offlg = ntohs(offset);
 		/* More data follows, or the IPv4 fragment had MF set */
 		if (len < plen || (ip_off & htons(IP_MF)) != 0)
 			ip6f.ip6f_offlg |= IP6F_MORE_FRAG;
 		offset += len;
 		plen -= len;
 		if (plen > 0) {
 			n = m_split(m, len, M_NOWAIT);
 			if (n == NULL)
 				goto fail;
 		}
 		M_PREPEND(m, hlen, M_NOWAIT);
 		if (m == NULL)
 			goto fail;
 		bcopy(ip6, mtod(m, void *), sizeof(struct ip6_hdr));
 		bcopy(&ip6f, mtodo(m, sizeof(struct ip6_hdr)),
 		    sizeof(struct ip6_frag));
 		if (mbufq_enqueue(mq, m) != 0)
 			goto fail;
 		m = n;
 	}
 	NAT64STAT_ADD(stats, ofrags, mbufq_len(mq));
 	return (0);
 fail:
 	/*
 	 * Only reached from inside the loop, so n has been assigned.
 	 * NOTE(review): a failed mbufq_enqueue() is also counted as nomem
 	 * and reported as ENOMEM here - confirm that is intended.
 	 */
 	if (m != NULL)
 		m_freem(m);
 	if (n != NULL)
 		m_freem(n);
 	mbufq_drain(mq);
 	NAT64STAT_INC(stats, nomem);
 	return (ENOMEM);
 }
 
 /*
  * Look up the IPv6 next hop for dst->sin6_addr in the FIB of @m.
  * Returns NULL when there is no route or the route is a blackhole/reject
  * one.  On success the remaining fields of @dst are filled in and its
  * address is replaced by the gateway address when the route has one.
  */
 static struct nhop_object *
 nat64_find_route6(struct sockaddr_in6 *dst, struct mbuf *m)
 {
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 	nh = fib6_lookup(M_GETFIB(m), &dst->sin6_addr, 0, NHR_NONE, 0);
 	if (nh == NULL ||
 	    (nh->nh_flags & (NHF_BLACKHOLE | NHF_REJECT)) != 0)
 		return (NULL);
 
 	if ((nh->nh_flags & NHF_GATEWAY) != 0)
 		dst->sin6_addr = nh->gw6_sa.sin6_addr;
 	dst->sin6_len = sizeof(*dst);
 	dst->sin6_family = AF_INET6;
 	dst->sin6_port = 0;
 	dst->sin6_flowinfo = 0;
 	dst->sin6_scope_id = 0;
 	return (nh);
 }
 
 /* Upper bound on the bytes of the offending packet quoted in the reply */
 #define	NAT64_ICMP6_PLEN	64
 /*
  * Answer the IPv6 packet @m with an ICMPv6 error of the given @type and
  * @code; @mtu is stored in the icmp6_mtu field of the reply.
  *
  * @m is always consumed.  No reply is produced for ICMPv6 messages whose
  * type is below ICMP6_ECHO_REQUEST or equals ND_REDIRECT, for @type values
  * other than the four handled below, or when no source address can be
  * selected; those cases are counted as dropped.
  */
 static NAT64NOINLINE void
 nat64_icmp6_reflect(struct mbuf *m, uint8_t type, uint8_t code, uint32_t mtu,
     struct nat64_counters *stats, void *logdata)
 {
 	struct icmp6_hdr *icmp6;
 	struct ip6_hdr *ip6, *oip6;
 	struct mbuf *n;
 	int len, plen, proto;
 
 	/* len receives the offset of the upper level header */
 	len = 0;
 	proto = nat64_getlasthdr(m, &len);
 	if (proto < 0) {
 		DPRINTF(DP_DROPS, "mbuf isn't contigious");
 		goto freeit;
 	}
 	/*
 	 * Do not send ICMPv6 in reply to ICMPv6 errors.
 	 */
 	if (proto == IPPROTO_ICMPV6) {
 		if (m->m_len < len + sizeof(*icmp6)) {
 			DPRINTF(DP_DROPS, "mbuf isn't contigious");
 			goto freeit;
 		}
 		icmp6 = mtodo(m, len);
 		if (icmp6->icmp6_type < ICMP6_ECHO_REQUEST ||
 		    icmp6->icmp6_type == ND_REDIRECT) {
 			DPRINTF(DP_DROPS, "do not send ICMPv6 in reply to "
 			    "ICMPv6 errors");
 			goto freeit;
 		}
 		/*
 		 * If there are extra headers between IPv6 and ICMPv6,
 		 * strip off them.
 		 */
 		if (len > sizeof(struct ip6_hdr)) {
 			/*
 			 * NOTE: ipfw_chk already did m_pullup() and it is
 			 * expected that data is contiguous from the start
 			 * of IPv6 header up to the end of ICMPv6 header.
 			 */
 			bcopy(mtod(m, caddr_t),
 			    mtodo(m, len - sizeof(struct ip6_hdr)),
 			    sizeof(struct ip6_hdr));
 			m_adj(m, len - sizeof(struct ip6_hdr));
 		}
 	}
 	/*
 	if (icmp6_ratelimit(&ip6->ip6_src, type, code))
 		goto freeit;
 		*/
 	ip6 = mtod(m, struct ip6_hdr *);
 	/* Only these four error types are generated here */
 	switch (type) {
 	case ICMP6_DST_UNREACH:
 	case ICMP6_PACKET_TOO_BIG:
 	case ICMP6_TIME_EXCEEDED:
 	case ICMP6_PARAM_PROB:
 		break;
 	default:
 		goto freeit;
 	}
 	/* Calculate length of ICMPv6 payload */
 	len = (m->m_pkthdr.len > NAT64_ICMP6_PLEN) ? NAT64_ICMP6_PLEN:
 	    m->m_pkthdr.len;
 
 	/* Create new ICMPv6 datagram */
 	plen = len + sizeof(struct icmp6_hdr);
 	n = m_get2(sizeof(struct ip6_hdr) + plen + max_hdr, M_NOWAIT,
 	    MT_HEADER, M_PKTHDR);
 	if (n == NULL) {
 		NAT64STAT_INC(stats, nomem);
 		m_freem(m);
 		return;
 	}
 	/*
 	 * Move pkthdr from original mbuf. We should have initialized some
 	 * fields, because we can reinject this mbuf to netisr and it will
 	 * go through input path (it requires at least rcvif should be set).
 	 * Also do M_ALIGN() to reduce chances of need to allocate new mbuf
 	 * in the chain, when we will do M_PREPEND() or make some type of
 	 * tunneling.
 	 */
 	m_move_pkthdr(n, m);
 	M_ALIGN(n, sizeof(struct ip6_hdr) + plen + max_hdr);
 
 	n->m_len = n->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
 	oip6 = mtod(n, struct ip6_hdr *);
 	/*
 	 * Make IPv6 source address selection for reflected datagram.
 	 * nat64_check_ip6() doesn't allow scoped addresses, therefore
 	 * we use zero scopeid.
 	 */
 	if (in6_selectsrc_addr(M_GETFIB(n), &ip6->ip6_src, 0,
 	    n->m_pkthdr.rcvif, &oip6->ip6_src, NULL) != 0) {
 		/*
 		 * Failed to find proper source address, drop the packet.
 		 */
 		m_freem(n);
 		goto freeit;
 	}
 	/* The reply goes back to the sender of the offending packet */
 	oip6->ip6_dst = ip6->ip6_src;
 	oip6->ip6_nxt = IPPROTO_ICMPV6;
 	oip6->ip6_flow = 0;
 	oip6->ip6_vfc |= IPV6_VERSION;
 	oip6->ip6_hlim = V_ip6_defhlim;
 	oip6->ip6_plen = htons(plen);
 
 	icmp6 = mtodo(n, sizeof(struct ip6_hdr));
 	icmp6->icmp6_cksum = 0;
 	icmp6->icmp6_type = type;
 	icmp6->icmp6_code = code;
 	icmp6->icmp6_mtu = htonl(mtu);
 
 	/* Quote the first len bytes of the offending packet */
 	m_copydata(m, 0, len, mtodo(n, sizeof(struct ip6_hdr) +
 	    sizeof(struct icmp6_hdr)));
 	icmp6->icmp6_cksum = in6_cksum(n, IPPROTO_ICMPV6,
 	    sizeof(struct ip6_hdr), plen);
 	m_freem(m);
 	V_nat64out->output_one(n, stats, logdata);
 	return;
 freeit:
 	NAT64STAT_INC(stats, dropped);
 	m_freem(m);
 }
 
 /*
  * Look up the IPv4 next hop for dst->sin_addr in the FIB of @m.
  * Returns NULL when there is no route or the route is a blackhole,
  * broadcast or reject one.  On success the remaining fields of @dst are
  * filled in and its address is replaced by the gateway address when the
  * route has one.
  */
 static struct nhop_object *
 nat64_find_route4(struct sockaddr_in *dst, struct mbuf *m)
 {
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 	nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE, 0);
 	if (nh == NULL || (nh->nh_flags &
 	    (NHF_BLACKHOLE | NHF_BROADCAST | NHF_REJECT)) != 0)
 		return (NULL);
 
 	if ((nh->nh_flags & NHF_GATEWAY) != 0)
 		dst->sin_addr = nh->gw4_sa.sin_addr;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_family = AF_INET;
 	dst->sin_port = 0;
 	return (nh);
 }
 
 /* Packets longer than this are quoted as IP header + 8 bytes only */
 #define	NAT64_ICMP_PLEN	64
 /*
  * Answer the IPv4 packet @m with an ICMPv4 error of the given @type and
  * @code; @mtu is stored in the icmp_nextmtu field of the reply.
  *
  * @m is always consumed.  No reply is produced for non-first fragments,
  * for ICMP messages that are not of an informational type, or for @type
  * values other than ICMP_UNREACH, ICMP_TIMXCEED and ICMP_PARAMPROB; those
  * cases are counted as dropped.
  */
 static NAT64NOINLINE void
 nat64_icmp_reflect(struct mbuf *m, uint8_t type,
     uint8_t code, uint16_t mtu, struct nat64_counters *stats, void *logdata)
 {
 	struct icmp *icmp;
 	struct ip *ip, *oip;
 	struct mbuf *n;
 	int len, plen;
 
 	ip = mtod(m, struct ip *);
 	/* Do not send ICMP error if packet is not the first fragment */
 	if (ip->ip_off & ~htons(IP_MF|IP_DF)) {
 		DPRINTF(DP_DROPS, "not first fragment");
 		goto freeit;
 	}
 	/* Do not send ICMP in reply to ICMP errors */
 	if (ip->ip_p == IPPROTO_ICMP) {
 		/*
 		 * The ICMP type is read right after the IP header, so the
 		 * ICMP header must be contiguous too, not just the IP one.
 		 */
 		if (m->m_len < (ip->ip_hl << 2) + ICMP_MINLEN) {
 			DPRINTF(DP_DROPS, "mbuf isn't contigious");
 			goto freeit;
 		}
 		icmp = mtodo(m, ip->ip_hl << 2);
 		if (!ICMP_INFOTYPE(icmp->icmp_type)) {
 			DPRINTF(DP_DROPS, "do not send ICMP in reply to "
 			    "ICMP errors");
 			goto freeit;
 		}
 	}
 	/* Only these three error types are generated here */
 	switch (type) {
 	case ICMP_UNREACH:
 	case ICMP_TIMXCEED:
 	case ICMP_PARAMPROB:
 		break;
 	default:
 		goto freeit;
 	}
 	/* Calculate length of ICMP payload */
 	len = (m->m_pkthdr.len > NAT64_ICMP_PLEN) ? (ip->ip_hl << 2) + 8:
 	    m->m_pkthdr.len;
 
 	/* Create new ICMPv4 datagram */
 	plen = len + sizeof(struct icmphdr) + sizeof(uint32_t);
 	n = m_get2(sizeof(struct ip) + plen + max_hdr, M_NOWAIT,
 	    MT_HEADER, M_PKTHDR);
 	if (n == NULL) {
 		NAT64STAT_INC(stats, nomem);
 		m_freem(m);
 		return;
 	}
 	m_move_pkthdr(n, m);
 	M_ALIGN(n, sizeof(struct ip) + plen + max_hdr);
 
 	n->m_len = n->m_pkthdr.len = sizeof(struct ip) + plen;
 	oip = mtod(n, struct ip *);
 	oip->ip_v = IPVERSION;
 	oip->ip_hl = sizeof(struct ip) >> 2;
 	oip->ip_tos = 0;
 	oip->ip_len = htons(n->m_pkthdr.len);
 	oip->ip_ttl = V_ip_defttl;
 	oip->ip_p = IPPROTO_ICMP;
 	ip_fillid(oip);
 	oip->ip_off = htons(IP_DF);
 	/* The reply travels in the opposite direction */
 	oip->ip_src = ip->ip_dst;
 	oip->ip_dst = ip->ip_src;
 	oip->ip_sum = 0;
 	oip->ip_sum = in_cksum_hdr(oip);
 
 	icmp = mtodo(n, sizeof(struct ip));
 	icmp->icmp_type = type;
 	icmp->icmp_code = code;
 	icmp->icmp_cksum = 0;
 	icmp->icmp_pmvoid = 0;
 	icmp->icmp_nextmtu = htons(mtu);
 	/* Quote the first len bytes of the offending packet */
 	m_copydata(m, 0, len, mtodo(n, sizeof(struct ip) +
 	    sizeof(struct icmphdr) + sizeof(uint32_t)));
 	icmp->icmp_cksum = in_cksum_skip(n, sizeof(struct ip) + plen,
 	    sizeof(struct ip));
 	m_freem(m);
 	V_nat64out->output_one(n, stats, logdata);
 	return;
 freeit:
 	NAT64STAT_INC(stats, dropped);
 	m_freem(m);
 }
 
 /*
  * Translate ICMP echo request/reply into ICMPv6.
  *
  * @icmp6 points at the ICMP header, which is rewritten in place: the type
  * is replaced by @type and, when @id is non-zero, the identifier by @id.
  * The checksum is adjusted for both changes and finally for the IPv6
  * pseudo header built from @ip6.
  */
 static void
 nat64_icmp_handle_echo(struct ip6_hdr *ip6, struct icmp6_hdr *icmp6,
     uint16_t id, uint8_t type)
 {
 	uint16_t old;
 
 	old = *(uint16_t *)icmp6;	/* save type+code in one word */
 	icmp6->icmp6_type = type;
 	/* Reflect ICMPv4 -> ICMPv6 type translation in the cksum */
 	icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum,
 	    old, *(uint16_t *)icmp6);
 	if (id != 0) {
 		old = icmp6->icmp6_id;
 		icmp6->icmp6_id = id;
 		/* Reflect ICMP id translation in the cksum */
 		icmp6->icmp6_cksum = cksum_adjust(icmp6->icmp6_cksum,
 		    old, id);
 	}
 	/* Reflect IPv6 pseudo header in the cksum */
 	icmp6->icmp6_cksum = ~in6_cksum_pseudo(ip6, ntohs(ip6->ip6_plen),
 	    IPPROTO_ICMPV6, ~icmp6->icmp6_cksum);
 }
 
 /*
  * Translate the ICMPv4 message found at @offset in @m into ICMPv6.
  *
  * @ip6 is the already built outer IPv6 header; its ip6_plen is updated
  * when an error message is rebuilt.  A non-zero @icmpid replaces the echo
  * identifier, or the source port of the quoted TCP/UDP header.
  *
  * Echo request/reply are rewritten in place and @m is returned.  Error
  * messages are rebuilt in a newly allocated mbuf, which is returned after
  * @m has been freed.  NULL is returned when the message is not supported,
  * fails the checks below or memory runs out.
  *
  * The local "mtu" carries the value later stored through icmp6_mtu: the
  * MTU for "packet too big" and the translated pointer for parameter
  * problems.
  */
 static NAT64NOINLINE struct mbuf *
 nat64_icmp_translate(struct mbuf *m, struct ip6_hdr *ip6, uint16_t icmpid,
     int offset, struct nat64_config *cfg)
 {
 	struct ip ip;
 	struct icmp *icmp;
 	struct tcphdr *tcp;
 	struct udphdr *udp;
 	struct ip6_hdr *eip6;
 	struct mbuf *n;
 	uint32_t mtu;
 	int len, hlen, plen;
 	uint8_t type, code;
 
 	if (m->m_len < offset + ICMP_MINLEN)
 		m = m_pullup(m, offset + ICMP_MINLEN);
 	if (m == NULL) {
 		NAT64STAT_INC(&cfg->stats, nomem);
 		return (m);
 	}
 	mtu = 0;
 	icmp = mtodo(m, offset);
 	/* RFC 7915 p4.2 */
 	switch (icmp->icmp_type) {
 	case ICMP_ECHOREPLY:
 		type = ICMP6_ECHO_REPLY;
 		code = 0;
 		break;
 	case ICMP_UNREACH:
 		type = ICMP6_DST_UNREACH;
 		switch (icmp->icmp_code) {
 		case ICMP_UNREACH_NET:
 		case ICMP_UNREACH_HOST:
 		case ICMP_UNREACH_SRCFAIL:
 		case ICMP_UNREACH_NET_UNKNOWN:
 		case ICMP_UNREACH_HOST_UNKNOWN:
 		case ICMP_UNREACH_TOSNET:
 		case ICMP_UNREACH_TOSHOST:
 			code = ICMP6_DST_UNREACH_NOROUTE;
 			break;
 		case ICMP_UNREACH_PROTOCOL:
 			type = ICMP6_PARAM_PROB;
 			code = ICMP6_PARAMPROB_NEXTHEADER;
 			break;
 		case ICMP_UNREACH_PORT:
 			code = ICMP6_DST_UNREACH_NOPORT;
 			break;
 		case ICMP_UNREACH_NEEDFRAG:
 			type = ICMP6_PACKET_TOO_BIG;
 			code = 0;
 			/* XXX: needs an additional look */
 			mtu = max(IPV6_MMTU, ntohs(icmp->icmp_nextmtu) + 20);
 			break;
 		case ICMP_UNREACH_NET_PROHIB:
 		case ICMP_UNREACH_HOST_PROHIB:
 		case ICMP_UNREACH_FILTER_PROHIB:
 		case ICMP_UNREACH_PRECEDENCE_CUTOFF:
 			code = ICMP6_DST_UNREACH_ADMIN;
 			break;
 		default:
 			DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d",
 			    icmp->icmp_type, icmp->icmp_code);
 			goto freeit;
 		}
 		break;
 	case ICMP_TIMXCEED:
 		type = ICMP6_TIME_EXCEEDED;
 		code = icmp->icmp_code;
 		break;
 	case ICMP_ECHO:
 		type = ICMP6_ECHO_REQUEST;
 		code = 0;
 		break;
 	case ICMP_PARAMPROB:
 		type = ICMP6_PARAM_PROB;
 		switch (icmp->icmp_code) {
 		case ICMP_PARAMPROB_ERRATPTR:
 		case ICMP_PARAMPROB_LENGTH:
 			code = ICMP6_PARAMPROB_HEADER;
 			/* Map the IPv4 header offset to an IPv6 one */
 			switch (icmp->icmp_pptr) {
 			case 0: /* Version/IHL */
 			case 1: /* Type Of Service */
 				mtu = icmp->icmp_pptr;
 				break;
 			case 2: /* Total Length */
 			case 3: mtu = 4; /* Payload Length */
 				break;
 			case 8: /* Time to Live */
 				mtu = 7; /* Hop Limit */
 				break;
 			case 9: /* Protocol */
 				mtu = 6; /* Next Header */
 				break;
 			case 12: /* Source address */
 			case 13:
 			case 14:
 			case 15:
 				mtu = 8;
 				break;
 			case 16: /* Destination address */
 			case 17:
 			case 18:
 			case 19:
 				mtu = 24;
 				break;
 			default: /* Silently drop */
 				DPRINTF(DP_DROPS, "Unsupported ICMP type %d,"
 				    " code %d, pptr %d", icmp->icmp_type,
 				    icmp->icmp_code, icmp->icmp_pptr);
 				goto freeit;
 			}
 			break;
 		default:
 			DPRINTF(DP_DROPS, "Unsupported ICMP type %d,"
 			    " code %d, pptr %d", icmp->icmp_type,
 			    icmp->icmp_code, icmp->icmp_pptr);
 			goto freeit;
 		}
 		break;
 	default:
 		DPRINTF(DP_DROPS, "Unsupported ICMP type %d, code %d",
 		    icmp->icmp_type, icmp->icmp_code);
 		goto freeit;
 	}
 	/*
 	 * For echo request/reply we can use original payload,
 	 * but we need adjust icmp_cksum, because ICMPv6 cksum covers
 	 * IPv6 pseudo header and ICMPv6 types differs from ICMPv4.
 	 */
 	if (type == ICMP6_ECHO_REQUEST || type == ICMP6_ECHO_REPLY) {
 		nat64_icmp_handle_echo(ip6, ICMP6(icmp), icmpid, type);
 		return (m);
 	}
 	/*
 	 * For other types of ICMP messages we need to translate inner
 	 * IPv4 header to IPv6 header.
 	 * Assume ICMP src is the same as payload dst
 	 * E.g. we have ( GWsrc1 , NATIP1 ) in outer header
 	 * and          ( NATIP1, Hostdst1 ) in ICMP copy header.
 	 * In that case, we already have map for NATIP1 and GWsrc1.
 	 * The only thing we need is to copy IPv6 map prefix to
 	 * Hostdst1.
 	 */
 	hlen = offset + ICMP_MINLEN;
 	if (m->m_pkthdr.len < hlen + sizeof(struct ip) + ICMP_MINLEN) {
 		DPRINTF(DP_DROPS, "Message is too short %d",
 		    m->m_pkthdr.len);
 		goto freeit;
 	}
 	/* Work on a local copy of the quoted IPv4 header */
 	m_copydata(m, hlen, sizeof(struct ip), (char *)&ip);
 	if (ip.ip_v != IPVERSION) {
 		DPRINTF(DP_DROPS, "Wrong IP version %d", ip.ip_v);
 		goto freeit;
 	}
 	hlen += ip.ip_hl << 2; /* Skip inner IP header */
 	if (nat64_check_ip4(ip.ip_src.s_addr) != 0 ||
 	    nat64_check_ip4(ip.ip_dst.s_addr) != 0 ||
 	    nat64_check_private_ip4(cfg, ip.ip_src.s_addr) != 0 ||
 	    nat64_check_private_ip4(cfg, ip.ip_dst.s_addr) != 0) {
 		DPRINTF(DP_DROPS, "IP addresses checks failed %04x -> %04x",
 		    ntohl(ip.ip_src.s_addr), ntohl(ip.ip_dst.s_addr));
 		goto freeit;
 	}
 	/* At least ICMP_MINLEN bytes must follow the quoted IP header */
 	if (m->m_pkthdr.len < hlen + ICMP_MINLEN) {
 		DPRINTF(DP_DROPS, "Message is too short %d",
 		    m->m_pkthdr.len);
 		goto freeit;
 	}
 #if 0
 	/*
 	 * Check that inner source matches the outer destination.
 	 * XXX: We need some method to convert IPv4 into IPv6 address here,
 	 *	and compare IPv6 addresses.
 	 */
 	if (ip.ip_src.s_addr != nat64_get_ip4(&ip6->ip6_dst)) {
 		DPRINTF(DP_GENERIC, "Inner source doesn't match destination ",
 		    "%04x vs %04x", ip.ip_src.s_addr,
 		    nat64_get_ip4(&ip6->ip6_dst));
 		goto freeit;
 	}
 #endif
 	/*
 	 * Create new mbuf for ICMPv6 datagram.
 	 * NOTE: len is data length just after inner IP header.
 	 */
 	len = m->m_pkthdr.len - hlen;
 	/* Cap the quoted data so the ICMPv6 payload fits NAT64_ICMP6_PLEN */
 	if (sizeof(struct ip6_hdr) +
 	    sizeof(struct icmp6_hdr) + len > NAT64_ICMP6_PLEN)
 		len = NAT64_ICMP6_PLEN - sizeof(struct icmp6_hdr) -
 		    sizeof(struct ip6_hdr);
 	plen = sizeof(struct icmp6_hdr) + sizeof(struct ip6_hdr) + len;
 	n = m_get2(offset + plen + max_hdr, M_NOWAIT, MT_HEADER, M_PKTHDR);
 	if (n == NULL) {
 		NAT64STAT_INC(&cfg->stats, nomem);
 		m_freem(m);
 		return (NULL);
 	}
 	m_move_pkthdr(n, m);
 	M_ALIGN(n, offset + plen + max_hdr);
 	n->m_len = n->m_pkthdr.len = offset + plen;
 	/* Adjust ip6_plen in outer header */
 	ip6->ip6_plen = htons(plen);
 	/* Construct new inner IPv6 header */
 	eip6 = mtodo(n, offset + sizeof(struct icmp6_hdr));
 	eip6->ip6_src = ip6->ip6_dst;
 
 	/* Use the same prefix that we have in outer header */
 	eip6->ip6_dst = ip6->ip6_src;
 	MPASS(cfg->flags & NAT64_PLATPFX);
 	nat64_embed_ip4(&eip6->ip6_dst, cfg->plat_plen, ip.ip_dst.s_addr);
 
 	eip6->ip6_flow = htonl(ip.ip_tos << 20);
 	eip6->ip6_vfc |= IPV6_VERSION;
 	eip6->ip6_hlim = ip.ip_ttl;
 	eip6->ip6_plen = htons(ntohs(ip.ip_len) - (ip.ip_hl << 2));
 	eip6->ip6_nxt = (ip.ip_p == IPPROTO_ICMP) ? IPPROTO_ICMPV6: ip.ip_p;
 	/* Only len bytes of the quoted upper level data are copied */
 	m_copydata(m, hlen, len, (char *)(eip6 + 1));
 	/*
 	 * We need to translate source port in the inner ULP header,
 	 * and adjust ULP checksum.
 	 */
 	switch (ip.ip_p) {
 	case IPPROTO_TCP:
 		/*
 		 * NOTE(review): this only guarantees that the bytes before
 		 * th_sum were copied; with len capped above, th_sum itself
 		 * may lie past the copied data - confirm.
 		 */
 		if (len < offsetof(struct tcphdr, th_sum))
 			break;
 		tcp = TCP(eip6 + 1);
 		if (icmpid != 0) {
 			tcp->th_sum = cksum_adjust(tcp->th_sum,
 			    tcp->th_sport, icmpid);
 			tcp->th_sport = icmpid;
 		}
 		tcp->th_sum = cksum_add(tcp->th_sum,
 		    ~nat64_cksum_convert(eip6, &ip));
 		break;
 	case IPPROTO_UDP:
 		if (len < offsetof(struct udphdr, uh_sum))
 			break;
 		udp = UDP(eip6 + 1);
 		if (icmpid != 0) {
 			udp->uh_sum = cksum_adjust(udp->uh_sum,
 			    udp->uh_sport, icmpid);
 			udp->uh_sport = icmpid;
 		}
 		udp->uh_sum = cksum_add(udp->uh_sum,
 		    ~nat64_cksum_convert(eip6, &ip));
 		break;
 	case IPPROTO_ICMP:
 		/*
 		 * Check if this is an ICMP error message for echo request
 		 * that we sent. I.e. ULP in the data containing invoking
 		 * packet is IPPROTO_ICMP and its type is ICMP_ECHO.
 		 */
 		icmp = (struct icmp *)(eip6 + 1);
 		if (icmp->icmp_type != ICMP_ECHO) {
 			m_freem(n);
 			goto freeit;
 		}
 		/*
 		 * For our client this original datagram should looks
 		 * like it was ICMPv6 datagram with type ICMP6_ECHO_REQUEST.
 		 * Thus we need adjust icmp_cksum and convert type from
 		 * ICMP_ECHO to ICMP6_ECHO_REQUEST.
 		 */
 		nat64_icmp_handle_echo(eip6, ICMP6(icmp), icmpid,
 		    ICMP6_ECHO_REQUEST);
 	}
 	/* Everything needed from the original mbuf has been copied */
 	m_freem(m);
 	/* Convert ICMPv4 into ICMPv6 header */
 	icmp = mtodo(n, offset);
 	ICMP6(icmp)->icmp6_type = type;
 	ICMP6(icmp)->icmp6_code = code;
 	ICMP6(icmp)->icmp6_mtu = htonl(mtu);
 	ICMP6(icmp)->icmp6_cksum = 0;
 	ICMP6(icmp)->icmp6_cksum = cksum_add(
 	    ~in6_cksum_pseudo(ip6, plen, IPPROTO_ICMPV6, 0),
 	    in_cksum_skip(n, n->m_pkthdr.len, offset));
 	return (n);
 freeit:
 	m_freem(m);
 	NAT64STAT_INC(&cfg->stats, dropped);
 	return (NULL);
 }
 
 int
 nat64_getlasthdr(struct mbuf *m, int *offset)
 {
 	struct ip6_hdr *ip6;
 	struct ip6_hbh *hbh;
 	int proto, hlen;
 
 	if (offset != NULL)
 		hlen = *offset;
 	else
 		hlen = 0;
 
 	if (m->m_len < hlen + sizeof(*ip6))
 		return (-1);
 
 	ip6 = mtodo(m, hlen);
 	hlen += sizeof(*ip6);
 	proto = ip6->ip6_nxt;
 	/* Skip extension headers */
 	while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
 	    proto == IPPROTO_DSTOPTS) {
 		hbh = mtodo(m, hlen);
 		/*
 		 * We expect mbuf has contigious data up to
 		 * upper level header.
 		 */
 		if (m->m_len < hlen)
 			return (-1);
 		/*
 		 * We doesn't support Jumbo payload option,
 		 * so return error.
 		 */
 		if (proto == IPPROTO_HOPOPTS && ip6->ip6_plen == 0)
 			return (-1);
 		proto = hbh->ip6h_nxt;
 		hlen += (hbh->ip6h_len + 1) << 3;
 	}
 	if (offset != NULL)
 		*offset = hlen;
 	return (proto);
 }
 
 /*
  * Translate an IPv4 packet into IPv6 and transmit it.
  *
  * m       - IPv4 packet; the IPv4 header is read from the start of the mbuf.
  * saddr   - source address for the new IPv6 header.
  * daddr   - destination address for the new IPv6 header.
  * lport   - when non-zero, replaces the TCP/UDP destination port; it is
  *           also handed to nat64_icmp_translate() for ICMP packets.
  * cfg     - instance configuration; only its statistics are updated here.
  * logdata - passed through unchanged to the reflect and output routines.
  *
  * Returns NAT64MFREE on the fragmented-ICMP drop path, where the mbuf has
  * not been released here, and NAT64RETURN on every other path, where the
  * mbuf has already been handed to another routine.
  */
 int
 nat64_do_handle_ip4(struct mbuf *m, struct in6_addr *saddr,
     struct in6_addr *daddr, uint16_t lport, struct nat64_config *cfg,
     void *logdata)
 {
 	struct nhop_object *nh;
 	struct ip6_hdr ip6;
 	struct sockaddr_in6 dst;
 	struct ip *ip;
 	struct mbufq mq;
 	uint16_t ip_id, ip_off;
 	uint16_t *csum;
 	int plen, hlen;
 	uint8_t proto;
 
 	ip = mtod(m, struct ip*);
 
 	/*
 	 * Unless stealth mode is enabled, a packet whose TTL would not
 	 * survive the decrement is answered with ICMP time exceeded.
 	 */
 	if (*V_nat64ipstealth == 0 && ip->ip_ttl <= IPTTLDEC) {
 		nat64_icmp_reflect(m, ICMP_TIMXCEED,
 		    ICMP_TIMXCEED_INTRANS, 0, &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 
 	ip6.ip6_dst = *daddr;
 	ip6.ip6_src = *saddr;
 
 	/* plen is the IPv4 payload length, i.e. total length minus header. */
 	hlen = ip->ip_hl << 2;
 	plen = ntohs(ip->ip_len) - hlen;
 	proto = ip->ip_p;
 
 	/* Save ip_id and ip_off, both are in network byte order */
 	ip_id = ip->ip_id;
 	ip_off = ip->ip_off & htons(IP_OFFMASK | IP_MF);
 
 	/* Fragment length must be multiple of 8 octets */
 	if ((ip->ip_off & htons(IP_MF)) != 0 && (plen & 0x7) != 0) {
 		nat64_icmp_reflect(m, ICMP_PARAMPROB,
 		    ICMP_PARAMPROB_LENGTH, 0, &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 	/* Fragmented ICMP is unsupported */
 	if (proto == IPPROTO_ICMP && ip_off != 0) {
 		DPRINTF(DP_DROPS, "dropped due to fragmented ICMP");
 		NAT64STAT_INC(&cfg->stats, dropped);
 		return (NAT64MFREE);
 	}
 
 	/*
 	 * NOTE(review): only sin6_addr is filled in here; presumably
 	 * nat64_find_route6() initializes the rest of dst - confirm.
 	 */
 	dst.sin6_addr = ip6.ip6_dst;
 	nh = nat64_find_route6(&dst, m);
 	if (nh == NULL) {
 		NAT64STAT_INC(&cfg->stats, noroute6);
 		nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0,
 		    &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 	/*
 	 * The translated packet would not fit into the route MTU and the
 	 * sender set DF: report the usable size back instead of fragmenting.
 	 */
 	if (nh->nh_mtu < plen + sizeof(ip6) &&
 	    (ip->ip_off & htons(IP_DF)) != 0) {
 		nat64_icmp_reflect(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
 		    FRAGSZ(nh->nh_mtu) + sizeof(struct ip), &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 
 	/* The version bits are OR-ed in after the flow word is written. */
 	ip6.ip6_flow = htonl(ip->ip_tos << 20);
 	ip6.ip6_vfc |= IPV6_VERSION;
 	ip6.ip6_hlim = ip->ip_ttl;
 	if (*V_nat64ipstealth == 0)
 		ip6.ip6_hlim -= IPTTLDEC;
 	ip6.ip6_plen = htons(plen);
 	ip6.ip6_nxt = (proto == IPPROTO_ICMP) ? IPPROTO_ICMPV6: proto;
 
 	/* Handle delayed checksums if needed. */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 	/* Convert checksums. */
 	switch (proto) {
 	case IPPROTO_TCP:
 		csum = &TCP(mtodo(m, hlen))->th_sum;
 		/* Rewrite the destination port first, then the pseudo header. */
 		if (lport != 0) {
 			struct tcphdr *tcp = TCP(mtodo(m, hlen));
 			*csum = cksum_adjust(*csum, tcp->th_dport, lport);
 			tcp->th_dport = lport;
 		}
 		*csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip));
 		break;
 	case IPPROTO_UDP:
 		csum = &UDP(mtodo(m, hlen))->uh_sum;
 		/* Same two steps as for TCP. */
 		if (lport != 0) {
 			struct udphdr *udp = UDP(mtodo(m, hlen));
 			*csum = cksum_adjust(*csum, udp->uh_dport, lport);
 			udp->uh_dport = lport;
 		}
 		*csum = cksum_add(*csum, ~nat64_cksum_convert(&ip6, ip));
 		break;
 	case IPPROTO_ICMP:
 		/* The translator may return a different mbuf, or NULL. */
 		m = nat64_icmp_translate(m, &ip6, lport, hlen, cfg);
 		if (m == NULL)	/* stats already accounted */
 			return (NAT64RETURN);
 	}
 
 	/* Strip the IPv4 header; nat64_fragment6() is given the IPv6 one. */
 	m_adj(m, hlen);
 	mbufq_init(&mq, 255);
 	nat64_fragment6(&cfg->stats, &ip6, &mq, m, nh->nh_mtu, ip_id, ip_off);
 	/* Send the queued packets; stop at the first output error. */
 	while ((m = mbufq_dequeue(&mq)) != NULL) {
 		if (V_nat64out->output(nh->nh_ifp, m, (struct sockaddr *)&dst,
 		    &cfg->stats, logdata) != 0)
 			break;
 		NAT64STAT_INC(&cfg->stats, opcnt46);
 	}
 	/* Dispose of anything that was not sent. */
 	mbufq_drain(&mq);
 	return (NAT64RETURN);
 }
 
 /*
  * Translate an ICMPv6 error message into an ICMPv4 error and reflect it.
  *
  * m       - packet holding the ICMPv6 message.
  * hlen    - offset of the ICMPv6 header; when 0 the outer addresses are
  *           validated and the offset is located with nat64_getlasthdr().
  * aaddr   - IPv4 address used as the destination of the fake IPv4 header
  *           built from the embedded datagram.
  * aport   - not referenced by this function.
  * cfg     - instance configuration (statistics and plat_plen are used).
  * logdata - passed through unchanged to nat64_icmp_reflect().
  *
  * Returns NAT64SKIP when hlen is 0 and an outer address fails
  * nat64_check_ip6(); NAT64MFREE on drop paths where the mbuf has not been
  * released here; NAT64RETURN once the mbuf has been freed, lost in
  * m_pullup(), or handed to nat64_icmp_reflect().
  */
 int
 nat64_handle_icmp6(struct mbuf *m, int hlen, uint32_t aaddr, uint16_t aport,
     struct nat64_config *cfg, void *logdata)
 {
 	struct ip ip;
 	struct icmp6_hdr *icmp6;
 	struct ip6_frag *ip6f;
 	struct ip6_hdr *ip6, *ip6i;
 	uint32_t mtu;
 	int plen, proto;
 	uint8_t type, code;
 
 	if (hlen == 0) {
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
 		    nat64_check_ip6(&ip6->ip6_dst) != 0)
 			return (NAT64SKIP);
 
 		proto = nat64_getlasthdr(m, &hlen);
 		if (proto != IPPROTO_ICMPV6) {
 			DPRINTF(DP_DROPS,
 			    "dropped due to mbuf isn't contigious");
 			NAT64STAT_INC(&cfg->stats, dropped);
 			return (NAT64MFREE);
 		}
 	}
 
 	/*
 	 * Translate ICMPv6 type and code to ICMPv4 (RFC7915).
 	 * NOTE: ICMPv6 echo handled by nat64_do_handle_ip6().
 	 *
 	 * The variable mtu carries the value passed as the fourth argument
 	 * of nat64_icmp_reflect(): the next-hop MTU for "packet too big",
 	 * the translated pointer for "parameter problem", and 0 otherwise.
 	 */
 	icmp6 = mtodo(m, hlen);
 	mtu = 0;
 	switch (icmp6->icmp6_type) {
 	case ICMP6_DST_UNREACH:
 		type = ICMP_UNREACH;
 		switch (icmp6->icmp6_code) {
 		case ICMP6_DST_UNREACH_NOROUTE:
 		case ICMP6_DST_UNREACH_BEYONDSCOPE:
 		case ICMP6_DST_UNREACH_ADDR:
 			code = ICMP_UNREACH_HOST;
 			break;
 		case ICMP6_DST_UNREACH_ADMIN:
 			code = ICMP_UNREACH_HOST_PROHIB;
 			break;
 		case ICMP6_DST_UNREACH_NOPORT:
 			code = ICMP_UNREACH_PORT;
 			break;
 		default:
 			DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
 			    " code %d", icmp6->icmp6_type,
 			    icmp6->icmp6_code);
 			NAT64STAT_INC(&cfg->stats, dropped);
 			return (NAT64MFREE);
 		}
 		break;
 	case ICMP6_PACKET_TOO_BIG:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
 		mtu = ntohl(icmp6->icmp6_mtu);
 		if (mtu < IPV6_MMTU) {
 			DPRINTF(DP_DROPS, "Wrong MTU %d in ICMPv6 type %d,"
 			    " code %d", mtu, icmp6->icmp6_type,
 			    icmp6->icmp6_code);
 			NAT64STAT_INC(&cfg->stats, dropped);
 			return (NAT64MFREE);
 		}
 		/*
 		 * Adjust MTU to reflect difference between
 		 * IPv6 and IPv4 headers.
 		 */
 		mtu -= sizeof(struct ip6_hdr) - sizeof(struct ip);
 		break;
 	case ICMP6_TIME_EXCEEDED:
 		type = ICMP_TIMXCEED;
 		code = icmp6->icmp6_code;
 		break;
 	case ICMP6_PARAM_PROB:
 		switch (icmp6->icmp6_code) {
 		case ICMP6_PARAMPROB_HEADER:
 			type = ICMP_PARAMPROB;
 			code = ICMP_PARAMPROB_ERRATPTR;
 			/* Map the IPv6 header offset to the IPv4 one. */
 			mtu = ntohl(icmp6->icmp6_pptr);
 			switch (mtu) {
 			case 0: /* Version/Traffic Class */
 			case 1: /* Traffic Class/Flow Label */
 				break;
 			case 4: /* Payload Length */
 			case 5:
 				mtu = 2;
 				break;
 			case 6: /* Next Header */
 				mtu = 9;
 				break;
 			case 7: /* Hop Limit */
 				mtu = 8;
 				break;
 			default:
 				if (mtu >= 8 && mtu <= 23) {
 					mtu = 12; /* Source address */
 					break;
 				}
 				if (mtu >= 24 && mtu <= 39) {
 					mtu = 16; /* Destination address */
 					break;
 				}
 				DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
 				    " code %d, pptr %d", icmp6->icmp6_type,
 				    icmp6->icmp6_code, mtu);
 				NAT64STAT_INC(&cfg->stats, dropped);
 				return (NAT64MFREE);
 			}
 			/*
 			 * Do not fall through: the next case would replace
 			 * the Parameter Problem type and code chosen above
 			 * with Protocol Unreachable.
 			 */
 			break;
 		case ICMP6_PARAMPROB_NEXTHEADER:
 			type = ICMP_UNREACH;
 			code = ICMP_UNREACH_PROTOCOL;
 			break;
 		default:
 			DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d,"
 			    " code %d, pptr %d", icmp6->icmp6_type,
 			    icmp6->icmp6_code, ntohl(icmp6->icmp6_pptr));
 			NAT64STAT_INC(&cfg->stats, dropped);
 			return (NAT64MFREE);
 		}
 		break;
 	default:
 		DPRINTF(DP_DROPS, "Unsupported ICMPv6 type %d, code %d",
 		    icmp6->icmp6_type, icmp6->icmp6_code);
 		NAT64STAT_INC(&cfg->stats, dropped);
 		return (NAT64MFREE);
 	}
 
 	/* From here on hlen is the offset of the embedded IPv6 header. */
 	hlen += sizeof(struct icmp6_hdr);
 	if (m->m_pkthdr.len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN) {
 		NAT64STAT_INC(&cfg->stats, dropped);
 		DPRINTF(DP_DROPS, "Message is too short %d",
 		    m->m_pkthdr.len);
 		return (NAT64MFREE);
 	}
 	/*
 	 * We need at least ICMP_MINLEN bytes of original datagram payload
 	 * to generate ICMP message. It is nice that ICMP_MINLEN is equal
 	 * to sizeof(struct ip6_frag). So, if embedded datagram had a fragment
 	 * header we will not have to do m_pullup() again.
 	 *
 	 * What we have here:
 	 * Outer header: (IPv6iGW, v4mapPRefix+v4exthost)
 	 * Inner header: (v4mapPRefix+v4host, IPv6iHost) [sport, dport]
 	 * We need to translate it to:
 	 *
 	 * Outer header: (alias_host, v4exthost)
 	 * Inner header: (v4exthost, alias_host) [sport, alias_port]
 	 *
 	 * Assume caller function has checked if v4mapPRefix+v4host
 	 * matches configured prefix.
 	 * The only two things we should be provided with are mapping between
 	 * IPv6iHost <> alias_host and between dport and alias_port.
 	 */
 	if (m->m_len < hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN)
 		m = m_pullup(m, hlen + sizeof(struct ip6_hdr) + ICMP_MINLEN);
 	if (m == NULL) {
 		NAT64STAT_INC(&cfg->stats, nomem);
 		return (NAT64RETURN);
 	}
 	/* Reload header pointers: m_pullup() may have replaced the mbuf. */
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6i = mtodo(m, hlen);
 	ip6f = NULL;
 	proto = ip6i->ip6_nxt;
 	plen = ntohs(ip6i->ip6_plen);
 	hlen += sizeof(struct ip6_hdr);
 	if (proto == IPPROTO_FRAGMENT) {
 		if (m->m_pkthdr.len < hlen + sizeof(struct ip6_frag) +
 		    ICMP_MINLEN)
 			goto fail;
 		ip6f = mtodo(m, hlen);
 		proto = ip6f->ip6f_nxt;
 		plen -= sizeof(struct ip6_frag);
 		hlen += sizeof(struct ip6_frag);
 		/* Adjust MTU to reflect frag header size */
 		if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG)
 			mtu -= sizeof(struct ip6_frag);
 	}
 	if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
 		DPRINTF(DP_DROPS, "Unsupported proto %d in the inner header",
 		    proto);
 		goto fail;
 	}
 	if (nat64_check_ip6(&ip6i->ip6_src) != 0 ||
 	    nat64_check_ip6(&ip6i->ip6_dst) != 0) {
 		DPRINTF(DP_DROPS, "Inner addresses do not passes the check");
 		goto fail;
 	}
 	/* Check if outer dst is the same as inner src */
 	if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6i->ip6_src)) {
 		DPRINTF(DP_DROPS, "Inner src doesn't match outer dst");
 		goto fail;
 	}
 
 	/* Now we need to make a fake IPv4 packet to generate ICMP message */
 	ip.ip_dst.s_addr = aaddr;
 	ip.ip_src.s_addr = nat64_extract_ip4(&ip6i->ip6_src, cfg->plat_plen);
 	if (ip.ip_src.s_addr == 0)
 		goto fail;
 	/* XXX: Make fake ulp header */
 	if (V_nat64out == &nat64_direct) /* init_ip4hdr will decrement it */
 		ip6i->ip6_hlim += IPV6_HLIMDEC;
 	nat64_init_ip4hdr(ip6i, ip6f, plen, proto, &ip);
 	/*
 	 * Drop everything in front of the embedded payload except room for
 	 * one IPv4 header, then overwrite that room with the fake header.
 	 */
 	m_adj(m, hlen - sizeof(struct ip));
 	bcopy(&ip, mtod(m, void *), sizeof(ip));
 	nat64_icmp_reflect(m, type, code, (uint16_t)mtu, &cfg->stats,
 	    logdata);
 	return (NAT64RETURN);
 fail:
 	/*
 	 * We must call m_freem() because mbuf pointer could be
 	 * changed with m_pullup().
 	 */
 	m_freem(m);
 	NAT64STAT_INC(&cfg->stats, dropped);
 	return (NAT64RETURN);
 }
 
 /*
  * Translate an IPv6 packet into IPv4 and transmit it.
  *
  * m       - IPv6 packet; the IPv6 header is read from the start of the mbuf.
  * aaddr   - IPv4 source address for the translated packet.
  * aport   - when non-zero, replaces the TCP/UDP source port or the ICMP
  *           echo identifier.
  * cfg     - instance configuration (statistics and plat_plen are used).
  * logdata - passed through unchanged to the reflect and output routines.
  *
  * Returns NAT64SKIP when an outer address fails nat64_check_ip6();
  * NAT64MFREE on drop paths where the mbuf has not been released here;
  * NAT64RETURN when the mbuf was handed to a reflect or output routine.
  * ICMPv6 messages other than echo request/reply are delegated to
  * nat64_handle_icmp6() and its return value is passed back.
  */
 int
 nat64_do_handle_ip6(struct mbuf *m, uint32_t aaddr, uint16_t aport,
     struct nat64_config *cfg, void *logdata)
 {
 	struct ip ip;
 	struct nhop_object *nh;
 	struct sockaddr_in dst;
 	struct ip6_frag *frag;
 	struct ip6_hdr *ip6;
 	struct icmp6_hdr *icmp6;
 	uint16_t *csum;
 	int plen, hlen, proto;
 
 	/*
 	 * XXX: we expect ipfw_chk() did m_pullup() up to upper level
 	 * protocol's headers. Also we skip some checks, that ip6_input(),
 	 * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did.
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (nat64_check_ip6(&ip6->ip6_src) != 0 ||
 	    nat64_check_ip6(&ip6->ip6_dst) != 0) {
 		return (NAT64SKIP);
 	}
 
 	/* Starting from this point we must not return zero */
 	ip.ip_src.s_addr = aaddr;
 	if (nat64_check_ip4(ip.ip_src.s_addr) != 0) {
 		DPRINTF(DP_GENERIC | DP_DROPS, "invalid source address: %08x",
 		    ip.ip_src.s_addr);
 		NAT64STAT_INC(&cfg->stats, dropped);
 		return (NAT64MFREE);
 	}
 
 	/* The IPv4 destination is taken out of the IPv6 destination. */
 	ip.ip_dst.s_addr = nat64_extract_ip4(&ip6->ip6_dst, cfg->plat_plen);
 	if (ip.ip_dst.s_addr == 0) {
 		NAT64STAT_INC(&cfg->stats, dropped);
 		return (NAT64MFREE);
 	}
 
 	/*
 	 * Unless stealth mode is enabled, a packet whose hop limit would not
 	 * survive the decrement is answered with ICMPv6 time exceeded.
 	 */
 	if (*V_nat64ip6stealth == 0 && ip6->ip6_hlim <= IPV6_HLIMDEC) {
 		nat64_icmp6_reflect(m, ICMP6_TIME_EXCEEDED,
 		    ICMP6_TIME_EXCEED_TRANSIT, 0, &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 
 	/* Locate the header that follows the skipped extension headers. */
 	hlen = 0;
 	plen = ntohs(ip6->ip6_plen);
 	proto = nat64_getlasthdr(m, &hlen);
 	if (proto < 0) {
 		DPRINTF(DP_DROPS, "dropped due to mbuf isn't contigious");
 		NAT64STAT_INC(&cfg->stats, dropped);
 		return (NAT64MFREE);
 	}
 	frag = NULL;
 	if (proto == IPPROTO_FRAGMENT) {
 		/* ipfw_chk should m_pullup up to frag header */
 		if (m->m_len < hlen + sizeof(*frag)) {
 			DPRINTF(DP_DROPS,
 			    "dropped due to mbuf isn't contigious");
 			NAT64STAT_INC(&cfg->stats, dropped);
 			return (NAT64MFREE);
 		}
 		frag = mtodo(m, hlen);
 		proto = frag->ip6f_nxt;
 		hlen += sizeof(*frag);
 		/* Fragmented ICMPv6 is unsupported */
 		if (proto == IPPROTO_ICMPV6) {
 			DPRINTF(DP_DROPS, "dropped due to fragmented ICMPv6");
 			NAT64STAT_INC(&cfg->stats, dropped);
 			return (NAT64MFREE);
 		}
 		/* Fragment length must be multiple of 8 octets */
 		if ((frag->ip6f_offlg & IP6F_MORE_FRAG) != 0 &&
 		    ((plen + sizeof(struct ip6_hdr) - hlen) & 0x7) != 0) {
 			nat64_icmp6_reflect(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_HEADER,
 			    offsetof(struct ip6_hdr, ip6_plen), &cfg->stats,
 			    logdata);
 			return (NAT64RETURN);
 		}
 	}
 	/* Remove the extension headers from the payload length. */
 	plen -= hlen - sizeof(struct ip6_hdr);
 	if (plen < 0 || m->m_pkthdr.len < plen + hlen) {
 		DPRINTF(DP_DROPS, "plen %d, pkthdr.len %d, hlen %d",
 		    plen, m->m_pkthdr.len, hlen);
 		NAT64STAT_INC(&cfg->stats, dropped);
 		return (NAT64MFREE);
 	}
 
 	icmp6 = NULL;	/* Make gcc happy */
 	if (proto == IPPROTO_ICMPV6) {
 		icmp6 = mtodo(m, hlen);
 		/* Only echo request/reply are translated in this function. */
 		if (icmp6->icmp6_type != ICMP6_ECHO_REQUEST &&
 		    icmp6->icmp6_type != ICMP6_ECHO_REPLY)
 			return (nat64_handle_icmp6(m, hlen, aaddr, aport,
 			    cfg, logdata));
 	}
 	/*
 	 * NOTE(review): only sin_addr is filled in here; presumably
 	 * nat64_find_route4() initializes the rest of dst - confirm.
 	 */
 	dst.sin_addr.s_addr = ip.ip_dst.s_addr;
 	nh = nat64_find_route4(&dst, m);
 	if (nh == NULL) {
 		NAT64STAT_INC(&cfg->stats, noroute4);
 		nat64_icmp6_reflect(m, ICMP6_DST_UNREACH,
 		    ICMP6_DST_UNREACH_NOROUTE, 0, &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 	/* The translated packet must fit into the route MTU. */
 	if (nh->nh_mtu < plen + sizeof(ip)) {
 		nat64_icmp6_reflect(m, ICMP6_PACKET_TOO_BIG, 0, nh->nh_mtu,
 		    &cfg->stats, logdata);
 		return (NAT64RETURN);
 	}
 	nat64_init_ip4hdr(ip6, frag, plen, proto, &ip);
 
 	/* Handle delayed checksums if needed. */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 		in6_delayed_cksum(m, plen, hlen);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 	/* Convert checksums. */
 	switch (proto) {
 	case IPPROTO_TCP:
 		csum = &TCP(mtodo(m, hlen))->th_sum;
 		/* Rewrite the source port first, then the pseudo header. */
 		if (aport != 0) {
 			struct tcphdr *tcp = TCP(mtodo(m, hlen));
 			*csum = cksum_adjust(*csum, tcp->th_sport, aport);
 			tcp->th_sport = aport;
 		}
 		*csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip));
 		break;
 	case IPPROTO_UDP:
 		csum = &UDP(mtodo(m, hlen))->uh_sum;
 		/* Same two steps as for TCP. */
 		if (aport != 0) {
 			struct udphdr *udp = UDP(mtodo(m, hlen));
 			*csum = cksum_adjust(*csum, udp->uh_sport, aport);
 			udp->uh_sport = aport;
 		}
 		*csum = cksum_add(*csum, nat64_cksum_convert(ip6, &ip));
 		break;
 	case IPPROTO_ICMPV6:
 		/* Checksum in ICMPv6 covers pseudo header */
 		csum = &icmp6->icmp6_cksum;
 		*csum = cksum_add(*csum, in6_cksum_pseudo(ip6, plen,
 		    IPPROTO_ICMPV6, 0));
 		/* Convert ICMPv6 types to ICMP */
 		/* proto is reused as scratch; it is not read as a protocol below. */
 		proto = *(uint16_t *)icmp6; /* save old word for cksum_adjust */
 		if (icmp6->icmp6_type == ICMP6_ECHO_REQUEST)
 			icmp6->icmp6_type = ICMP_ECHO;
 		else /* ICMP6_ECHO_REPLY */
 			icmp6->icmp6_type = ICMP_ECHOREPLY;
 		*csum = cksum_adjust(*csum, (uint16_t)proto,
 		    *(uint16_t *)icmp6);
 		/* Replace the echo identifier and fix the checksum for it. */
 		if (aport != 0) {
 			uint16_t old_id = icmp6->icmp6_id;
 			icmp6->icmp6_id = aport;
 			*csum = cksum_adjust(*csum, old_id, aport);
 		}
 		break;
 	};
 
 	/*
 	 * Drop the IPv6 headers except room for one IPv4 header, then
 	 * overwrite that room with the header built above.
 	 */
 	m_adj(m, hlen - sizeof(ip));
 	bcopy(&ip, mtod(m, void *), sizeof(ip));
 	if (V_nat64out->output(nh->nh_ifp, m, (struct sockaddr *)&dst,
 	    &cfg->stats, logdata) == 0)
 		NAT64STAT_INC(&cfg->stats, opcnt64);
 	return (NAT64RETURN);
 }
diff --git a/sys/netpfil/ipfw/nptv6/nptv6.c b/sys/netpfil/ipfw/nptv6/nptv6.c
index de5be6836ab4..4f966e4b5ec6 100644
--- a/sys/netpfil/ipfw/nptv6/nptv6.c
+++ b/sys/netpfil/ipfw/nptv6/nptv6.c
@@ -1,1041 +1,1042 @@
 /*-
  * Copyright (c) 2016 Yandex LLC
  * Copyright (c) 2016 Andrey V. Elsukov <ae@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/eventhandler.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/queue.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/nptv6/nptv6.h>
 
 /*
  * Per-VNET identifier of the nptv6 action; ipfw_nptv6() compares it with
  * cmd->arg1 and it also selects the TLV type of instance names.
  */
 VNET_DEFINE_STATIC(uint16_t, nptv6_eid) = 0;
 #define	V_nptv6_eid	VNET(nptv6_eid)
 #define	IPFW_TLV_NPTV6_NAME	IPFW_TLV_EACTION_NAME(V_nptv6_eid)
 
 static eventhandler_tag nptv6_ifaddr_event;
 
 /* Forward declarations of helpers used before their definition. */
 static struct nptv6_cfg *nptv6_alloc_config(const char *name, uint8_t set);
 static void nptv6_free_config(struct nptv6_cfg *cfg);
 static struct nptv6_cfg *nptv6_find(struct namedobj_instance *ni,
     const char *name, uint8_t set);
 static int nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp,
     int offset);
 static int nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp,
     int offset);
 
 /* Resolve the instance referenced by an instruction's arg1. */
 #define	NPTV6_LOOKUP(chain, cmd)	\
     (struct nptv6_cfg *)SRV_OBJECT((chain), (cmd)->arg1)
 
 /* Fallbacks, used only when the macros are not already defined. */
 #ifndef IN6_MASK_ADDR
 /* a &= m, applied to all four 32-bit words of the address. */
 #define IN6_MASK_ADDR(a, m)	do { \
 	(a)->s6_addr32[0] &= (m)->s6_addr32[0]; \
 	(a)->s6_addr32[1] &= (m)->s6_addr32[1]; \
 	(a)->s6_addr32[2] &= (m)->s6_addr32[2]; \
 	(a)->s6_addr32[3] &= (m)->s6_addr32[3]; \
 } while (0)
 #endif
 #ifndef IN6_ARE_MASKED_ADDR_EQUAL
 /* True when d and a agree in every bit that is set in m. */
 #define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m)	(	\
 	(((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \
 	(((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \
 	(((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \
 	(((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 )
 #endif
 
 /*
  * Debug output is compiled out; change "#if 0" to enable it.
  * NPTV6_IPDEBUG additionally declares the _s and _d buffers that its
  * callers pass to inet_ntop() in the macro arguments.
  */
 #if 0
 #define	NPTV6_DEBUG(fmt, ...)	do {			\
 	printf("%s: " fmt "\n", __func__, ## __VA_ARGS__);	\
 } while (0)
 #define	NPTV6_IPDEBUG(fmt, ...)	do {			\
 	char _s[INET6_ADDRSTRLEN], _d[INET6_ADDRSTRLEN];	\
 	printf("%s: " fmt "\n", __func__, ## __VA_ARGS__);	\
 } while (0)
 #else
 #define	NPTV6_DEBUG(fmt, ...)
 #define	NPTV6_IPDEBUG(fmt, ...)
 #endif
 
 static int
 nptv6_getlasthdr(struct nptv6_cfg *cfg, struct mbuf *m, int *offset)
 {
 	struct ip6_hdr *ip6;
 	struct ip6_hbh *hbh;
 	int proto, hlen;
 
 	hlen = (offset == NULL) ? 0: *offset;
 	if (m->m_len < hlen)
 		return (-1);
 	ip6 = mtodo(m, hlen);
 	hlen += sizeof(*ip6);
 	proto = ip6->ip6_nxt;
 	while (proto == IPPROTO_HOPOPTS || proto == IPPROTO_ROUTING ||
 	    proto == IPPROTO_DSTOPTS) {
 		hbh = mtodo(m, hlen);
 		if (m->m_len < hlen)
 			return (-1);
 		proto = hbh->ip6h_nxt;
 		hlen += (hbh->ip6h_len + 1) << 3;
 	}
 	if (offset != NULL)
 		*offset = hlen;
 	return (proto);
 }
 
 /*
  * Translate the addresses of the IPv6 datagram embedded in an ICMPv6
  * error message.
  *
  * cfg    - instance whose internal/external prefixes are matched.
  * mp     - in/out mbuf pointer; updated when m_pullup() replaces the
  *          mbuf, and set to NULL when m_pullup() fails.
  * offset - offset of the ICMPv6 header inside the mbuf.
  *
  * Returns 0 for echo request/reply (nothing to translate), the result of
  * the matching nptv6_rewrite_*() call for the supported error types, and
  * -1 for any other type, for a packet that is too short, or when the
  * embedded addresses match neither prefix.
  */
 static int
 nptv6_translate_icmpv6(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
 {
 	struct icmp6_hdr *icmp6;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 
 	m = *mp;
 	/* The ICMPv6 header itself must be present before it is read. */
 	if (offset + sizeof(*icmp6) > m->m_len)
 		return (-1);
 	icmp6 = mtodo(m, offset);
 	NPTV6_DEBUG("ICMPv6 type %d", icmp6->icmp6_type);
 	switch (icmp6->icmp6_type) {
 	case ICMP6_DST_UNREACH:
 	case ICMP6_PACKET_TOO_BIG:
 	case ICMP6_TIME_EXCEEDED:
 	case ICMP6_PARAM_PROB:
 		break;
 	case ICMP6_ECHO_REQUEST:
 	case ICMP6_ECHO_REPLY:
 		/* nothing to translate */
 		return (0);
 	default:
 		/*
 		 * XXX: We can add some checks to not translate NDP and MLD
 		 * messages. Currently user must explicitly allow these message
 		 * types, otherwise packets will be dropped.
 		 */
 		return (-1);
 	}
 	/* Step over the ICMPv6 header to the embedded IPv6 header. */
 	offset += sizeof(*icmp6);
 	if (offset + sizeof(*ip6) > m->m_pkthdr.len)
 		return (-1);
 	if (offset + sizeof(*ip6) > m->m_len)
 		*mp = m = m_pullup(m, offset + sizeof(*ip6));
 	if (m == NULL)
 		return (-1);
 	ip6 = mtodo(m, offset);
 	NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
 	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
 	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
 	    ip6->ip6_nxt);
 	if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src,
 	    &cfg->external, &cfg->mask))
 		return (nptv6_rewrite_external(cfg, mp, offset));
 	else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
 	    &cfg->internal, &cfg->mask))
 		return (nptv6_rewrite_internal(cfg, mp, offset));
 	/*
 	 * Addresses in the inner IPv6 header do not match
 	 * our prefixes.
 	 */
 	return (-1);
 }
 
 /*
  * Pick the 16-bit word of address a that receives the checksum
  * adjustment.  Returns the word index, or -1 when no word is usable.
  */
 static int
 nptv6_search_index(struct nptv6_cfg *cfg, struct in6_addr *a)
 {
 	int word;
 
 	/* With the NPTV6_48PLEN flag the word is fixed. */
 	if ((cfg->flags & NPTV6_48PLEN) != 0)
 		return (3);
 
 	/* Otherwise take the first of words 4..7 that is not 0xffff. */
 	word = 4;
 	while (word < 8 && a->s6_addr16[word] == 0xffff)
 		word++;
 	if (word == 8)
 		return (-1);
 	/*
 	 * RFC 6296 p3.7: a datagram with an all-zeros IID MUST be dropped
 	 * while performing address mapping, and an ICMPv6 Parameter Problem
 	 * error SHOULD be generated.
 	 */
 	if (a->s6_addr32[2] == 0 && a->s6_addr32[3] == 0)
 		return (-1);
 	return (word);
 }
 
 /*
  * Overwrite the masked bits of dst with the corresponding bits of src.
  * Only the first eight bytes are considered, and copying stops at the
  * first mask byte that is zero.
  */
 static void
 nptv6_copy_addr(struct in6_addr *src, struct in6_addr *dst,
     struct in6_addr *mask)
 {
 	int pos;
 
 	pos = 0;
 	while (pos < 8 && mask->s6_addr8[pos] != 0) {
 		/* Clear the masked bits, then merge in those from src. */
 		dst->s6_addr8[pos] &=  ~mask->s6_addr8[pos];
 		dst->s6_addr8[pos] |= src->s6_addr8[pos] & mask->s6_addr8[pos];
 		pos++;
 	}
 }
 
 /*
  * Rewrite an address from the internal prefix to the external prefix.
  *
  * cfg    - instance providing the external prefix, mask and adjustment.
  * mp     - in/out mbuf pointer; set to NULL when the packet is consumed
  *          by icmp6_error2(), and possibly changed by the nested call to
  *          nptv6_translate_icmpv6().
  * offset - 0 for the outer header (the source address is rewritten); a
  *          non-zero offset means the header embedded in an ICMPv6 error
  *          (the destination address is rewritten).
  *
  * Returns 0 on success and IP_FW_DENY otherwise.
  */
 static int
 nptv6_rewrite_internal(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
 {
 	struct in6_addr *addr;
 	struct ip6_hdr *ip6;
 	int idx, proto;
 	uint16_t adj;
 
 	ip6 = mtodo(*mp, offset);
 	NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
 	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
 	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
 	    ip6->ip6_nxt);
 	if (offset == 0)
 		addr = &ip6->ip6_src;
 	else {
 		/*
 		 * When we are rewriting the inner IPv6 header, we need to
 		 * rewrite the destination address back to the external
 		 * prefix. The datagram in the ICMPv6 payload should look
 		 * like it was sent from the external prefix.
 		 */
 		addr = &ip6->ip6_dst;
 	}
 	idx = nptv6_search_index(cfg, addr);
 	if (idx < 0) {
 		/*
 		 * Do not send ICMPv6 error when offset isn't zero.
 		 * This means we are rewriting inner IPv6 header in the
 		 * ICMPv6 error message.
 		 */
 		if (offset == 0) {
 			icmp6_error2(*mp, ICMP6_DST_UNREACH,
 			    ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif);
 			/* The mbuf was handed to icmp6_error2(). */
 			*mp = NULL;
 		}
 		return (IP_FW_DENY);
 	}
 	/*
 	 * Read the adjustment word before the prefix is replaced, then
 	 * store the adjusted value; a result of 0xffff is stored as 0.
 	 */
 	adj = addr->s6_addr16[idx];
 	nptv6_copy_addr(&cfg->external, addr, &cfg->mask);
 	adj = cksum_add(adj, cfg->adjustment);
 	if (adj == 0xffff)
 		adj = 0;
 	addr->s6_addr16[idx] = adj;
 	if (offset == 0) {
 		/*
 		 * We may need to translate addresses in the inner IPv6
 		 * header for ICMPv6 error messages.
 		 */
 		proto = nptv6_getlasthdr(cfg, *mp, &offset);
 		if (proto < 0 || (proto == IPPROTO_ICMPV6 &&
 		    nptv6_translate_icmpv6(cfg, mp, offset) != 0))
 			return (IP_FW_DENY);
 		/* Only outer-header rewrites are counted. */
 		NPTV6STAT_INC(cfg, in2ex);
 	}
 	return (0);
 }
 
 /*
  * Rewrite an address from the external prefix to the internal prefix.
  *
  * cfg    - instance providing the internal prefix, mask and adjustment.
  * mp     - in/out mbuf pointer; set to NULL when the packet is consumed
  *          by icmp6_error2(), and possibly changed by the nested call to
  *          nptv6_translate_icmpv6().
  * offset - 0 for the outer header (the destination address is rewritten);
  *          a non-zero offset means the header embedded in an ICMPv6 error
  *          (the source address is rewritten).
  *
  * Returns 0 on success and IP_FW_DENY otherwise.
  */
 static int
 nptv6_rewrite_external(struct nptv6_cfg *cfg, struct mbuf **mp, int offset)
 {
 	struct in6_addr *addr;
 	struct ip6_hdr *ip6;
 	int idx, proto;
 	uint16_t adj;
 
 	ip6 = mtodo(*mp, offset);
 	NPTV6_IPDEBUG("offset %d, %s -> %s %d", offset,
 	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
 	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
 	    ip6->ip6_nxt);
 	if (offset == 0)
 		addr = &ip6->ip6_dst;
 	else {
 		/*
 		 * When we are rewriting the inner IPv6 header, we need to
 		 * rewrite the source address back to the internal prefix.
 		 * The datagram in the ICMPv6 payload should look like it
 		 * was sent from the internal prefix.
 		 */
 		addr = &ip6->ip6_src;
 	}
 	idx = nptv6_search_index(cfg, addr);
 	if (idx < 0) {
 		/*
 		 * Do not send ICMPv6 error when offset isn't zero.
 		 * This means we are rewriting inner IPv6 header in the
 		 * ICMPv6 error message.
 		 */
 		if (offset == 0) {
 			icmp6_error2(*mp, ICMP6_DST_UNREACH,
 			    ICMP6_DST_UNREACH_ADDR, 0, (*mp)->m_pkthdr.rcvif);
 			/* The mbuf was handed to icmp6_error2(). */
 			*mp = NULL;
 		}
 		return (IP_FW_DENY);
 	}
 	/*
 	 * Read the adjustment word before the prefix is replaced, then
 	 * apply the complemented adjustment; 0xffff is stored as 0.
 	 */
 	adj = addr->s6_addr16[idx];
 	nptv6_copy_addr(&cfg->internal, addr, &cfg->mask);
 	adj = cksum_add(adj, ~cfg->adjustment);
 	if (adj == 0xffff)
 		adj = 0;
 	addr->s6_addr16[idx] = adj;
 	if (offset == 0) {
 		/*
 		 * We may need to translate addresses in the inner IPv6
 		 * header for ICMPv6 error messages.
 		 */
 		proto = nptv6_getlasthdr(cfg, *mp, &offset);
 		if (proto < 0 || (proto == IPPROTO_ICMPV6 &&
 		    nptv6_translate_icmpv6(cfg, mp, offset) != 0))
 			return (IP_FW_DENY);
 		/* Only outer-header rewrites are counted. */
 		NPTV6STAT_INC(cfg, ex2in);
 	}
 	return (0);
 }
 
 /*
  * ipfw external action handler.
  *
  * chain - rule chain used to look up the instance.
  * args  - packet arguments; args->m may be replaced or set to NULL, and
  *         args->f_id addresses are refreshed after a successful rewrite
  *         when the search continues.
  * cmd   - the O_EXTERNAL_ACTION instruction; the instruction that follows
  *         it must be O_EXTERNAL_INSTANCE.
  * done  - set to 0 when the packet was not handled by this instance, to 1
  *         when it was dropped, and to V_fw_one_pass after a rewrite.
  *
  * Returns IP_FW_DENY when nothing was translated or translation failed,
  * and 0 when an address was rewritten.
  */
 static int
 ipfw_nptv6(struct ip_fw_chain *chain, struct ip_fw_args *args,
     ipfw_insn *cmd, int *done)
 {
 	struct ip6_hdr *ip6;
 	struct nptv6_cfg *cfg;
 	ipfw_insn *icmd;
 	int ret;
 
 	*done = 0; /* try next rule if not matched */
 	ret = IP_FW_DENY;
 	icmd = cmd + 1;
 	/* The instance must exist and have its external prefix set. */
 	if (cmd->opcode != O_EXTERNAL_ACTION ||
 	    cmd->arg1 != V_nptv6_eid ||
 	    icmd->opcode != O_EXTERNAL_INSTANCE ||
 	    (cfg = NPTV6_LOOKUP(chain, icmd)) == NULL ||
 	    (cfg->flags & NPTV6_READY) == 0)
 		return (ret);
 	/*
 	 * We need to act as a router, so when forwarding is disabled -
 	 * do nothing.
 	 */
 	if (V_ip6_forwarding == 0 || args->f_id.addr_type != 6)
 		return (ret);
 	/*
 	 * NOTE: we expect ipfw_chk() did m_pullup() up to upper level
 	 * protocol's headers. Also we skip some checks, that ip6_input(),
 	 * ip6_forward(), ip6_fastfwd() and ipfw_chk() already did.
 	 */
 	ip6 = mtod(args->m, struct ip6_hdr *);
 	NPTV6_IPDEBUG("eid %u, oid %u, %s -> %s %d",
 	    cmd->arg1, icmd->arg1,
 	    inet_ntop(AF_INET6, &ip6->ip6_src, _s, sizeof(_s)),
 	    inet_ntop(AF_INET6, &ip6->ip6_dst, _d, sizeof(_d)),
 	    ip6->ip6_nxt);
 	/* The direction is chosen by which prefix the packet matches. */
 	if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_src,
 	    &cfg->internal, &cfg->mask)) {
 		/*
 		 * XXX: Do not translate packets when both src and dst
 		 * are from internal prefix.
 		 */
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
 		    &cfg->internal, &cfg->mask))
 			return (ret);
 		ret = nptv6_rewrite_internal(cfg, &args->m, 0);
 	} else if (IN6_ARE_MASKED_ADDR_EQUAL(&ip6->ip6_dst,
 	    &cfg->external, &cfg->mask))
 		ret = nptv6_rewrite_external(cfg, &args->m, 0);
 	else
 		return (ret);
 	/*
 	 * If the address wasn't rewritten - free mbuf and terminate the
 	 * search.
 	 */
 	if (ret != 0) {
 		/* args->m is already NULL when the rewrite consumed it. */
 		if (args->m != NULL) {
 			m_freem(args->m);
 			args->m = NULL; /* mark mbuf as consumed */
 		}
 		NPTV6STAT_INC(cfg, dropped);
 		*done = 1;
 	} else {
 		/* Terminate the search if one_pass is set */
 		*done = V_fw_one_pass;
 		/* Update args->f_id when one_pass is off */
 		if (*done == 0) {
 			/* Reload: the rewrite may have replaced args->m. */
 			ip6 = mtod(args->m, struct ip6_hdr *);
 			args->f_id.src_ip6 = ip6->ip6_src;
 			args->f_id.dst_ip6 = ip6->ip6_dst;
 		}
 	}
 	return (ret);
 }
 
 /*
  * Allocate a zeroed instance with its counter array and fill in the
  * named-object fields (name, TLV type and set).  Both allocations use
  * M_WAITOK.
  */
 static struct nptv6_cfg *
 nptv6_alloc_config(const char *name, uint8_t set)
 {
 	struct nptv6_cfg *instance;
 
 	instance = malloc(sizeof(*instance), M_IPFW, M_WAITOK | M_ZERO);
 	COUNTER_ARRAY_ALLOC(instance->stats, NPTV6STATS, M_WAITOK);
 
 	/* The named object refers to the name stored in the instance. */
 	strlcpy(instance->name, name, sizeof(instance->name));
 	instance->no.set = set;
 	instance->no.etlv = IPFW_TLV_NPTV6_NAME;
 	instance->no.name = instance->name;
 	return (instance);
 }
 
 /*
  * Release an instance: first its counter array, then the structure
  * itself.
  */
 static void
 nptv6_free_config(struct nptv6_cfg *cfg)
 {
 
 	COUNTER_ARRAY_FREE(cfg->stats, NPTV6STATS);
 	free(cfg, M_IPFW);
 }
 
 /*
  * Fill the userland view uc from the kernel instance cfg.  With a
  * dynamic prefix the interface name is exported instead of the external
  * prefix.  The ch argument is not used.
  */
 static void
 nptv6_export_config(struct ip_fw_chain *ch, struct nptv6_cfg *cfg,
     ipfw_nptv6_cfg *uc)
 {
 
 	strlcpy(uc->name, cfg->no.name, sizeof(uc->name));
 	uc->set = cfg->no.set;
 	/* Only the flags inside NPTV6_FLAGSMASK are exported. */
 	uc->flags = cfg->flags & NPTV6_FLAGSMASK;
 	uc->plen = cfg->plen;
 	uc->internal = cfg->internal;
 	if ((cfg->flags & NPTV6_DYNAMIC_PREFIX) == 0)
 		uc->external = cfg->external;
 	else
 		memcpy(uc->if_name, cfg->if_name, IF_NAMESIZE);
 }
 
 /* Argument bundle handed to export_config_cb(). */
 struct nptv6_dump_arg {
 	struct ip_fw_chain *ch;		/* passed on to nptv6_export_config() */
 	struct sockopt_data *sd;	/* buffer the records are carved from */
 };
 
 /*
  * Named-object callback: reserve room for one ipfw_nptv6_cfg record in
  * the sockopt buffer and export the instance into it.  Always returns 0.
  */
 static int
 export_config_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct nptv6_dump_arg *dump = arg;
 	ipfw_nptv6_cfg *record;
 
 	record = (ipfw_nptv6_cfg *)ipfw_get_sopt_space(dump->sd,
 	    sizeof(*record));
 	nptv6_export_config(dump->ch, (struct nptv6_cfg *)no, record);
 	return (0);
 }
 
 /*
  * Look up an instance by name and set among the objects of the nptv6
  * TLV type; the lookup result is returned as is.
  */
 static struct nptv6_cfg *
 nptv6_find(struct namedobj_instance *ni, const char *name, uint8_t set)
 {
 
 	return ((struct nptv6_cfg *)ipfw_objhash_lookup_name_type(ni, set,
 	    IPFW_TLV_NPTV6_NAME, name));
 }
 
 static void
 nptv6_calculate_adjustment(struct nptv6_cfg *cfg)
 {
 	uint16_t i, e;
 	uint16_t *p;
 
 	/* Calculate checksum of internal prefix */
 	for (i = 0, p = (uint16_t *)&cfg->internal;
 	    p < (uint16_t *)(&cfg->internal + 1); p++)
 		i = cksum_add(i, *p);
 
 	/* Calculate checksum of external prefix */
 	for (e = 0, p = (uint16_t *)&cfg->external;
 	    p < (uint16_t *)(&cfg->external + 1); p++)
 		e = cksum_add(e, *p);
 
 	/* Adjustment value for Int->Ext direction */
 	cfg->adjustment = cksum_add(~e, i);
 }
 
 /*
  * Check whether addr may be used as an NPTv6 prefix.
  * Multicast, link-local, loopback and unspecified addresses are
  * rejected with EINVAL; anything else yields 0.
  */
 static int
 nptv6_check_prefix(const struct in6_addr *addr)
 {
 
 	if (IN6_IS_ADDR_MULTICAST(addr))
 		return (EINVAL);
 	if (IN6_IS_ADDR_LINKLOCAL(addr))
 		return (EINVAL);
 	if (IN6_IS_ADDR_LOOPBACK(addr))
 		return (EINVAL);
 	if (IN6_IS_ADDR_UNSPECIFIED(addr))
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Install addr as the external prefix of cfg: the copy is masked with
  * cfg->mask, the checksum adjustment is recomputed from the new prefix
  * and the instance is flagged NPTV6_READY.
  */
 static void
 nptv6_set_external(struct nptv6_cfg *cfg, struct in6_addr *addr)
 {
 
 	cfg->external = *addr;
 	IN6_MASK_ADDR(&cfg->external, &cfg->mask);
 	nptv6_calculate_adjustment(cfg);
 	cfg->flags |= NPTV6_READY;
 }
 
 /*
  * Try to determine what prefix to use as external for
  * configured interface name.
  *
  * When ifp is NULL the interface is looked up (and referenced) by
  * cfg->if_name; the function returns silently if it does not exist.
  * In every other path the reference on ifp is dropped with if_rele()
  * before returning, so a caller passing a non-NULL ifp must hold a
  * reference that this function consumes.
  *
  * The first IPv6 address on the interface that passes
  * nptv6_check_prefix() and does not fall into the internal prefix is
  * installed via nptv6_set_external().  If none qualifies, cfg is left
  * unchanged.
  */
 static void
 nptv6_find_prefix(struct ip_fw_chain *ch, struct nptv6_cfg *cfg,
     struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	struct in6_ifaddr *ia;
 
 	MPASS(cfg->flags & NPTV6_DYNAMIC_PREFIX);
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	if (ifp == NULL) {
 		ifp = ifunit_ref(cfg->if_name);
 		if (ifp == NULL)
 			return;
 	}
 	/* The address list is walked inside a network epoch section. */
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_INET6)
 			continue;
 		ia = (struct in6_ifaddr *)ifa;
 		/* Skip unusable addresses and ones inside the internal prefix. */
 		if (nptv6_check_prefix(&ia->ia_addr.sin6_addr) ||
 		    IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
 		    &cfg->internal, &cfg->mask))
 			continue;
 		/* Suitable address is found. */
 		nptv6_set_external(cfg, &ia->ia_addr.sin6_addr);
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 	if_rele(ifp);
 }
 
 /* Interface address event description passed to ifaddr_cb(). */
 struct ifaddr_event_args {
 	struct ifnet *ifp;		/* interface the event is about */
 	const struct in6_addr *addr;	/* IPv6 address added or removed */
 	int event;			/* IFADDR_EVENT_* value */
 };
 
 /*
  * Per-instance callback for interface address events; run for every
  * NPTv6 named object by nptv6_ifaddrevent_handler(), which holds the
  * chain UH write lock.  arg is a struct ifaddr_event_args.
  *
  * Only instances with NPTV6_DYNAMIC_PREFIX whose interface name matches
  * the event's interface are considered:
  *  - interface is dying: the instance is marked not ready;
  *  - IFADDR_EVENT_DEL: when the removed address lies within the current
  *    external prefix, the prefix is invalidated and a new one searched;
  *  - otherwise (address added): when the instance has no prefix yet and
  *    the new address is suitable, a prefix is searched.
  *
  * Always returns 0.
  */
 static int
 ifaddr_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct ifaddr_event_args *args;
 	struct ip_fw_chain *ch;
 	struct nptv6_cfg *cfg;
 
 	ch = &V_layer3_chain;
 	cfg = (struct nptv6_cfg *)SRV_OBJECT(ch, no->kidx);
 	if ((cfg->flags & NPTV6_DYNAMIC_PREFIX) == 0)
 		return (0);
 
 	args = arg;
 	/* If interface name doesn't match, ignore */
 	if (strncmp(args->ifp->if_xname, cfg->if_name, IF_NAMESIZE))
 		return (0);
 	if (args->ifp->if_flags & IFF_DYING) { /* XXX: is it possible? */
 		cfg->flags &= ~NPTV6_READY;
 		return (0);
 	}
 	if (args->event == IFADDR_EVENT_DEL) {
 		/* If instance is not ready, ignore */
 		if ((cfg->flags & NPTV6_READY) == 0)
 			return (0);
 		/*
 		 * If address does not match the external prefix, ignore.
 		 * IN6_ARE_MASKED_ADDR_EQUAL() is non-zero when the masked
 		 * addresses are equal, so a zero result means "no match".
 		 */
 		if (IN6_ARE_MASKED_ADDR_EQUAL(&cfg->external, args->addr,
 		    &cfg->mask) == 0)
 			return (0);
 		/* Otherwise clear READY flag */
 		cfg->flags &= ~NPTV6_READY;
 	} else {/* IFADDR_EVENT_ADD */
 		/* If instance is already ready, ignore */
 		if (cfg->flags & NPTV6_READY)
 			return (0);
 		/* If address is not suitable for prefix, ignore */
 		if (nptv6_check_prefix(args->addr) ||
 		    IN6_ARE_MASKED_ADDR_EQUAL(args->addr, &cfg->internal,
 		    &cfg->mask))
 			return (0);
 		/* FALLTHROUGH */
 	}
 	MPASS(!(cfg->flags & NPTV6_READY));
 	/*
 	 * Try to determine the prefix.  nptv6_find_prefix() releases the
 	 * interface reference it is given, hence the extra if_ref().
 	 */
 	if_ref(args->ifp);
 	nptv6_find_prefix(ch, cfg, args->ifp);
 	return (0);
 }
 
 /*
  * ifaddr_event_ext handler.  IPv6 address events are forwarded to every
  * NPTv6 instance through ifaddr_cb() while the chain UH write lock is
  * held; events for other address families are ignored.
  */
 static void
 nptv6_ifaddrevent_handler(void *arg __unused, struct ifnet *ifp,
     struct ifaddr *ifa, int event)
 {
 	struct ifaddr_event_args ev;
 	struct sockaddr_in6 *sin6;
 	struct ip_fw_chain *chain;
 
 	if (ifa->ifa_addr->sa_family != AF_INET6)
 		return;
 
 	sin6 = (struct sockaddr_in6 *)ifa->ifa_addr;
 	ev.event = event;
 	ev.addr = &sin6->sin6_addr;
 	ev.ifp = ifp;
 
 	chain = &V_layer3_chain;
 	IPFW_UH_WLOCK(chain);
 	ipfw_objhash_foreach_type(CHAIN_TO_SRV(chain), ifaddr_cb, &ev,
 	    IPFW_TLV_NPTV6_NAME);
 	IPFW_UH_WUNLOCK(chain);
 }
 
 /*
  * Creates new NPTv6 instance.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ipfw_nptv6_cfg ]
  *
  * Returns 0 on success, EINVAL for a malformed request or unusable
  * prefixes, EEXIST when the name is already used within the set and
  * ENOSPC when no kernel index can be allocated.
  */
 static int
 nptv6_create(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct in6_addr mask;
 	ipfw_obj_lheader *olh;
 	ipfw_nptv6_cfg *uc;
 	struct namedobj_instance *ni;
 	struct nptv6_cfg *cfg;
 
 	if (sd->valsize != sizeof(*olh) + sizeof(*uc))
 		return (EINVAL);
 
 	olh = (ipfw_obj_lheader *)sd->kbuf;
 	uc = (ipfw_nptv6_cfg *)(olh + 1);
 	if (ipfw_check_object_name_generic(uc->name) != 0)
 		return (EINVAL);
 	if (uc->plen < 8 || uc->plen > 64 || uc->set >= IPFW_MAX_SETS)
 		return (EINVAL);
 	if (nptv6_check_prefix(&uc->internal))
 		return (EINVAL);
 	in6_prefixlen2mask(&mask, uc->plen);
 	/* A static external prefix must be usable and differ from internal. */
 	if ((uc->flags & NPTV6_DYNAMIC_PREFIX) == 0 && (
 	    nptv6_check_prefix(&uc->external) ||
 	    IN6_ARE_MASKED_ADDR_EQUAL(&uc->external, &uc->internal, &mask)))
 		return (EINVAL);
 
 	/* Cheap early check, done before anything is allocated. */
 	ni = CHAIN_TO_SRV(ch);
 	IPFW_UH_RLOCK(ch);
 	if (nptv6_find(ni, uc->name, uc->set) != NULL) {
 		IPFW_UH_RUNLOCK(ch);
 		return (EEXIST);
 	}
 	IPFW_UH_RUNLOCK(ch);
 
 	cfg = nptv6_alloc_config(uc->name, uc->set);
 	cfg->plen = uc->plen;
 	cfg->flags = uc->flags & NPTV6_FLAGSMASK;
 	if (cfg->plen <= 48)
 		cfg->flags |= NPTV6_48PLEN;
 	cfg->mask = mask;
 	cfg->internal = uc->internal;
 	IN6_MASK_ADDR(&cfg->internal, &mask);
 	if (cfg->flags & NPTV6_DYNAMIC_PREFIX)
 		memcpy(cfg->if_name, uc->if_name, IF_NAMESIZE);
 	else
 		nptv6_set_external(cfg, &uc->external);
 
 	/* The address event handler is registered on first dynamic use. */
 	if ((uc->flags & NPTV6_DYNAMIC_PREFIX) != 0 &&
 	    nptv6_ifaddr_event == NULL)
 		nptv6_ifaddr_event = EVENTHANDLER_REGISTER(
 		    ifaddr_event_ext, nptv6_ifaddrevent_handler, NULL,
 		    EVENTHANDLER_PRI_ANY);
 
 	IPFW_UH_WLOCK(ch);
 	/*
 	 * The lock was dropped after the lookup above, so a concurrent
 	 * request may have created an instance with the same name in the
 	 * meantime.  Check again before linking the new object.
 	 */
 	if (nptv6_find(ni, uc->name, uc->set) != NULL) {
 		IPFW_UH_WUNLOCK(ch);
 		nptv6_free_config(cfg);
 		return (EEXIST);
 	}
 	if (ipfw_objhash_alloc_idx(ni, &cfg->no.kidx) != 0) {
 		IPFW_UH_WUNLOCK(ch);
 		nptv6_free_config(cfg);
 		return (ENOSPC);
 	}
 	ipfw_objhash_add(ni, &cfg->no);
 	SRV_OBJECT(ch, cfg->no.kidx) = cfg;
 	if (cfg->flags & NPTV6_DYNAMIC_PREFIX)
 		nptv6_find_prefix(ch, cfg, NULL);
 	IPFW_UH_WUNLOCK(ch);
 
 	return (0);
 }
 
 /*
  * Destroys NPTv6 instance.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ]
  *
  * Returns 0 on success, EINVAL for a malformed request, ESRCH when the
  * instance does not exist and EBUSY while it is still referenced.
  */
 static int
 nptv6_destroy(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct namedobj_instance *ni;
 	struct nptv6_cfg *cfg;
 	ipfw_obj_header *oh;
 	int error;
 
 	if (sd->valsize != sizeof(*oh))
 		return (EINVAL);
 
 	oh = (ipfw_obj_header *)sd->kbuf;
 	if (ipfw_check_object_name_generic(oh->ntlv.name) != 0)
 		return (EINVAL);
 
 	IPFW_UH_WLOCK(ch);
 	ni = CHAIN_TO_SRV(ch);
 	cfg = nptv6_find(ni, oh->ntlv.name, oh->ntlv.set);
 	if (cfg == NULL) {
 		error = ESRCH;
 	} else if (cfg->no.refcnt > 0) {
 		error = EBUSY;
 	} else {
 		/* Detach from rules, then unlink from the object hash. */
 		ipfw_reset_eaction_instance(ch, V_nptv6_eid, cfg->no.kidx);
 		SRV_OBJECT(ch, cfg->no.kidx) = NULL;
 		ipfw_objhash_del(ni, &cfg->no);
 		ipfw_objhash_free_idx(ni, cfg->no.kidx);
 		error = 0;
 	}
 	IPFW_UH_WUNLOCK(ch);
 
 	/* The memory is released only after the lock has been dropped. */
 	if (error == 0)
 		nptv6_free_config(cfg);
 	return (error);
 }
 
 /*
  * Get or change nptv6 instance config.
  * Request: [ ipfw_obj_header [ ipfw_nptv6_cfg ] ]
  *
  * Not implemented: the handler unconditionally returns EOPNOTSUPP.
  */
 static int
 nptv6_config(struct ip_fw_chain *chain, ip_fw3_opheader *op,
     struct sockopt_data *sd)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Lists all NPTv6 instances currently available in kernel.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ]
  * Reply: [ ipfw_obj_lheader ipfw_nptv6_cfg x N ]
  *
  * The list header is always filled with the instance count, the object
  * size and the total size required.  When the supplied buffer is smaller
  * than that size ENOMEM is returned and no instance is exported.
  *
  * Returns 0 on success, EINVAL when the request is too short or the
  * header cannot be obtained from the sockopt buffer.
  */
 static int
 nptv6_list(struct ip_fw_chain *ch, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_lheader *olh;
 	struct nptv6_dump_arg da;
 
 	/* Check minimum header size */
 	if (sd->valsize < sizeof(ipfw_obj_lheader))
 		return (EINVAL);
 
 	olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh));
 	/* Same defensive check as nptv6_stats() performs on its header. */
 	if (olh == NULL)
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(ch);
 	olh->count = ipfw_objhash_count_type(CHAIN_TO_SRV(ch),
 	    IPFW_TLV_NPTV6_NAME);
 	olh->objsize = sizeof(ipfw_nptv6_cfg);
 	olh->size = sizeof(*olh) + olh->count * olh->objsize;
 
 	if (sd->valsize < olh->size) {
 		IPFW_UH_RUNLOCK(ch);
 		return (ENOMEM);
 	}
 	memset(&da, 0, sizeof(da));
 	da.ch = ch;
 	da.sd = sd;
 	ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), export_config_cb,
 	    &da, IPFW_TLV_NPTV6_NAME);
 	IPFW_UH_RUNLOCK(ch);
 
 	return (0);
 }
 
 /* Copy one counter, fetched with NPTV6STAT_FETCH(), into the same-named field. */
 #define	__COPY_STAT_FIELD(_cfg, _stats, _field)	\
 	(_stats)->_field = NPTV6STAT_FETCH(_cfg, _field)
 /*
  * Export the in2ex, ex2in and dropped counters of an instance into the
  * userland statistics structure.
  */
 static void
 export_stats(struct ip_fw_chain *ch, struct nptv6_cfg *cfg,
     struct ipfw_nptv6_stats *stats)
 {
 
 	__COPY_STAT_FIELD(cfg, stats, in2ex);
 	__COPY_STAT_FIELD(cfg, stats, ex2in);
 	__COPY_STAT_FIELD(cfg, stats, dropped);
 }
 
 /*
  * Get NPTv6 statistics.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ]
  * Reply: [ ipfw_obj_header ipfw_obj_ctlv [ uint64_t x N ]]
  *
  * Returns 0 on success, EINVAL for a malformed request, ENOMEM when the
  * buffer cannot hold the reply and ESRCH when the instance is not found.
  */
 static int
 nptv6_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
     struct sockopt_data *sd)
 {
 	struct ipfw_nptv6_stats stats;
 	struct nptv6_cfg *cfg;
 	ipfw_obj_header *oh;
 	ipfw_obj_ctlv *ctlv;
 	size_t sz;
 
 	/* Size of the complete reply: header, ctlv and the counters. */
 	sz = sizeof(ipfw_obj_header) + sizeof(ipfw_obj_ctlv) + sizeof(stats);
 	if (sd->valsize % sizeof(uint64_t))
 		return (EINVAL);
 	if (sd->valsize < sz)
 		return (ENOMEM);
 	oh = (ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
 	if (oh == NULL)
 		return (EINVAL);
 	if (ipfw_check_object_name_generic(oh->ntlv.name) != 0 ||
 	    oh->ntlv.set >= IPFW_MAX_SETS)
 		return (EINVAL);
 	memset(&stats, 0, sizeof(stats));
 
 	/* Snapshot the counters into a local copy under the read lock. */
 	IPFW_UH_RLOCK(ch);
 	cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
 	if (cfg == NULL) {
 		IPFW_UH_RUNLOCK(ch);
 		return (ESRCH);
 	}
 	export_stats(ch, cfg, &stats);
 	IPFW_UH_RUNLOCK(ch);
 
 	/* The ctlv follows the object header; counters follow the ctlv. */
 	ctlv = (ipfw_obj_ctlv *)(oh + 1);
 	memset(ctlv, 0, sizeof(*ctlv));
 	ctlv->head.type = IPFW_TLV_COUNTERS;
 	ctlv->head.length = sz - sizeof(ipfw_obj_header);
 	ctlv->count = sizeof(stats) / sizeof(uint64_t);
 	ctlv->objsize = sizeof(uint64_t);
 	ctlv->version = 1;
 	memcpy(ctlv + 1, &stats, sizeof(stats));
 	return (0);
 }
 
 /*
  * Reset NPTv6 statistics.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ]
  *
  * Returns 0 on success, EINVAL for a malformed request and ESRCH when
  * the named instance does not exist.
  */
 static int
 nptv6_reset_stats(struct ip_fw_chain *ch, ip_fw3_opheader *op,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	struct nptv6_cfg *cfg;
 	int error;
 
 	if (sd->valsize != sizeof(*oh))
 		return (EINVAL);
 	oh = (ipfw_obj_header *)sd->kbuf;
 	if (ipfw_check_object_name_generic(oh->ntlv.name) != 0)
 		return (EINVAL);
 	if (oh->ntlv.set >= IPFW_MAX_SETS)
 		return (EINVAL);
 
 	error = ESRCH;
 	IPFW_UH_WLOCK(ch);
 	cfg = nptv6_find(CHAIN_TO_SRV(ch), oh->ntlv.name, oh->ntlv.set);
 	if (cfg != NULL) {
 		COUNTER_ARRAY_ZERO(cfg->stats, NPTV6STATS);
 		error = 0;
 	}
 	IPFW_UH_WUNLOCK(ch);
 
 	return (error);
 }
 
 /*
  * Socket option handlers of this module, one per IP_FW_NPTV6_* opcode.
  * Registered in nptv6_init() and removed in nptv6_uninit().
  */
 static struct ipfw_sopt_handler	scodes[] = {
 	{ IP_FW_NPTV6_CREATE, 0,	HDIR_SET,	nptv6_create },
 	{ IP_FW_NPTV6_DESTROY,0,	HDIR_SET,	nptv6_destroy },
 	{ IP_FW_NPTV6_CONFIG, 0,	HDIR_BOTH,	nptv6_config },
 	{ IP_FW_NPTV6_LIST,   0,	HDIR_GET,	nptv6_list },
 	{ IP_FW_NPTV6_STATS,  0,	HDIR_GET,	nptv6_stats },
 	{ IP_FW_NPTV6_RESET_STATS,0,	HDIR_SET,	nptv6_reset_stats },
 };
 
 /*
  * Rewriter classifier.  The instruction is ours only when the opcode
  * immediately preceding it is O_EXTERNAL_ACTION carrying the nptv6
  * external action id.  On a match the instance index (cmd->arg1) is
  * stored in *puidx, *ptype is set to 0 and 0 is returned; otherwise 1.
  */
 static int
 nptv6_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
 {
 	ipfw_insn *prev;
 
 	prev = cmd - 1;
 	NPTV6_DEBUG("opcode %d, arg1 %d, opcode0 %d, arg1 %d",
 	    cmd->opcode, cmd->arg1, prev->opcode, prev->arg1);
 	if (prev->opcode == O_EXTERNAL_ACTION &&
 	    prev->arg1 == V_nptv6_eid) {
 		*puidx = cmd->arg1;
 		*ptype = 0;
 		return (0);
 	}
 	return (1);
 }
 
 /* Rewriter hook: store the given index in the instruction's arg1. */
 static void
 nptv6_update_arg1(ipfw_insn *cmd, uint16_t idx)
 {
 
 	cmd->arg1 = idx;
 	NPTV6_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1);
 }
 
 /*
  * Rewriter hook: resolve the object described by ti among the NPTv6
  * named objects of the chain.  The result of ipfw_objhash_find_type()
  * is returned unchanged; the object, if any, is stored through pno.
  */
 static int
 nptv6_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
     struct named_object **pno)
 {
 	int err;
 
 	err = ipfw_objhash_find_type(CHAIN_TO_SRV(ch), ti,
 	    IPFW_TLV_NPTV6_NAME, pno);
 	NPTV6_DEBUG("uidx %u, type %u, err %d", ti->uidx, ti->type, err);
 	return (err);
 }
 
 /*
  * Rewriter hook: return the named object registered under kernel index
  * idx.  The caller must hold the chain UH write lock; a missing object
  * trips a KASSERT.
  */
 static struct named_object *
 nptv6_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
 {
 	struct named_object *obj;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 	obj = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx);
 	KASSERT(obj != NULL, ("NPT with index %d not found", idx));
 
 	NPTV6_DEBUG("kidx %u -> %s", idx, obj->name);
 	return (obj);
 }
 
 /*
  * Rewriter hook: delegate set management for NPTv6 named objects to
  * ipfw_obj_manage_sets() and return its result.
  */
 static int
 nptv6_manage_sets(struct ip_fw_chain *ch, uint16_t set, uint8_t new_set,
     enum ipfw_sets_cmd cmd)
 {
 
 	return (ipfw_obj_manage_sets(CHAIN_TO_SRV(ch), IPFW_TLV_NPTV6_NAME,
 	    set, new_set, cmd));
 }
 
 /*
  * Object rewriter description for O_EXTERNAL_INSTANCE opcodes that
  * refer to NPTv6 instances.  Registered in nptv6_init() and removed in
  * nptv6_uninit().
  */
 static struct opcode_obj_rewrite opcodes[] = {
 	{
 		.opcode	= O_EXTERNAL_INSTANCE,
 		.etlv = IPFW_TLV_EACTION /* just show it isn't table */,
 		.classifier = nptv6_classify,
 		.update = nptv6_update_arg1,
 		.find_byname = nptv6_findbyname,
 		.find_bykidx = nptv6_findbykidx,
 		.manage_sets = nptv6_manage_sets,
 	},
 };
 
 /*
  * Object-hash iteration callback used by nptv6_uninit(): unlink the
  * visited instance from the chain (service object slot, object hash,
  * kernel index) and free it.  arg is the chain, whose UH write lock
  * must be held.  Always returns 0.
  */
 static int
 destroy_config_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct nptv6_cfg *cfg;
 	struct ip_fw_chain *ch;
 
 	ch = (struct ip_fw_chain *)arg;
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	cfg = (struct nptv6_cfg *)SRV_OBJECT(ch, no->kidx);
 	SRV_OBJECT(ch, no->kidx) = NULL;
 	ipfw_objhash_del(ni, &cfg->no);
 	ipfw_objhash_free_idx(ni, cfg->no.kidx);
 	nptv6_free_config(cfg);
 	return (0);
 }
 
 /*
  * Module initialisation for a chain: register the "nptv6" external
  * action, then the socket option handlers and the opcode rewriter
  * (both are passed the `first' flag).
  *
  * Returns 0 on success or ENXIO when the external action could not be
  * registered.
  */
 int
 nptv6_init(struct ip_fw_chain *ch, int first)
 {
 
 	V_nptv6_eid = ipfw_add_eaction(ch, ipfw_nptv6, "nptv6");
 	if (V_nptv6_eid == 0)
 		return (ENXIO);
 	IPFW_ADD_SOPT_HANDLER(first, scodes);
 	IPFW_ADD_OBJ_REWRITER(first, opcodes);
 	return (0);
 }
 
 /*
  * Module teardown for a chain: deregister the address event handler
  * (only when `last' is set and it was registered), the rewriter, the
  * socket option handlers and the external action, then destroy every
  * remaining NPTv6 instance.
  */
 void
 nptv6_uninit(struct ip_fw_chain *ch, int last)
 {
 
 	if (last && nptv6_ifaddr_event != NULL)
 		EVENTHANDLER_DEREGISTER(ifaddr_event_ext, nptv6_ifaddr_event);
 	IPFW_DEL_OBJ_REWRITER(last, opcodes);
 	IPFW_DEL_SOPT_HANDLER(last, scodes);
 	ipfw_del_eaction(ch, V_nptv6_eid);
 	/*
 	 * Since we already have deregistered external action,
 	 * our named objects become inaccessible via rules, because
 	 * all rules were truncated by ipfw_del_eaction().
 	 * So, we can unlink and destroy our named objects without holding
 	 * IPFW_WLOCK().
 	 */
 	IPFW_UH_WLOCK(ch);
 	ipfw_objhash_foreach_type(CHAIN_TO_SRV(ch), destroy_config_cb, ch,
 	    IPFW_TLV_NPTV6_NAME);
 	V_nptv6_eid = 0;
 	IPFW_UH_WUNLOCK(ch);
 }
diff --git a/sys/netpfil/pf/if_pflog.c b/sys/netpfil/pf/if_pflog.c
index b4f34cd13fba..2f687e901a71 100644
--- a/sys/netpfil/pf/if_pflog.c
+++ b/sys/netpfil/pf/if_pflog.c
@@ -1,336 +1,337 @@
 /*-
  * SPDX-License-Identifier: ISC
  *
  * The authors of this code are John Ioannidis (ji@tla.org),
  * Angelos D. Keromytis (kermit@csd.uch.gr) and
  * Niels Provos (provos@physnet.uni-hamburg.de).
  *
  * This code was written by John Ioannidis for BSD/OS in Athens, Greece,
  * in November 1995.
  *
  * Ported to OpenBSD and NetBSD, with additional transforms, in December 1996,
  * by Angelos D. Keromytis.
  *
  * Additional transforms and features in 1997 and 1998 by Angelos D. Keromytis
  * and Niels Provos.
  *
  * Copyright (C) 1995, 1996, 1997, 1998 by John Ioannidis, Angelos D. Keromytis
  * and Niels Provos.
  * Copyright (c) 2001, Angelos D. Keromytis, Niels Provos.
  *
  * Permission to use, copy, and modify this software with or without fee
  * is hereby granted, provided that this entire notice is included in
  * all copies of any software which is or includes a copy or
  * modification of this software.
  * You may use this code under the GNU public license if you so wish. Please
  * contribute changes back to the authors under this freer than GPL license
  * so that we may further the use of strong encryption without limitations to
  * all.
  *
  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR
  * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY
  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE
  * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR
  * PURPOSE.
  *
  *	$OpenBSD: if_pflog.c,v 1.26 2007/10/18 21:58:18 mpf Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_bpf.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_pflog.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 #include <net/pfvar.h>
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #endif
 #ifdef	INET
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #endif
 
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #endif /* INET6 */
 
 #ifdef INET
 #include <machine/in_cksum.h>
 #endif /* INET */
 
 /* MTU assigned to every pflog interface in pflog_clone_create(). */
 #define PFLOGMTU	(32768 + MHLEN + MLEN)
 
 /*
  * Debug printf: prints only when built with PFLOGDEBUG and the
  * pflogdebug variable is non-zero; expands to nothing otherwise.
  * The argument is a complete parenthesised printf argument list.
  */
 #ifdef PFLOGDEBUG
 #define DPRINTF(x)    do { if (pflogdebug) printf x ; } while (0)
 #else
 #define DPRINTF(x)
 #endif
 
 static int	pflogoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static void	pflogattach(int);
 static int	pflogioctl(struct ifnet *, u_long, caddr_t);
 static void	pflogstart(struct ifnet *);
 static int	pflog_clone_create(struct if_clone *, char *, size_t,
 		    struct ifc_data *, struct ifnet **);
 static int	pflog_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
 
 /* Interface/cloner name; also used as the module name. */
 static const char pflogname[] = "pflog";
 
 /* Per-vnet cloner handle, set in pflogattach(). */
 VNET_DEFINE_STATIC(struct if_clone *, pflog_cloner);
 #define	V_pflog_cloner		VNET(pflog_cloner)
 
 /* Per-vnet table of pflog interfaces, indexed by unit number. */
 VNET_DEFINE(struct ifnet *, pflogifs[PFLOGIFS_MAX]);	/* for fast access */
 #define	V_pflogifs		VNET(pflogifs)
 
 /*
  * Per-vnet attach: clear the interface table, register the "pflog"
  * cloner and create unit 0.  The argument is unused.
  */
 static void
 pflogattach(int npflog __unused)
 {
 	struct if_clone_addreq req = {
 		.create_f = pflog_clone_create,
 		.destroy_f = pflog_clone_destroy,
 		.flags = IFC_F_AUTOUNIT,
 	};
 	struct ifc_data ifd = { .unit = 0 };
 	int unit;
 
 	for (unit = 0; unit < PFLOGIFS_MAX; unit++)
 		V_pflogifs[unit] = NULL;
 
 	V_pflog_cloner = ifc_attach_cloner(pflogname, &req);
 	ifc_create_ifp(pflogname, &ifd, NULL);
 }
 
 /*
  * Cloner create callback: allocate and attach pflog interface number
  * ifd->unit, attach it to BPF with DLT_PFLOG and record it in
  * V_pflogifs.  The new interface is returned through ifpp.
  *
  * Returns 0 on success, EINVAL when the unit is not below
  * PFLOGIFS_MAX and ENOSPC when if_alloc() fails.
  */
 static int
 pflog_clone_create(struct if_clone *ifc, char *name, size_t maxlen,
     struct ifc_data *ifd, struct ifnet **ifpp)
 {
 	struct ifnet *ifp;
 
 	if (ifd->unit >= PFLOGIFS_MAX)
 		return (EINVAL);
 
 	ifp = if_alloc(IFT_PFLOG);
 	if (ifp == NULL) {
 		return (ENOSPC);
 	}
 	if_initname(ifp, pflogname, ifd->unit);
 	ifp->if_mtu = PFLOGMTU;
 	ifp->if_ioctl = pflogioctl;
 	ifp->if_output = pflogoutput;
 	ifp->if_start = pflogstart;
 	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_hdrlen = PFLOG_HDRLEN;
 	if_attach(ifp);
 
 	bpfattach(ifp, DLT_PFLOG, PFLOG_HDRLEN);
 
 	/* Publish the interface only once it is fully set up. */
 	V_pflogifs[ifd->unit] = ifp;
 	*ifpp = ifp;
 
 	return (0);
 }
 
 /*
  * Cloner destroy callback.  Unit 0 can only be destroyed when
  * IFC_F_FORCE is given (EINVAL otherwise).  The interface is removed
  * from V_pflogifs, detached from BPF and the stack, and freed.
  * Returns 0 on success.
  */
 static int
 pflog_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
 {
 	int unit;
 
 	if ((flags & IFC_F_FORCE) == 0 && ifp->if_dunit == 0)
 		return (EINVAL);
 
 	/* Drop every table slot that still refers to this interface. */
 	for (unit = 0; unit < PFLOGIFS_MAX; unit++) {
 		if (V_pflogifs[unit] != ifp)
 			continue;
 		V_pflogifs[unit] = NULL;
 	}
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	if_free(ifp);
 
 	return (0);
 }
 
 /*
  * Start output on the pflog interface: every mbuf found on the send
  * queue is dequeued and freed until the queue is empty.
  */
 static void
 pflogstart(struct ifnet *ifp)
 {
 	struct mbuf *m;
 	int drained;
 
 	drained = 0;
 	while (!drained) {
 		/* The queue lock is held only around the dequeue itself. */
 		IF_LOCK(&ifp->if_snd);
 		_IF_DEQUEUE(&ifp->if_snd, m);
 		IF_UNLOCK(&ifp->if_snd);
 
 		if (m != NULL)
 			m_freem(m);
 		else
 			drained = 1;
 	}
 }
 
 /*
  * if_output method: the mbuf is freed and 0 is returned; nothing is
  * transmitted.
  */
 static int
 pflogoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *rt)
 {
 	m_freem(m);
 	return (0);
 }
 
 /*
  * if_ioctl method.  Only SIOCSIFFLAGS is handled: IFF_DRV_RUNNING is
  * made to follow IFF_UP.  Every other command yields ENOTTY.
  */
 /* ARGSUSED */
 static int
 pflogioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 
 	if (cmd != SIOCSIFFLAGS)
 		return (ENOTTY);
 
 	if ((ifp->if_flags & IFF_UP) != 0)
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	else
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	return (0);
 }
 
 /*
  * Log packet m to the pflog interface selected by rm->logif: a
  * struct pfloghdr describing the match is built on the stack and handed,
  * together with the packet, to BPF_MTAP2().
  *
  * am, when not NULL, is reported as the rule number with rm as the
  * sub-rule and the name of ruleset's anchor (when available) as the
  * ruleset; otherwise rm is the rule and the sub-rule number is -1.
  *
  * Returns 1 when kif, m, rm or pd is NULL, 0 in all other cases,
  * including when the target interface does not exist or has if_bpf unset.
  */
 static int
 pflog_packet(struct pfi_kkif *kif, struct mbuf *m, sa_family_t af, u_int8_t dir,
     u_int8_t reason, struct pf_krule *rm, struct pf_krule *am,
     struct pf_kruleset *ruleset, struct pf_pdesc *pd, int lookupsafe)
 {
 	struct ifnet *ifn;
 	struct pfloghdr hdr;
 
 	if (kif == NULL || m == NULL || rm == NULL || pd == NULL)
 		return ( 1);
 
 	if ((ifn = V_pflogifs[rm->logif]) == NULL || !ifn->if_bpf)
 		return (0);
 
 	bzero(&hdr, sizeof(hdr));
 	hdr.length = PFLOG_REAL_HDRLEN;
 	hdr.af = af;
 	hdr.action = rm->action;
 	hdr.reason = reason;
 	memcpy(hdr.ifname, kif->pfik_name, sizeof(hdr.ifname));
 
 	if (am == NULL) {
 		hdr.rulenr = htonl(rm->nr);
 		hdr.subrulenr = -1;
 	} else {
 		hdr.rulenr = htonl(am->nr);
 		hdr.subrulenr = htonl(rm->nr);
 		if (ruleset != NULL && ruleset->anchor != NULL)
 			strlcpy(hdr.ruleset, ruleset->anchor->name,
 			    sizeof(hdr.ruleset));
 	}
 	hdr.ridentifier = htonl(rm->ridentifier);
 	/*
 	 * XXXGL: we avoid pf_socket_lookup() when we are holding
 	 * state lock, since this leads to unsafe LOR.
 	 * These conditions are very very rare, however.
 	 */
 	if (rm->log & PF_LOG_SOCKET_LOOKUP && !pd->lookup.done && lookupsafe)
 		pd->lookup.done = pf_socket_lookup(dir, pd, m);
 	/* Without a successful lookup the uid is reported as UID_MAX. */
 	if (pd->lookup.done > 0)
 		hdr.uid = pd->lookup.uid;
 	else
 		hdr.uid = UID_MAX;
 	hdr.pid = NO_PID;
 	hdr.rule_uid = rm->cuid;
 	hdr.rule_pid = rm->cpid;
 	hdr.dir = dir;
 
 #ifdef INET
 	/* For outbound IPv4 the header checksum is recomputed in place. */
 	if (af == AF_INET && dir == PF_OUT) {
 		struct ip *ip;
 
 		ip = mtod(m, struct ip *);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(m, ip->ip_hl << 2);
 	}
 #endif /* INET */
 
 	if_inc_counter(ifn, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(ifn, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 	BPF_MTAP2(ifn, &hdr, PFLOG_HDRLEN, m);
 
 	return (0);
 }
 
 /* Per-vnet initialisation hook: attach the pflog cloner and pflog0. */
 static void
 vnet_pflog_init(const void *unused __unused)
 {
 
 	pflogattach(1);
 }
 VNET_SYSINIT(vnet_pflog_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
     vnet_pflog_init, NULL);
 
 /* Per-vnet teardown hook: detach the pflog cloner. */
 static void
 vnet_pflog_uninit(const void *unused __unused)
 {
 
 	ifc_detach_cloner(V_pflog_cloner);
 }
 /*
  * Detach after pf is gone; otherwise we might touch pflog memory
  * from within pf after freeing pflog.
  */
 VNET_SYSUNINIT(vnet_pflog_uninit, SI_SUB_INIT_IF, SI_ORDER_SECOND,
     vnet_pflog_uninit, NULL);
 
 /*
  * Module event handler.  On load the pf logging hook
  * (pflog_packet_ptr) is pointed at pflog_packet(), on unload it is
  * cleared; both updates happen under the pf rules write lock.
  * Any other event type yields EOPNOTSUPP.
  */
 static int
 pflog_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 	case MOD_UNLOAD:
 		PF_RULES_WLOCK();
 		if (type == MOD_LOAD)
 			pflog_packet_ptr = pflog_packet;
 		else
 			pflog_packet_ptr = NULL;
 		PF_RULES_WUNLOCK();
 		return (0);
 	default:
 		return (EOPNOTSUPP);
 	}
 }
 
 /* Module description: name, event handler, no private data. */
 static moduledata_t pflog_mod = { pflogname, pflog_modevent, 0 };
 
 #define PFLOG_MODVER 1
 
 /* Do not run before pf is initialized as we depend on its locks. */
 DECLARE_MODULE(pflog, pflog_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
 MODULE_VERSION(pflog, PFLOG_MODVER);
 /* pflog requires exactly version PF_MODVER of the pf module. */
 MODULE_DEPEND(pflog, pf, PF_MODVER, PF_MODVER, PF_MODVER);
diff --git a/sys/netpfil/pf/if_pfsync.c b/sys/netpfil/pf/if_pfsync.c
index 61308a35a7e1..3aa9bbfb633f 100644
--- a/sys/netpfil/pf/if_pfsync.c
+++ b/sys/netpfil/pf/if_pfsync.c
@@ -1,2699 +1,2700 @@
 /*-
  * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND ISC)
  *
  * Copyright (c) 2002 Michael Shalayeff
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2009 David Gwynne <dlg@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 /*
  * $OpenBSD: if_pfsync.c,v 1.110 2009/02/24 05:39:19 dlg Exp $
  *
  * Revisions picked from OpenBSD after revision 1.110 import:
  * 1.119 - don't m_copydata() beyond the len of mbuf in pfsync_input()
  * 1.118, 1.124, 1.148, 1.149, 1.151, 1.171 - fixes to bulk updates
  * 1.120, 1.175 - use monotonic time_uptime
  * 1.122 - reduce number of updates for non-TCP sessions
  * 1.125, 1.127 - rewrite merge or stale processing
  * 1.128 - cleanups
  * 1.146 - bzero() mbuf before sparsely filling it with data
  * 1.170 - SIOCSIFMTU checks
  * 1.126, 1.142 - deferred packets processing
  * 1.173 - correct expire time processing
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/nv.h>
 #include <sys/priv.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/vnet.h>
 #include <net/pfvar.h>
 #include <net/if_pfsync.h>
 
 #include <netinet/if_ether.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 
 #include <netpfil/pf/pfsync_nv.h>
 
 struct pfsync_bucket;
 
 /* Header template storage; currently only an IPv4 header is covered. */
 union inet_template {
 	struct ip      ipv4;
 };
 
 /* Size of the template plus the pfsync header and one subheader. */
 #define PFSYNC_MINPKT ( \
 	sizeof(union inet_template) + \
 	sizeof(struct pfsync_header) + \
 	sizeof(struct pfsync_subheader) )
 
 static int	pfsync_upd_tcp(struct pf_kstate *, struct pfsync_state_peer *,
 		    struct pfsync_state_peer *);
 static int	pfsync_in_clr(struct mbuf *, int, int, int);
 static int	pfsync_in_ins(struct mbuf *, int, int, int);
 static int	pfsync_in_iack(struct mbuf *, int, int, int);
 static int	pfsync_in_upd(struct mbuf *, int, int, int);
 static int	pfsync_in_upd_c(struct mbuf *, int, int, int);
 static int	pfsync_in_ureq(struct mbuf *, int, int, int);
 static int	pfsync_in_del(struct mbuf *, int, int, int);
 static int	pfsync_in_del_c(struct mbuf *, int, int, int);
 static int	pfsync_in_bus(struct mbuf *, int, int, int);
 static int	pfsync_in_tdb(struct mbuf *, int, int, int);
 static int	pfsync_in_eof(struct mbuf *, int, int, int);
 static int	pfsync_in_error(struct mbuf *, int, int, int);
 
 /*
  * Input handlers, one per pfsync action; the trailing comment on each
  * entry names the PFSYNC_ACT_* value it serves.
  * NOTE(review): the table is presumably indexed by action number, so
  * the order must follow the PFSYNC_ACT_* definitions - confirm.
  */
 static int (*pfsync_acts[])(struct mbuf *, int, int, int) = {
 	pfsync_in_clr,			/* PFSYNC_ACT_CLR */
 	pfsync_in_ins,			/* PFSYNC_ACT_INS */
 	pfsync_in_iack,			/* PFSYNC_ACT_INS_ACK */
 	pfsync_in_upd,			/* PFSYNC_ACT_UPD */
 	pfsync_in_upd_c,		/* PFSYNC_ACT_UPD_C */
 	pfsync_in_ureq,			/* PFSYNC_ACT_UPD_REQ */
 	pfsync_in_del,			/* PFSYNC_ACT_DEL */
 	pfsync_in_del_c,		/* PFSYNC_ACT_DEL_C */
 	pfsync_in_error,		/* PFSYNC_ACT_INS_F */
 	pfsync_in_error,		/* PFSYNC_ACT_DEL_F */
 	pfsync_in_bus,			/* PFSYNC_ACT_BUS */
 	pfsync_in_tdb,			/* PFSYNC_ACT_TDB */
 	pfsync_in_eof			/* PFSYNC_ACT_EOF */
 };
 
 /* Description of one output queue type (see pfsync_qs[]). */
 struct pfsync_q {
 	void		(*write)(struct pf_kstate *, void *);	/* message writer */
 	size_t		len;		/* size of one message */
 	u_int8_t	action;		/* PFSYNC_ACT_* of the messages */
 };
 
 /* we have one of these for every PFSYNC_S_ */
 static void	pfsync_out_state(struct pf_kstate *, void *);
 static void	pfsync_out_iack(struct pf_kstate *, void *);
 static void	pfsync_out_upd_c(struct pf_kstate *, void *);
 static void	pfsync_out_del(struct pf_kstate *, void *);
 
 /*
  * Output queue descriptions: writer function, message size and action.
  * NOTE(review): presumably indexed by the PFSYNC_S_* queue number -
  * confirm against the PFSYNC_S_* definitions.
  */
 static struct pfsync_q pfsync_qs[] = {
 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_INS },
 	{ pfsync_out_iack,  sizeof(struct pfsync_ins_ack), PFSYNC_ACT_INS_ACK },
 	{ pfsync_out_state, sizeof(struct pfsync_state),   PFSYNC_ACT_UPD },
 	{ pfsync_out_upd_c, sizeof(struct pfsync_upd_c),   PFSYNC_ACT_UPD_C },
 	{ pfsync_out_del,   sizeof(struct pfsync_del_c),   PFSYNC_ACT_DEL_C }
 };
 
 static void	pfsync_q_ins(struct pf_kstate *, int, bool);
 static void	pfsync_q_del(struct pf_kstate *, bool, struct pfsync_bucket *);
 
 static void	pfsync_update_state(struct pf_kstate *);
 
 /* List element wrapping one update request message. */
 struct pfsync_upd_req_item {
 	TAILQ_ENTRY(pfsync_upd_req_item)	ur_entry;	/* list linkage */
 	struct pfsync_upd_req			ur_msg;		/* the request */
 };
 
 /*
  * One deferral record: a state/mbuf pair with its own callout.
  * NOTE(review): field roles below are read off the names and types
  * only; confirm against the pfsync_defer*() functions.
  */
 struct pfsync_deferral {
 	struct pfsync_softc		*pd_sc;		/* pfsync softc */
 	TAILQ_ENTRY(pfsync_deferral)	pd_entry;	/* list linkage */
 	u_int				pd_refs;
 	struct callout			pd_tmo;		/* callout */
 
 	struct pf_kstate		*pd_st;		/* state */
 	struct mbuf			*pd_m;		/* mbuf */
 };
 
 /* Forward declaration; struct pfsync_bucket refers to it through b_sc. */
 struct pfsync_softc;
 
 /*
  * One pfsync bucket with its own mutex, callout, state queues, update
  * request list and deferral list.
  * NOTE(review): field roles are read off the names and types only.
  */
 struct pfsync_bucket
 {
 	int			b_id;
 	struct pfsync_softc	*b_sc;
 	struct mtx		b_mtx;
 	struct callout		b_tmo;
 	int			b_flags;
 #define	PFSYNCF_BUCKET_PUSH	0x00000001
 
 	size_t			b_len;
 	TAILQ_HEAD(, pf_kstate)			b_qs[PFSYNC_S_COUNT];
 	TAILQ_HEAD(, pfsync_upd_req_item)	b_upd_req_list;
 	TAILQ_HEAD(, pfsync_deferral)		b_deferrals;
 	u_int			b_deferred;
 	void			*b_plus;
 	size_t			b_pluslen;
 
 	struct  ifaltq b_snd;
 };
 
 /*
  * Per-interface pfsync state, grouped into configuration, queued data
  * (the bucket array) and bulk update bookkeeping.
  */
 struct pfsync_softc {
 	/* Configuration */
 	struct ifnet		*sc_ifp;
 	struct ifnet		*sc_sync_if;
 	struct ip_moptions	sc_imo;
 	struct sockaddr_storage	sc_sync_peer;
 	uint32_t		sc_flags;
 	uint8_t			sc_maxupdates;
 	union inet_template     sc_template;
 	struct mtx		sc_mtx;		/* see PFSYNC_LOCK() */
 
 	/* Queued data */
 	struct pfsync_bucket	*sc_buckets;
 
 	/* Bulk update info */
 	struct mtx		sc_bulk_mtx;	/* see PFSYNC_BLOCK() */
 	uint32_t		sc_ureq_sent;
 	int			sc_bulk_tries;
 	uint32_t		sc_ureq_received;
 	int			sc_bulk_hashid;
 	uint64_t		sc_bulk_stateid;
 	uint32_t		sc_bulk_creatorid;
 	struct callout		sc_bulk_tmo;
 	struct callout		sc_bulkfail_tmo;
 };
 
 /* Lock, unlock and ownership assertion for the softc mutex (sc_mtx). */
 #define	PFSYNC_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
 #define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
 #define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 
 /* The same three operations for a bucket's mutex (b_mtx). */
 #define PFSYNC_BUCKET_LOCK(b)		mtx_lock(&(b)->b_mtx)
 #define PFSYNC_BUCKET_UNLOCK(b)		mtx_unlock(&(b)->b_mtx)
 #define PFSYNC_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->b_mtx, MA_OWNED)
 
 /* The same three operations for the bulk update mutex (sc_bulk_mtx). */
 #define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
 
 /* Name string; also used as the short name of the M_PFSYNC malloc type. */
 static const char pfsyncname[] = "pfsync";
 static MALLOC_DEFINE(M_PFSYNC, pfsyncname, "pfsync(4) data");
 /* Per-vnet variables, each with its V_ accessor macro. */
 VNET_DEFINE_STATIC(struct pfsync_softc	*, pfsyncif) = NULL;
 #define	V_pfsyncif		VNET(pfsyncif)
 VNET_DEFINE_STATIC(void *, pfsync_swi_cookie) = NULL;
 #define	V_pfsync_swi_cookie	VNET(pfsync_swi_cookie)
 VNET_DEFINE_STATIC(struct intr_event *, pfsync_swi_ie);
 #define	V_pfsync_swi_ie		VNET(pfsync_swi_ie)
 VNET_DEFINE_STATIC(struct pfsyncstats, pfsyncstats);
 #define	V_pfsyncstats		VNET(pfsyncstats)
 /* Exported below as the net.pfsync.carp_demotion_factor sysctl. */
 VNET_DEFINE_STATIC(int, pfsync_carp_adj) = CARP_MAXSKEW;
 #define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
 
 /* Forward declarations: transmit path, multicast and module setup. */
 static void	pfsync_timeout(void *);
 static void	pfsync_push(struct pfsync_bucket *);
 static void	pfsync_push_all(struct pfsync_softc *);
 static void	pfsyncintr(void *);
 static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
 		    struct in_mfilter *imf);
 static void	pfsync_multicast_cleanup(struct pfsync_softc *);
 static void	pfsync_pointers_init(void);
 static void	pfsync_pointers_uninit(void);
 static int	pfsync_init(void);
 static void	pfsync_uninit(void);
 
 /*
  * Number of buckets per softc.  If still zero when the interface is
  * created, pfsync_clone_create() sets it to twice the CPU count.
  */
 static unsigned long pfsync_buckets;
 
 /* net.pfsync sysctl tree. */
 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "PFSYNC");
 SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(pfsyncstats), pfsyncstats,
     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
 /* Read-only at run time; settable as a loader tunable (CTLFLAG_RDTUN). */
 SYSCTL_ULONG(_net_pfsync, OID_AUTO, pfsync_buckets, CTLFLAG_RDTUN,
     &pfsync_buckets, 0, "Number of pfsync hash buckets");
 
 /* Forward declarations: interface cloning and ifnet methods. */
 static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
 static void	pfsync_clone_destroy(struct ifnet *);
 static int	pfsync_alloc_scrub_memory(struct pfsync_state_peer *,
 		    struct pf_state_peer *);
 static int	pfsyncoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
 static int	pfsyncioctl(struct ifnet *, u_long, caddr_t);
 
 /* Deferred packet handling. */
 static int	pfsync_defer(struct pf_kstate *, struct mbuf *);
 static void	pfsync_undefer(struct pfsync_deferral *, int);
 static void	pfsync_undefer_state(struct pf_kstate *, int);
 static void	pfsync_defer_tmo(void *);
 
 /* State update requests. */
 static void	pfsync_request_update(u_int32_t, u_int64_t);
 static bool	pfsync_update_state_req(struct pf_kstate *);
 
 /* Queue flushing and transmission. */
 static void	pfsync_drop(struct pfsync_softc *);
 static void	pfsync_sendout(int, int);
 static void	pfsync_send_plus(void *, size_t);
 
 /* Bulk update machinery. */
 static void	pfsync_bulk_start(void);
 static void	pfsync_bulk_status(u_int8_t);
 static void	pfsync_bulk_update(void *);
 static void	pfsync_bulk_fail(void *);
 
 static void	pfsync_detach_ifnet(struct ifnet *);
 
 /* Conversion of userland configuration requests. */
 static int pfsync_pfsyncreq_to_kstatus(struct pfsyncreq *,
     struct pfsync_kstatus *);
 static int pfsync_kstatus_to_softc(struct pfsync_kstatus *,
     struct pfsync_softc *);
 
 #ifdef IPSEC
 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
 #endif
 static struct pfsync_bucket	*pfsync_get_bucket(struct pfsync_softc *,
 		    struct pf_kstate *);
 
 /* NOTE(review): presumably the bulk-request retry limit -- use not visible here. */
 #define PFSYNC_MAX_BULKTRIES	12
 /* 20/1000 of hz, i.e. presumably 20 ms expressed in ticks. */
 #define PFSYNC_DEFER_TIMEOUT	((20 * hz) / 1000)
 
 VNET_DEFINE(struct if_clone *, pfsync_cloner);
 #define	V_pfsync_cloner	VNET(pfsync_cloner)
 
 /*
  * if_clone create method: allocate and attach the pfsync interface.
  *
  * Only unit 0 is accepted (EINVAL otherwise).  Returns ENOSPC when no
  * ifnet can be allocated and 0 on success, in which case the new softc
  * is published through V_pfsyncif.
  *
  * All softc state, including the bucket array, is fully initialised
  * before if_attach()/bpfattach() make the interface reachable, so that
  * nothing that finds the ifnet can observe a NULL or half-built
  * sc_buckets.
  */
 static int
 pfsync_clone_create(struct if_clone *ifc, int unit, caddr_t param)
 {
 	struct pfsync_softc *sc;
 	struct ifnet *ifp;
 	struct pfsync_bucket *b;
 	int c, q;
 
 	if (unit != 0)
 		return (EINVAL);
 
 	/* Default bucket count unless a tunable already supplied one. */
 	if (! pfsync_buckets)
 		pfsync_buckets = mp_ncpus * 2;
 
 	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
 	sc->sc_flags |= PFSYNCF_OK;
 	sc->sc_maxupdates = 128;
 
 	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
 	if (ifp == NULL) {
 		free(sc, M_PFSYNC);
 		return (ENOSPC);
 	}
 	if_initname(ifp, pfsyncname, unit);
 	ifp->if_softc = sc;
 	ifp->if_ioctl = pfsyncioctl;
 	ifp->if_output = pfsyncoutput;
 	ifp->if_type = IFT_PFSYNC;
 	ifp->if_hdrlen = sizeof(struct pfsync_header);
 	ifp->if_mtu = ETHERMTU;
 	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
 	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
 	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
 	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
 
 	/* Set up every bucket before the interface becomes visible. */
 	sc->sc_buckets = mallocarray(pfsync_buckets, sizeof(*sc->sc_buckets),
 	    M_PFSYNC, M_ZERO | M_WAITOK);
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 		mtx_init(&b->b_mtx, "pfsync bucket", NULL, MTX_DEF);
 
 		b->b_id = c;
 		b->b_sc = sc;
 		/* An empty bucket still accounts for the packet headers. */
 		b->b_len = PFSYNC_MINPKT;
 
 		for (q = 0; q < PFSYNC_S_COUNT; q++)
 			TAILQ_INIT(&b->b_qs[q]);
 
 		TAILQ_INIT(&b->b_upd_req_list);
 		TAILQ_INIT(&b->b_deferrals);
 
 		callout_init(&b->b_tmo, 1);
 
 		b->b_snd.ifq_maxlen = ifqmaxlen;
 	}
 
 	if_attach(ifp);
 
 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 
 	V_pfsyncif = sc;
 
 	return (0);
 }
 
 /*
  * if_clone destroy method: tear down the pfsync interface.
  *
  * Releases any remaining deferrals, drains all callouts, undoes a
  * pending CARP demotion, detaches the ifnet, drops queued data and
  * finally frees the softc and clears V_pfsyncif.
  */
 static void
 pfsync_clone_destroy(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
 	struct pfsync_bucket *b;
 	int c;
 
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 		/*
 		 * At this stage, everything should have already been
 		 * cleared by pfsync_uninit(), and we have only to
 		 * drain callouts.
 		 */
 		while (b->b_deferred > 0) {
 			struct pfsync_deferral *pd =
 			    TAILQ_FIRST(&b->b_deferrals);
 
 			TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
 			b->b_deferred--;
 			if (callout_stop(&pd->pd_tmo) > 0) {
 				/* Timeout cancelled: release everything here. */
 				pf_release_state(pd->pd_st);
 				m_freem(pd->pd_m);
 				free(pd, M_PFSYNC);
 			} else {
 				/*
 				 * Timeout could not be cancelled; wait for it.
 				 * NOTE(review): the extra reference presumably
 				 * stops the handler from freeing pd -- confirm
 				 * against pfsync_defer_tmo().
 				 */
 				pd->pd_refs++;
 				callout_drain(&pd->pd_tmo);
 				free(pd, M_PFSYNC);
 			}
 		}
 
 		callout_drain(&b->b_tmo);
 	}
 
 	callout_drain(&sc->sc_bulkfail_tmo);
 	callout_drain(&sc->sc_bulk_tmo);
 
 	/* Undo the CARP demotion that is in effect while PFSYNCF_OK is clear. */
 	if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 		(*carp_demote_adj_p)(-V_pfsync_carp_adj, "pfsync destroy");
 	bpfdetach(ifp);
 	if_detach(ifp);
 
 	/* Discard whatever is still queued in the buckets. */
 	pfsync_drop(sc);
 
 	if_free(ifp);
 	pfsync_multicast_cleanup(sc);
 	mtx_destroy(&sc->sc_mtx);
 	mtx_destroy(&sc->sc_bulk_mtx);
 
 	free(sc->sc_buckets, M_PFSYNC);
 	free(sc, M_PFSYNC);
 
 	V_pfsyncif = NULL;
 }
 
 /*
  * Make sure the local peer "d" has a scrub structure whenever the wire
  * peer "s" carries scrub information.
  *
  * Returns 0 when nothing had to be done or the allocation succeeded,
  * ENOMEM when the zone allocation failed.
  */
 static int
 pfsync_alloc_scrub_memory(struct pfsync_state_peer *s,
     struct pf_state_peer *d)
 {
 	/* No scrub data on the wire, or we already have the memory. */
 	if (!s->scrub.scrub_flag || d->scrub != NULL)
 		return (0);
 
 	d->scrub = uma_zalloc(V_pf_state_scrub_z, M_NOWAIT | M_ZERO);
 
 	return (d->scrub != NULL ? 0 : ENOMEM);
 }
 
 /*
  * Create a local pf state from a wire-format pfsync_state record.
  *
  * sp    - the state as received (fields in network byte order).
  * flags - PFSYNC_SI_IOCTL when the record comes from the ioctl path,
  *         PFSYNC_SI_CKSUM when the sender's ruleset checksum matched.
  *
  * Returns 0 on success (or when the state is silently skipped because
  * its interface is unknown on the network path), EINVAL for a bad
  * creator id or, on the ioctl path, an unknown interface, ENOMEM for
  * every failure routed through the "cleanup" label, and the error of
  * pf_state_insert() when insertion fails.
  *
  * The caller must hold the pf rules read lock.
  */
 static int
 pfsync_state_import(struct pfsync_state *sp, int flags)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 #ifndef	__NO_STRICT_ALIGNMENT
 	struct pfsync_state_key key[2];
 #endif
 	struct pfsync_state_key *kw, *ks;
 	struct pf_kstate	*st = NULL;
 	struct pf_state_key *skw = NULL, *sks = NULL;
 	struct pf_krule *r = NULL;
 	struct pfi_kkif	*kif;
 	int error;
 
 	PF_RULES_RASSERT();
 
 	if (sp->creatorid == 0) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("%s: invalid creator id: %08x\n", __func__,
 			    ntohl(sp->creatorid));
 		return (EINVAL);
 	}
 
 	if ((kif = pfi_kkif_find(sp->ifname)) == NULL) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("%s: unknown interface: %s\n", __func__,
 			    sp->ifname);
 		if (flags & PFSYNC_SI_IOCTL)
 			return (EINVAL);
 		return (0);	/* skip this state */
 	}
 
 	/*
 	 * If the ruleset checksums match or the state is coming from the ioctl,
 	 * it's safe to associate the state with the rule of that number.
 	 */
 	if (sp->rule != htonl(-1) && sp->anchor == htonl(-1) &&
 	    (flags & (PFSYNC_SI_IOCTL | PFSYNC_SI_CKSUM)) && ntohl(sp->rule) <
 	    pf_main_ruleset.rules[PF_RULESET_FILTER].active.rcount)
 		r = pf_main_ruleset.rules[
 		    PF_RULESET_FILTER].active.ptr_array[ntohl(sp->rule)];
 	else
 		r = &V_pf_default_rule;
 
 	/* Honour the rule's state limit; reported as ENOMEM via cleanup. */
 	if ((r->max_states &&
 	    counter_u64_fetch(r->states_cur) >= r->max_states))
 		goto cleanup;
 
 	/*
 	 * XXXGL: consider M_WAITOK in ioctl path after.
 	 */
 	st = pf_alloc_state(M_NOWAIT);
 	if (__predict_false(st == NULL))
 		goto cleanup;
 
 	if ((skw = uma_zalloc(V_pf_state_key_z, M_NOWAIT)) == NULL)
 		goto cleanup;
 
 	/* On strict-alignment machines work on an aligned copy of the keys. */
 #ifndef	__NO_STRICT_ALIGNMENT
 	bcopy(&sp->key, key, sizeof(struct pfsync_state_key) * 2);
 	kw = &key[PF_SK_WIRE];
 	ks = &key[PF_SK_STACK];
 #else
 	kw = &sp->key[PF_SK_WIRE];
 	ks = &sp->key[PF_SK_STACK];
 #endif
 
 	/* A separate stack key is only needed when it differs from the wire key. */
 	if (PF_ANEQ(&kw->addr[0], &ks->addr[0], sp->af) ||
 	    PF_ANEQ(&kw->addr[1], &ks->addr[1], sp->af) ||
 	    kw->port[0] != ks->port[0] ||
 	    kw->port[1] != ks->port[1]) {
 		sks = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 		if (sks == NULL)
 			goto cleanup;
 	} else
 		sks = skw;
 
 	/* allocate memory for scrub info */
 	if (pfsync_alloc_scrub_memory(&sp->src, &st->src) ||
 	    pfsync_alloc_scrub_memory(&sp->dst, &st->dst))
 		goto cleanup;
 
 	/* Copy to state key(s). */
 	skw->addr[0] = kw->addr[0];
 	skw->addr[1] = kw->addr[1];
 	skw->port[0] = kw->port[0];
 	skw->port[1] = kw->port[1];
 	skw->proto = sp->proto;
 	skw->af = sp->af;
 	if (sks != skw) {
 		sks->addr[0] = ks->addr[0];
 		sks->addr[1] = ks->addr[1];
 		sks->port[0] = ks->port[0];
 		sks->port[1] = ks->port[1];
 		sks->proto = sp->proto;
 		sks->af = sp->af;
 	}
 
 	/* copy to state */
 	bcopy(&sp->rt_addr, &st->rt_addr, sizeof(st->rt_addr));
 	st->creation = time_uptime - ntohl(sp->creation);
 	st->expire = time_uptime;
 	if (sp->expire) {
 		uint32_t timeout;
 
 		/* Rule-specific timeout, falling back to the default rule's. */
 		timeout = r->timeout[sp->timeout];
 		if (!timeout)
 			timeout = V_pf_default_rule.timeout[sp->timeout];
 
 		/* sp->expire may have been adaptively scaled by export. */
 		st->expire -= timeout - ntohl(sp->expire);
 	}
 
 	st->direction = sp->direction;
 	st->log = sp->log;
 	st->timeout = sp->timeout;
 	st->state_flags = sp->state_flags;
 
 	st->id = sp->id;
 	st->creatorid = sp->creatorid;
 	pf_state_peer_ntoh(&sp->src, &st->src);
 	pf_state_peer_ntoh(&sp->dst, &st->dst);
 
 	st->rule.ptr = r;
 	st->nat_rule.ptr = NULL;
 	st->anchor.ptr = NULL;
 	st->rt_kif = NULL;
 
 	st->pfsync_time = time_uptime;
 	st->sync_state = PFSYNC_S_NONE;
 
 	/*
 	 * For states learned from the network, set NOSYNC across the
 	 * insertion; it is cleared again below once the state is in.
 	 */
 	if (!(flags & PFSYNC_SI_IOCTL))
 		st->state_flags |= PFSTATE_NOSYNC;
 
 	if ((error = pf_state_insert(kif, kif, skw, sks, st)) != 0)
 		goto cleanup_state;
 
 	/* XXX when we have nat_rule/anchors, use STATE_INC_COUNTERS */
 	counter_u64_add(r->states_cur, 1);
 	counter_u64_add(r->states_tot, 1);
 
 	if (!(flags & PFSYNC_SI_IOCTL)) {
 		st->state_flags &= ~PFSTATE_NOSYNC;
 		/* The sender asked for an acknowledgement: queue and push it. */
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_q_ins(st, PFSYNC_S_IACK, true);
 			pfsync_push_all(sc);
 		}
 	}
 	st->state_flags &= ~PFSTATE_ACK;
 	PF_STATE_UNLOCK(st);
 
 	return (0);
 
 cleanup:
 	error = ENOMEM;
 	/* Avoid freeing the same key twice when wire and stack keys are shared. */
 	if (skw == sks)
 		sks = NULL;
 	if (skw != NULL)
 		uma_zfree(V_pf_state_key_z, skw);
 	if (sks != NULL)
 		uma_zfree(V_pf_state_key_z, sks);
 
 cleanup_state:	/* pf_state_insert() frees the state keys. */
 	if (st) {
 		st->timeout = PFTM_UNLINKED; /* appease an assert */
 		pf_free_state(st);
 	}
 	return (error);
 }
 
 #ifdef INET
 /*
  * IP protocol input routine for pfsync packets.
  *
  * Validates the receiving interface, TTL, header length, protocol
  * version and total length, then walks the subheaders and dispatches
  * each one to its handler in pfsync_acts[].  A handler returns the
  * number of bytes it consumed, or -1 when processing must stop; in the
  * -1 case the mbuf is not freed here (pfsync_in_eof()/pfsync_in_error()
  * free it themselves; the m_pulldown() failure paths presumably rely on
  * m_pulldown() having freed the chain).
  *
  * The mbuf is always consumed: *mp is set to NULL and IPPROTO_DONE is
  * returned on every path.
  */
 static int
 pfsync_input(struct mbuf **mp, int *offp __unused, int proto __unused)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct mbuf *m = *mp;
 	struct ip *ip = mtod(m, struct ip *);
 	struct pfsync_header *ph;
 	struct pfsync_subheader subh;
 
 	int offset, len, flags = 0;
 	int rv;
 	uint16_t count;
 
 	PF_RULES_RLOCK_TRACKER;
 
 	*mp = NULL;
 	V_pfsyncstats.pfsyncs_ipackets++;
 
 	/* Verify that we have a sync interface configured. */
 	if (!sc || !sc->sc_sync_if || !V_pf_status.running ||
 	    (sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		goto done;
 
 	/* verify that the packet came in on the right interface */
 	if (sc->sc_sync_if != m->m_pkthdr.rcvif) {
 		V_pfsyncstats.pfsyncs_badif++;
 		goto done;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	/* verify that the IP TTL is 255. */
 	if (ip->ip_ttl != PFSYNC_DFLTTL) {
 		V_pfsyncstats.pfsyncs_badttl++;
 		goto done;
 	}
 
 	offset = ip->ip_hl << 2;
 	if (m->m_pkthdr.len < offset + sizeof(*ph)) {
 		V_pfsyncstats.pfsyncs_hdrops++;
 		goto done;
 	}
 
 	if (offset + sizeof(*ph) > m->m_len) {
 		/*
 		 * m_pullup() may hand back a different mbuf than it was
 		 * given, and frees the chain when it fails.  Always
 		 * continue with its return value so that neither m nor
 		 * ip refers to a stale mbuf.
 		 */
 		m = m_pullup(m, offset + sizeof(*ph));
 		if (m == NULL) {
 			V_pfsyncstats.pfsyncs_hdrops++;
 			return (IPPROTO_DONE);
 		}
 		ip = mtod(m, struct ip *);
 	}
 	ph = (struct pfsync_header *)((char *)ip + offset);
 
 	/* verify the version */
 	if (ph->version != PFSYNC_VERSION) {
 		V_pfsyncstats.pfsyncs_badver++;
 		goto done;
 	}
 
 	/* ph->len excludes the IP header; make len an absolute end offset. */
 	len = ntohs(ph->len) + offset;
 	if (m->m_pkthdr.len < len) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		goto done;
 	}
 
 	/*
 	 * Trusting pf_chksum during packet processing, as well as seeking
 	 * in interface name tree, require holding PF_RULES_RLOCK().
 	 */
 	PF_RULES_RLOCK();
 	if (!bcmp(&ph->pfcksum, &V_pf_status.pf_chksum, PF_MD5_DIGEST_LENGTH))
 		flags = PFSYNC_SI_CKSUM;
 
 	offset += sizeof(*ph);
 	while (offset <= len - sizeof(subh)) {
 		/* The subheader may be unaligned or split; work on a copy. */
 		m_copydata(m, offset, sizeof(subh), (caddr_t)&subh);
 		offset += sizeof(subh);
 
 		if (subh.action >= PFSYNC_ACT_MAX) {
 			V_pfsyncstats.pfsyncs_badact++;
 			PF_RULES_RUNLOCK();
 			goto done;
 		}
 
 		count = ntohs(subh.count);
 		V_pfsyncstats.pfsyncs_iacts[subh.action] += count;
 		rv = (*pfsync_acts[subh.action])(m, offset, count, flags);
 		if (rv == -1) {
 			/* The handler stopped processing; do not free m here. */
 			PF_RULES_RUNLOCK();
 			return (IPPROTO_DONE);
 		}
 
 		offset += rv;
 	}
 	PF_RULES_RUNLOCK();
 
 done:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 #endif
 
 /*
  * Handle a "clear states" action: for every record, unlink all local
  * states whose creator id matches the record's creator id.
  *
  * A record that names an interface unknown to this host is skipped.
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_clr(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_clr *clr;
 	struct mbuf *mp;
 	int len = sizeof(*clr) * count;
 	int i, offp;
 	u_int32_t creatorid;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	clr = (struct pfsync_clr *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		creatorid = clr[i].creatorid;
 
 		if (clr[i].ifname[0] != '\0' &&
 		    pfi_kkif_find(clr[i].ifname) == NULL)
 			continue;
 
 		/*
 		 * Walk every id-hash row.  The row index has its own name
 		 * so that it does not shadow the record index "i".
 		 */
 		for (int row = 0; row <= pf_hashmask; row++) {
 			struct pf_idhash *ih = &V_pf_idhash[row];
 			struct pf_kstate *s;
 relock:
 			PF_HASHROW_LOCK(ih);
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->creatorid == creatorid) {
 					/* Do not echo this removal to peers. */
 					s->state_flags |= PFSTATE_NOSYNC;
 					/*
 					 * NOTE(review): pf_unlink_state()
 					 * presumably drops the row lock, hence
 					 * the re-lock and rescan of this row.
 					 */
 					pf_unlink_state(s);
 					goto relock;
 				}
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 	}
 
 	return (len);
 }
 
 static int
 pfsync_in_ins(struct mbuf *m, int offset, int count, int flags)
 {
 	struct mbuf *mp;
 	struct pfsync_state *sa, *sp;
 	int len = sizeof(*sp) * count;
 	int i, offp;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		/* Check for invalid values. */
 		if (sp->timeout >= PFTM_MAX ||
 		    sp->src.state > PF_TCPS_PROXY_DST ||
 		    sp->dst.state > PF_TCPS_PROXY_DST ||
 		    sp->direction > PF_OUT ||
 		    (sp->af != AF_INET && sp->af != AF_INET6)) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("%s: invalid value\n", __func__);
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		if (pfsync_state_import(sp, flags) == ENOMEM)
 			/* Drop out, but process the rest of the actions. */
 			break;
 	}
 
 	return (len);
 }
 
 /*
  * Handle an "insert acknowledgement" action: for every acknowledged
  * state that is still flagged PFSTATE_ACK, release its deferred packet
  * through pfsync_undefer_state().
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_iack(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_ins_ack *ia, *iaa;
 	struct pf_kstate *st;
 
 	struct mbuf *mp;
 	int len = count * sizeof(*ia);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	iaa = (struct pfsync_ins_ack *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		ia = &iaa[i];
 
 		/* The state is returned locked when it is found. */
 		st = pf_find_state_byid(ia->id, ia->creatorid);
 		if (st == NULL)
 			continue;
 
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_undefer_state(st, 0);
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 /*
  * Merge the TCP peer information of a received update into a local
  * state.
  *
  * Each direction is examined independently: when the local copy is
  * judged newer than the received one the peer is left untouched and
  * counted, otherwise the received peer data is copied in.
  *
  * Returns the number of peers (0, 1 or 2) for which the local state
  * was newer.  The state must be locked by the caller.
  */
 static int
 pfsync_upd_tcp(struct pf_kstate *st, struct pfsync_state_peer *src,
     struct pfsync_state_peer *dst)
 {
 	int sync = 0;
 
 	PF_STATE_LOCK_ASSERT(st);
 
 	/*
 	 * The state should never go backwards except
 	 * for syn-proxy states.  Neither should the
 	 * sequence window slide backwards.
 	 */
 	if ((st->src.state > src->state &&
 	    (st->src.state < PF_TCPS_PROXY_SRC ||
 	    src->state >= PF_TCPS_PROXY_SRC)) ||
 
 	    (st->src.state == src->state &&
 	    SEQ_GT(st->src.seqlo, ntohl(src->seqlo))))
 		sync++;
 	else
 		pf_state_peer_ntoh(src, &st->src);
 
 	/* Same test for the destination peer, without the syn-proxy case. */
 	if ((st->dst.state > dst->state) ||
 
 	    (st->dst.state >= TCPS_SYN_SENT &&
 	    SEQ_GT(st->dst.seqlo, ntohl(dst->seqlo))))
 		sync++;
 	else
 		pf_state_peer_ntoh(dst, &st->dst);
 
 	return (sync);
 }
 
 /*
  * Handle a "state update" action carrying full state records.
  *
  * Unknown states are imported; known ones are merged with the received
  * peer data.  When the local copy turns out to be newer in at least
  * one direction, the local state is queued for transmission and pushed
  * out so that the peer can catch up.
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_upd(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_state *sa, *sp;
 	struct pf_kstate *st;
 	int sync;
 
 	struct mbuf *mp;
 	int len = count * sizeof(*sp);
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	sa = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		sp = &sa[i];
 
 		/* check for invalid values */
 		if (sp->timeout >= PFTM_MAX ||
 		    sp->src.state > PF_TCPS_PROXY_DST ||
 		    sp->dst.state > PF_TCPS_PROXY_DST) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pfsync_input: PFSYNC_ACT_UPD: "
 				    "invalid value\n");
 			}
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		st = pf_find_state_byid(sp->id, sp->creatorid);
 		if (st == NULL) {
 			/* insert the update */
 			if (pfsync_state_import(sp, flags))
 				V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 
 		/* An update for a deferred state releases the deferral. */
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_undefer_state(st, 1);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
 			sync = pfsync_upd_tcp(st, &sp->src, &sp->dst);
 		else {
 			sync = 0;
 
 			/*
 			 * Non-TCP protocol state machine always go
 			 * forwards
 			 */
 			if (st->src.state > sp->src.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&sp->src, &st->src);
 			if (st->dst.state > sp->dst.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&sp->dst, &st->dst);
 		}
 		/*
 		 * At most one direction was stale: refresh expiry/timeout.
 		 * NOTE(review): this copies the received dst peer again even
 		 * when dst was the direction judged stale above, and the
 		 * result of pfsync_alloc_scrub_memory() is ignored -- confirm
 		 * both are intended.
 		 */
 		if (sync < 2) {
 			pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
 			pf_state_peer_ntoh(&sp->dst, &st->dst);
 			st->expire = time_uptime;
 			st->timeout = sp->timeout;
 		}
 		st->pfsync_time = time_uptime;
 
 		if (sync) {
 			/* Our copy is newer: send it back to the peer. */
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
 			pfsync_push_all(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 /*
  * Handle a "compressed state update" action.
  *
  * Works like pfsync_in_upd() but on the short pfsync_upd_c record,
  * which cannot be imported: a state we do not know is requested from
  * the peer instead.
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_upd_c(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_upd_c *ua, *up;
 	struct pf_kstate *st;
 	int len = count * sizeof(*up);
 	int sync;
 	struct mbuf *mp;
 	int offp, i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	ua = (struct pfsync_upd_c *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++) {
 		up = &ua[i];
 
 		/* check for invalid values */
 		if (up->timeout >= PFTM_MAX ||
 		    up->src.state > PF_TCPS_PROXY_DST ||
 		    up->dst.state > PF_TCPS_PROXY_DST) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pfsync_input: "
 				    "PFSYNC_ACT_UPD_C: "
 				    "invalid value\n");
 			}
 			V_pfsyncstats.pfsyncs_badval++;
 			continue;
 		}
 
 		st = pf_find_state_byid(up->id, up->creatorid);
 		if (st == NULL) {
 			/* We don't have this state. Ask for it. */
 			/* The request is queued under the lock of bucket 0. */
 			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 			pfsync_request_update(up->creatorid, up->id);
 			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 			continue;
 		}
 
 		/* An update for a deferred state releases the deferral. */
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_undefer_state(st, 1);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
 			sync = pfsync_upd_tcp(st, &up->src, &up->dst);
 		else {
 			sync = 0;
 
 			/*
 			 * Non-TCP protocol state machine always go
 			 * forwards
 			 */
 			if (st->src.state > up->src.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&up->src, &st->src);
 			if (st->dst.state > up->dst.state)
 				sync++;
 			else
 				pf_state_peer_ntoh(&up->dst, &st->dst);
 		}
 		/*
 		 * At most one direction was stale: refresh expiry/timeout.
 		 * NOTE(review): this copies the received dst peer again even
 		 * when dst was the direction judged stale above, and the
 		 * result of pfsync_alloc_scrub_memory() is ignored -- confirm
 		 * both are intended.
 		 */
 		if (sync < 2) {
 			pfsync_alloc_scrub_memory(&up->dst, &st->dst);
 			pf_state_peer_ntoh(&up->dst, &st->dst);
 			st->expire = time_uptime;
 			st->timeout = up->timeout;
 		}
 		st->pfsync_time = time_uptime;
 
 		if (sync) {
 			/* Our copy is newer: send it back to the peer. */
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
 			pfsync_push_all(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
 	}
 
 	return (len);
 }
 
 /*
  * Handle an "update request" action.
  *
  * A request with both id and creator id zero asks for a bulk update;
  * any other request names one state, which is queued for transmission
  * unless it is unknown (counted in pfsyncs_badstate) or flagged
  * PFSTATE_NOSYNC.
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_ureq(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_upd_req *requests, *req;
 	struct pf_kstate *state;
 	struct mbuf *mp;
 	int len = count * sizeof(*req);
 	int idx, offp;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	requests = (struct pfsync_upd_req *)(mp->m_data + offp);
 
 	for (idx = 0; idx < count; idx++) {
 		req = &requests[idx];
 
 		/* The all-zero request is the bulk update trigger. */
 		if (req->id == 0 && req->creatorid == 0) {
 			pfsync_bulk_start();
 			continue;
 		}
 
 		state = pf_find_state_byid(req->id, req->creatorid);
 		if (state == NULL) {
 			V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
 
 		/* States excluded from syncing are never sent. */
 		if (!(state->state_flags & PFSTATE_NOSYNC))
 			pfsync_update_state_req(state);
 		PF_STATE_UNLOCK(state);
 	}
 
 	return (len);
 }
 
 /*
  * Handle a "state delete" action carrying full state records: unlink
  * every named state that exists locally; unknown states are counted in
  * pfsyncs_badstate.
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_del(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_state *records, *rec;
 	struct pf_kstate *state;
 	struct mbuf *mp;
 	int len = count * sizeof(*rec);
 	int idx, offp;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	records = (struct pfsync_state *)(mp->m_data + offp);
 
 	for (idx = 0; idx < count; idx++) {
 		rec = &records[idx];
 
 		state = pf_find_state_byid(rec->id, rec->creatorid);
 		if (state != NULL) {
 			/* The peer already knows; do not sync the removal. */
 			state->state_flags |= PFSTATE_NOSYNC;
 			pf_unlink_state(state);
 		} else {
 			V_pfsyncstats.pfsyncs_badstate++;
 		}
 	}
 
 	return (len);
 }
 
 /*
  * Handle a "compressed state delete" action: like pfsync_in_del() but
  * on the short pfsync_del_c record holding only id and creator id.
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_del_c(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_del_c *records, *rec;
 	struct pf_kstate *state;
 	struct mbuf *mp;
 	int len = count * sizeof(*rec);
 	int idx, offp;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	records = (struct pfsync_del_c *)(mp->m_data + offp);
 
 	for (idx = 0; idx < count; idx++) {
 		rec = &records[idx];
 
 		state = pf_find_state_byid(rec->id, rec->creatorid);
 		if (state != NULL) {
 			/* The peer already knows; do not sync the removal. */
 			state->state_flags |= PFSTATE_NOSYNC;
 			pf_unlink_state(state);
 		} else {
 			V_pfsyncstats.pfsyncs_badstate++;
 		}
 	}
 
 	return (len);
 }
 
 /*
  * Handle a "bulk update status" action.
  *
  * Ignored unless we have an outstanding bulk update request.  A START
  * status (re)arms the bulk failure timeout; an END status with an
  * acceptable timestamp completes the bulk update, cancels the timeout,
  * undoes the CARP demotion if one is in effect and sets PFSYNCF_OK.
  *
  * Only the first record of the payload is examined.  Returns the number
  * of payload bytes consumed, or -1 if the payload could not be made
  * contiguous.
  */
 static int
 pfsync_in_bus(struct mbuf *m, int offset, int count, int flags)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bus *bus;
 	struct mbuf *mp;
 	int len = count * sizeof(*bus);
 	int offp;
 
 	PFSYNC_BLOCK(sc);
 
 	/* If we're not waiting for a bulk update, who cares. */
 	if (sc->sc_ureq_sent == 0) {
 		PFSYNC_BUNLOCK(sc);
 		return (len);
 	}
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		PFSYNC_BUNLOCK(sc);
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	bus = (struct pfsync_bus *)(mp->m_data + offp);
 
 	switch (bus->status) {
 	case PFSYNC_BUS_START:
 		/*
 		 * Timeout: 4 seconds plus one tick per packet needed to
 		 * carry the state limit's worth of states at our MTU.
 		 * NOTE(review): the divisor is zero when if_mtu leaves room
 		 * for less than one pfsync_state -- confirm the MTU checks
 		 * elsewhere rule that out.
 		 */
 		callout_reset(&sc->sc_bulkfail_tmo, 4 * hz +
 		    V_pf_limits[PF_LIMIT_STATES].limit /
 		    ((sc->sc_ifp->if_mtu - PFSYNC_MINPKT) /
 		    sizeof(struct pfsync_state)),
 		    pfsync_bulk_fail, sc);
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: received bulk update start\n");
 		break;
 
 	case PFSYNC_BUS_END:
 		if (time_uptime - ntohl(bus->endtime) >=
 		    sc->sc_ureq_sent) {
 			/* that's it, we're happy */
 			sc->sc_ureq_sent = 0;
 			sc->sc_bulk_tries = 0;
 			callout_stop(&sc->sc_bulkfail_tmo);
 			if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 				(*carp_demote_adj_p)(-V_pfsync_carp_adj,
 				    "pfsync bulk done");
 			sc->sc_flags |= PFSYNCF_OK;
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received valid "
 				    "bulk update end\n");
 		} else {
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf("pfsync: received invalid "
 				    "bulk update end: bad timestamp\n");
 		}
 		break;
 	}
 	PFSYNC_BUNLOCK(sc);
 
 	return (len);
 }
 
 /*
  * Handle a "TDB update" action.
  *
  * With IPSEC compiled in, every record is passed to
  * pfsync_update_net_tdb(); otherwise the payload is simply skipped.
  *
  * Returns the number of payload bytes consumed, or -1 if the payload
  * could not be made contiguous.
  */
 static int
 pfsync_in_tdb(struct mbuf *m, int offset, int count, int flags)
 {
 	int len = count * sizeof(struct pfsync_tdb);
 
 #if defined(IPSEC)
 	struct pfsync_tdb *tp;
 	struct mbuf *mp;
 	int offp;
 	int i;
 
 	mp = m_pulldown(m, offset, len, &offp);
 	if (mp == NULL) {
 		V_pfsyncstats.pfsyncs_badlen++;
 		return (-1);
 	}
 	tp = (struct pfsync_tdb *)(mp->m_data + offp);
 
 	for (i = 0; i < count; i++)
 		pfsync_update_net_tdb(&tp[i]);
 #endif
 
 	return (len);
 }
 
 #if defined(IPSEC)
 /*
  * Update an in-kernel tdb. Silently fail if no tdb is found.
  *
  * The replay and byte counters of the record are converted to host
  * byte order in place and copied into the matching tdb, provided
  * neither of them would go backwards.  Invalid records are counted in
  * pfsyncs_badstate.
  */
 static void
 pfsync_update_net_tdb(struct pfsync_tdb *pt)
 {
 	struct tdb		*tdb;
 
 	/* check for invalid values */
 	if (ntohl(pt->spi) <= SPI_RESERVED_MAX ||
 	    (pt->dst.sa.sa_family != AF_INET &&
 	    pt->dst.sa.sa_family != AF_INET6))
 		goto bad;
 
 	tdb = gettdb(pt->spi, &pt->dst, pt->sproto);
 	if (tdb) {
 		pt->rpl = ntohl(pt->rpl);
 		pt->cur_bytes = (unsigned long long)be64toh(pt->cur_bytes);
 
 		/* Neither replay nor byte counter should ever decrease. */
 		if (pt->rpl < tdb->tdb_rpl ||
 		    pt->cur_bytes < tdb->tdb_cur_bytes) {
 			goto bad;
 		}
 
 		tdb->tdb_rpl = pt->rpl;
 		tdb->tdb_cur_bytes = pt->cur_bytes;
 	}
 	return;
 
 bad:
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync_insert: PFSYNC_ACT_TDB_UPD: "
 		    "invalid value\n");
 	V_pfsyncstats.pfsyncs_badstate++;
 	return;
 }
 #endif
 
 static int
 pfsync_in_eof(struct mbuf *m, int offset, int count, int flags)
 {
 	/* check if we are at the right place in the packet */
 	if (offset != m->m_pkthdr.len)
 		V_pfsyncstats.pfsyncs_badlen++;
 
 	/* we're done. free and let the caller return */
 	m_freem(m);
 	return (-1);
 }
 
 /*
  * Handler for actions that are not supported: count the packet in
  * pfsyncs_badact, free it and return -1 to stop further processing.
  */
 static int
 pfsync_in_error(struct mbuf *m, int offset, int count, int flags)
 {
 	/* Free first; the counter update does not depend on the mbuf. */
 	m_freem(m);
 	V_pfsyncstats.pfsyncs_badact++;
 
 	return (-1);
 }
 
 /*
  * if_output method of the pfsync interface.  Nothing may be sent
  * through the interface this way: the packet is discarded and success
  * is reported.
  */
 static int
 pfsyncoutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *rt)
 {
 	/* Silently drop whatever was handed to us. */
 	m_freem(m);
 
 	return (0);
 }
 
 /*
  * if_ioctl method of the pfsync interface.
  *
  * SIOCSIFFLAGS      - mirror IFF_UP into IFF_DRV_RUNNING and install or
  *                     remove the pfsync function pointers.
  * SIOCSIFMTU        - change the MTU; pending data is flushed first when
  *                     the MTU shrinks.  Requires a sync interface.
  * SIOCGETPFSYNC     - copy the configuration out as struct pfsyncreq.
  * SIOCGETPFSYNCNV   - copy the configuration out as a packed nvlist;
  *                     EFBIG (with the required length stored in the
  *                     request) when the user buffer is too small.
  * SIOCSETPFSYNC     - set the configuration from struct pfsyncreq.
  * SIOCSETPFSYNCNV   - set the configuration from a packed nvlist.
  *
  * Both set requests require PRIV_NETINET_PF.  Returns 0 or an errno
  * value; unknown commands yield ENOTTY.
  */
 static int
 pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct pfsyncreq pfsyncr;
 	size_t nvbuflen;
 	int error;
 	int c;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
 		PFSYNC_LOCK(sc);
 		if (ifp->if_flags & IFF_UP) {
 			ifp->if_drv_flags |= IFF_DRV_RUNNING;
 			PFSYNC_UNLOCK(sc);
 			pfsync_pointers_init();
 		} else {
 			ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 			PFSYNC_UNLOCK(sc);
 			pfsync_pointers_uninit();
 		}
 		break;
 	case SIOCSIFMTU:
 		if (!sc->sc_sync_if ||
 		    ifr->ifr_mtu <= PFSYNC_MINPKT ||
 		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
 			return (EINVAL);
 		if (ifr->ifr_mtu < ifp->if_mtu) {
 			/* Flush packets that were sized for the old MTU. */
 			for (c = 0; c < pfsync_buckets; c++) {
 				PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
 				if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT)
 					pfsync_sendout(1, c);
 				PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
 			}
 		}
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
 	case SIOCGETPFSYNC:
 		bzero(&pfsyncr, sizeof(pfsyncr));
 		PFSYNC_LOCK(sc);
 		if (sc->sc_sync_if) {
 			strlcpy(pfsyncr.pfsyncr_syncdev,
 			    sc->sc_sync_if->if_xname, IFNAMSIZ);
 		}
 		pfsyncr.pfsyncr_syncpeer = ((struct sockaddr_in *)&sc->sc_sync_peer)->sin_addr;
 		pfsyncr.pfsyncr_maxupdates = sc->sc_maxupdates;
 		pfsyncr.pfsyncr_defer = sc->sc_flags;
 		PFSYNC_UNLOCK(sc);
 		return (copyout(&pfsyncr, ifr_data_get_ptr(ifr),
 		    sizeof(pfsyncr)));
 
 	case SIOCGETPFSYNCNV:
 	    {
 		nvlist_t *nvl_syncpeer;
 		nvlist_t *nvl = nvlist_create(0);
 		void *packed;
 
 		if (nvl == NULL)
 			return (ENOMEM);
 
 		if (sc->sc_sync_if)
 			nvlist_add_string(nvl, "syncdev", sc->sc_sync_if->if_xname);
 		nvlist_add_number(nvl, "maxupdates", sc->sc_maxupdates);
 		nvlist_add_number(nvl, "flags", sc->sc_flags);
 		nvl_syncpeer =
 		    pfsync_sockaddr_to_syncpeer_nvlist(&sc->sc_sync_peer);
 		if (nvl_syncpeer != NULL) {
 			/*
 			 * nvlist_add_nvlist() stores a copy, so our own
 			 * list can be released right away; that way no
 			 * later return path can leak it.
 			 */
 			nvlist_add_nvlist(nvl, "syncpeer", nvl_syncpeer);
 			nvlist_destroy(nvl_syncpeer);
 		}
 
 		/* The packed buffer is independent of nvl from here on. */
 		packed = nvlist_pack(nvl, &nvbuflen);
 		nvlist_destroy(nvl);
 		if (packed == NULL)
 			return (ENOMEM);
 
 		if (nvbuflen > ifr->ifr_cap_nv.buf_length) {
 			/* Tell the caller how much room is needed. */
 			ifr->ifr_cap_nv.length = nvbuflen;
 			ifr->ifr_cap_nv.buffer = NULL;
 			free(packed, M_NVLIST);
 			return (EFBIG);
 		}
 
 		ifr->ifr_cap_nv.length = nvbuflen;
 		error = copyout(packed, ifr->ifr_cap_nv.buffer, nvbuflen);
 		free(packed, M_NVLIST);
 
 		/* Report a failed copyout instead of claiming success. */
 		return (error);
 	    }
 
 	case SIOCSETPFSYNC:
 	    {
 		struct pfsync_kstatus status;
 
 		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
 			return (error);
 		if ((error = copyin(ifr_data_get_ptr(ifr), &pfsyncr,
 		    sizeof(pfsyncr))))
 			return (error);
 
 		memset((char *)&status, 0, sizeof(struct pfsync_kstatus));
 		pfsync_pfsyncreq_to_kstatus(&pfsyncr, &status);
 
 		error = pfsync_kstatus_to_softc(&status, sc);
 		return (error);
 	    }
 	case SIOCSETPFSYNCNV:
 	    {
 		struct pfsync_kstatus status;
 		void *nvbuf;	/* kernel copy of the packed nvlist */
 		nvlist_t *nvl;
 
 		if ((error = priv_check(curthread, PRIV_NETINET_PF)) != 0)
 			return (error);
 		if (ifr->ifr_cap_nv.length > IFR_CAP_NV_MAXBUFSIZE)
 			return (EINVAL);
 
 		nvbuf = malloc(ifr->ifr_cap_nv.length, M_TEMP, M_WAITOK);
 
 		if ((error = copyin(ifr->ifr_cap_nv.buffer, nvbuf,
 		    ifr->ifr_cap_nv.length)) != 0) {
 			free(nvbuf, M_TEMP);
 			return (error);
 		}
 
 		if ((nvl = nvlist_unpack(nvbuf, ifr->ifr_cap_nv.length, 0)) == NULL) {
 			free(nvbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		memset((char *)&status, 0, sizeof(struct pfsync_kstatus));
 		pfsync_nvstatus_to_kstatus(nvl, &status);
 
 		nvlist_destroy(nvl);
 		free(nvbuf, M_TEMP);
 
 		error = pfsync_kstatus_to_softc(&status, sc);
 		return (error);
 	    }
 	default:
 		return (ENOTTY);
 	}
 
 	return (0);
 }
 
 /*
  * Queue write method: serialise a full state record for "st" into the
  * packet buffer at "buf".
  */
 static void
 pfsync_out_state(struct pf_kstate *st, void *buf)
 {
 	pfsync_state_export((struct pfsync_state *)buf, st);
 }
 
 /*
  * Queue write method: emit an insert acknowledgement for "st", made of
  * the state's id and creator id, into the packet buffer at "buf".
  */
 static void
 pfsync_out_iack(struct pf_kstate *st, void *buf)
 {
 	struct pfsync_ins_ack *ack = (struct pfsync_ins_ack *)buf;
 
 	ack->creatorid = st->creatorid;
 	ack->id = st->id;
 }
 
 /*
  * Queue write method: emit a compressed update for "st" into the packet
  * buffer at "buf".  The record is zeroed first so that fields not set
  * here go out as zero.
  */
 static void
 pfsync_out_upd_c(struct pf_kstate *st, void *buf)
 {
 	struct pfsync_upd_c *rec = (struct pfsync_upd_c *)buf;
 
 	bzero(rec, sizeof(*rec));
 
 	/* Identification. */
 	rec->id = st->id;
 	rec->creatorid = st->creatorid;
 
 	/* Connection tracking data. */
 	rec->timeout = st->timeout;
 	pf_state_peer_hton(&st->src, &rec->src);
 	pf_state_peer_hton(&st->dst, &rec->dst);
 }
 
 /*
  * Queue write method: emit a compressed delete for "st" into the packet
  * buffer at "buf" and flag the state PFSTATE_NOSYNC.
  */
 static void
 pfsync_out_del(struct pf_kstate *st, void *buf)
 {
 	struct pfsync_del_c *rec = (struct pfsync_del_c *)buf;
 
 	/* The deletion is on its way; nothing more to sync for this state. */
 	st->state_flags |= PFSTATE_NOSYNC;
 
 	rec->creatorid = st->creatorid;
 	rec->id = st->id;
 }
 
 /*
  * Throw away everything queued for transmission in all buckets of "sc".
  *
  * Queued states are marked PFSYNC_S_NONE and their references released,
  * pending update requests are freed, and each bucket is reset to the
  * empty packet length with no "plus" data attached.
  */
 static void
 pfsync_drop(struct pfsync_softc *sc)
 {
 	struct pfsync_upd_req_item *req;
 	struct pf_kstate *state, *state_tmp;
 	struct pfsync_bucket *bucket;
 	int bi, qi;
 
 	for (bi = 0; bi < pfsync_buckets; bi++) {
 		bucket = &sc->sc_buckets[bi];
 
 		/* Empty every state queue of this bucket. */
 		for (qi = 0; qi < PFSYNC_S_COUNT; qi++) {
 			if (!TAILQ_EMPTY(&bucket->b_qs[qi])) {
 				TAILQ_FOREACH_SAFE(state, &bucket->b_qs[qi],
 				    sync_list, state_tmp) {
 					KASSERT(state->sync_state == qi,
 					    ("%s: st->sync_state == q",
 					    __func__));
 					state->sync_state = PFSYNC_S_NONE;
 					pf_release_state(state);
 				}
 				TAILQ_INIT(&bucket->b_qs[qi]);
 			}
 		}
 
 		/* Free the pending update requests. */
 		while (!TAILQ_EMPTY(&bucket->b_upd_req_list)) {
 			req = TAILQ_FIRST(&bucket->b_upd_req_list);
 			TAILQ_REMOVE(&bucket->b_upd_req_list, req, ur_entry);
 			free(req, M_PFSYNC);
 		}
 
 		bucket->b_len = PFSYNC_MINPKT;
 		bucket->b_plus = NULL;
 	}
 }
 
 /*
  * Build one pfsync packet out of everything queued on bucket 'c' and put it
  * on that bucket's send queue (b_snd).
  *
  * Packet layout, in the order it is written below: IP header copied from
  * sc_template, pfsync header, one subheader plus records for every
  * non-empty state queue, an optional UPD_REQ section, an optional
  * caller-supplied "plus" region, and a closing EOF subheader.  States are
  * taken off their queue and their reference is released as they are written.
  *
  * schedswi: when non-zero the pfsync software interrupt is scheduled after
  *           the packet has been queued.
  * c:        index into sc->sc_buckets of the bucket to flush.
  *
  * The bucket lock must be held by the caller (asserted below).
  *
  * NOTE(review): 'sc' is dereferenced in the initializers of 'ifp' and 'b'
  * before KASSERT(sc != NULL) runs, so the assertion cannot catch a NULL
  * softc.
  * NOTE(review): the mbuf-allocation-failure and unsupported-family returns
  * leave the queues, b_len and b_plus untouched.
  */
 static void
 pfsync_sendout(int schedswi, int c)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct ifnet *ifp = sc->sc_ifp;
 	struct mbuf *m;
 	struct pfsync_header *ph;
 	struct pfsync_subheader *subh;
 	struct pf_kstate *st, *st_next;
 	struct pfsync_upd_req_item *ur;
 	struct pfsync_bucket *b = &sc->sc_buckets[c];
 	int aflen, offset;
 	int q, count = 0;
 
 	KASSERT(sc != NULL, ("%s: null sc", __func__));
 	KASSERT(b->b_len > PFSYNC_MINPKT,
 	    ("%s: sc_len %zu", __func__, b->b_len));
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	/* Nobody is listening (no bpf tap, no sync interface): drop it all. */
 	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
 		pfsync_drop(sc);
 		return;
 	}
 
 	/* One mbuf large enough for the link header plus the whole packet. */
 	m = m_get2(max_linkhdr + b->b_len, M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		V_pfsyncstats.pfsyncs_onomem++;
 		return;
 	}
 	/* Leave room in front of the data for the link-layer header. */
 	m->m_data += max_linkhdr;
 	m->m_len = m->m_pkthdr.len = b->b_len;
 
 	/* build the ip header */
 	switch (sc->sc_sync_peer.ss_family) {
 #ifdef INET
 	case AF_INET:
 	    {
 		struct ip *ip;
 
 		ip = mtod(m, struct ip *);
 		bcopy(&sc->sc_template.ipv4, ip, sizeof(*ip));
 		/* aflen: size of the network header; offset: write cursor. */
 		aflen = offset = sizeof(*ip);
 
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip_fillid(ip);
 		break;
 	    }
 #endif
 	default:
 		/* Peer address family we cannot build a header for. */
 		m_freem(m);
 		return;
 	}
 
 
 	/* build the pfsync header */
 	ph = (struct pfsync_header *)(m->m_data + offset);
 	bzero(ph, sizeof(*ph));
 	offset += sizeof(*ph);
 
 	ph->version = PFSYNC_VERSION;
 	/* The pfsync length excludes the network header. */
 	ph->len = htons(b->b_len - aflen);
 	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
 	/* walk the queues */
 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
 		if (TAILQ_EMPTY(&b->b_qs[q]))
 			continue;
 
 		/* Reserve the subheader; it is filled in once count is known. */
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
 		TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, st_next) {
 			KASSERT(st->sync_state == q,
 				("%s: st->sync_state == q",
 					__func__));
 			/*
 			 * XXXGL: some of write methods do unlocked reads
 			 * of state data :(
 			 */
 			pfsync_qs[q].write(st, m->m_data + offset);
 			offset += pfsync_qs[q].len;
 			st->sync_state = PFSYNC_S_NONE;
 			pf_release_state(st);
 			count++;
 		}
 		/* Every entry was consumed above; reset the queue head. */
 		TAILQ_INIT(&b->b_qs[q]);
 
 		bzero(subh, sizeof(*subh));
 		subh->action = pfsync_qs[q].action;
 		subh->count = htons(count);
 		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
 	}
 
 	/* Pending update requests get their own UPD_REQ section. */
 	if (!TAILQ_EMPTY(&b->b_upd_req_list)) {
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
 		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
 			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
 
 			bcopy(&ur->ur_msg, m->m_data + offset,
 			    sizeof(ur->ur_msg));
 			offset += sizeof(ur->ur_msg);
 			free(ur, M_PFSYNC);
 			count++;
 		}
 
 		bzero(subh, sizeof(*subh));
 		subh->action = PFSYNC_ACT_UPD_REQ;
 		subh->count = htons(count);
 		V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_UPD_REQ] += count;
 	}
 
 	/* has someone built a custom region for us to add? */
 	if (b->b_plus != NULL) {
 		bcopy(b->b_plus, m->m_data + offset, b->b_pluslen);
 		offset += b->b_pluslen;
 
 		b->b_plus = NULL;
 	}
 
 	/* Terminate the packet with an EOF subheader. */
 	subh = (struct pfsync_subheader *)(m->m_data + offset);
 	offset += sizeof(*subh);
 
 	bzero(subh, sizeof(*subh));
 	subh->action = PFSYNC_ACT_EOF;
 	subh->count = htons(1);
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_EOF]++;
 
 	/* we're done, let's put it on the wire */
 	if (ifp->if_bpf) {
 		/* Temporarily hide the network header from the bpf tap. */
 		m->m_data += aflen;
 		m->m_len = m->m_pkthdr.len = b->b_len - aflen;
 		BPF_MTAP(ifp, m);
 		m->m_data -= aflen;
 		m->m_len = m->m_pkthdr.len = b->b_len;
 	}
 
 	/* Only bpf wanted the packet: nothing to transmit. */
 	if (sc->sc_sync_if == NULL) {
 		b->b_len = PFSYNC_MINPKT;
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
 	b->b_len = PFSYNC_MINPKT;
 
 	/* Queue for pfsyncintr(); count a drop if the send queue is full. */
 	if (!_IF_QFULL(&b->b_snd))
 		_IF_ENQUEUE(&b->b_snd, m);
 	else {
 		m_freem(m);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
 	}
 	if (schedswi)
 		swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 /*
  * Queue a newly created state 'st' on its bucket's PFSYNC_S_INS queue.
  *
  * States already flagged PFSTATE_NOSYNC are ignored.  States created by a
  * rule carrying PFRULE_NOSYNC, or whose wire-side protocol is
  * IPPROTO_PFSYNC, are flagged PFSTATE_NOSYNC here and never queued.
  */
 static void
 pfsync_insert_state(struct pf_kstate *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (st->state_flags & PFSTATE_NOSYNC)
 		return;
 
 	if ((st->rule.ptr->rule_flag & PFRULE_NOSYNC) ||
 	    st->key[PF_SK_WIRE]->proto == IPPROTO_PFSYNC) {
 		st->state_flags |= PFSTATE_NOSYNC;
 		return;
 	}
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 
 	PFSYNC_BUCKET_LOCK(b);
 	/* First item going into an empty packet: arm the one second timer. */
 	if (b->b_len == PFSYNC_MINPKT)
 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	/* 'true': the queue takes its own reference on the state. */
 	pfsync_q_ins(st, PFSYNC_S_INS, true);
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	st->sync_updates = 0;
 }
 
 /*
  * Try to hold back packet 'm', which belongs to state 'st', until either
  * the deferral is released through pfsync_undefer() or the
  * PFSYNC_DEFER_TIMEOUT callout (pfsync_defer_tmo()) fires.
  *
  * Returns 1 when the mbuf has been taken over by a deferral record, and 0
  * when the packet is not deferred and stays with the caller (broadcast or
  * multicast packet, no pfsync softc, interface not running, deferral
  * disabled, or no memory for the deferral record).
  */
 static int
 pfsync_defer(struct pf_kstate *st, struct mbuf *m)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
 	struct pfsync_bucket *b;
 
 	if (m->m_flags & (M_BCAST|M_MCAST))
 		return (0);
 
 	if (sc == NULL)
 		return (0);
 
 	b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_LOCK(sc);
 
 	if (!(sc->sc_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
 	    !(sc->sc_flags & PFSYNCF_DEFER)) {
 		PFSYNC_UNLOCK(sc);
 		return (0);
 	}
 
 	/* Take the bucket lock before letting go of the softc lock. */
 	PFSYNC_BUCKET_LOCK(b);
 	PFSYNC_UNLOCK(sc);
 
 	if (b->b_deferred >= 128) {
 		/*
 		 * Too many packets are being held back: release the one at
 		 * the head of the list.  Its timeout has to be cancelled
 		 * first because pfsync_undefer() frees the deferral record
 		 * together with the callout embedded in it.  If the callout
 		 * could not be stopped, its handler is already on its way
 		 * and disposes of the deferral itself.
 		 */
 		pd = TAILQ_FIRST(&b->b_deferrals);
 		if (pd != NULL && callout_stop(&pd->pd_tmo) > 0)
 			pfsync_undefer(pd, 0);
 	}
 
 	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
 	if (pd == NULL) {
 		PFSYNC_BUCKET_UNLOCK(b);
 		return (0);
 	}
 	b->b_deferred++;
 
 	m->m_flags |= M_SKIP_FIREWALL;
 	st->state_flags |= PFSTATE_ACK;
 
 	/* The deferral keeps its own reference on the state. */
 	pd->pd_sc = sc;
 	pd->pd_refs = 0;
 	pd->pd_st = st;
 	pf_ref_state(st);
 	pd->pd_m = m;
 
 	TAILQ_INSERT_TAIL(&b->b_deferrals, pd, pd_entry);
 	/*
 	 * The timeout handler is entered with the bucket mutex held and,
 	 * because of CALLOUT_RETURNUNLOCKED, has to release it itself.
 	 */
 	callout_init_mtx(&pd->pd_tmo, &b->b_mtx, CALLOUT_RETURNUNLOCKED);
 	callout_reset(&pd->pd_tmo, PFSYNC_DEFER_TIMEOUT, pfsync_defer_tmo, pd);
 
 	pfsync_push(b);
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	return (1);
 }
 
 /*
  * Release deferral 'pd': unlink it from its bucket, clear PFSTATE_ACK on
  * the state, free the deferral record and drop its state reference.
  *
  * drop: when non-zero the held packet is freed; otherwise it is put on the
  *       bucket's send queue and a push is requested.
  *
  * The bucket lock must be held (asserted below).  This function does not
  * touch pd->pd_tmo itself.
  */
 static void
 pfsync_undefer(struct pfsync_deferral *pd, int drop)
 {
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_kstate *st = pd->pd_st;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
 	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	/* 'm' and 'st' were saved above, so 'pd' may be freed now. */
 	free(pd, M_PFSYNC);
 	pf_release_state(st);
 
 	if (drop)
 		m_freem(m);
 	else {
 		_IF_ENQUEUE(&b->b_snd, m);
 		pfsync_push(b);
 	}
 }
 
 /*
  * Callout handler for an expired deferral ('arg' is the struct
  * pfsync_deferral): stop holding the packet back and send it.
  *
  * The callout is initialised in pfsync_defer() on the bucket mutex with
  * CALLOUT_RETURNUNLOCKED, so this function is entered with the bucket lock
  * held and has to release it on every path before returning.
  *
  * The deferral is always unlinked and its state reference always dropped,
  * even when there is no sync interface any more; in that case the held
  * packet is freed instead of being transmitted.
  */
 static void
 pfsync_defer_tmo(void *arg)
 {
 	struct epoch_tracker et;
 	struct pfsync_deferral *pd = arg;
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_kstate *st = pd->pd_st;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	/*
 	 * Use the pfsync interface's vnet: unlike sc_sync_if it cannot be
 	 * NULL here.
 	 */
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
 	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	if (pd->pd_refs == 0)
 		free(pd, M_PFSYNC);
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	if (sc->sc_sync_if == NULL) {
 		/* Nowhere to sync to any more: give up on the packet. */
 		pf_release_state(st);
 		m_freem(m);
 		CURVNET_RESTORE();
 		return;
 	}
 
 	NET_EPOCH_ENTER(et);
 
 	switch (sc->sc_sync_peer.ss_family) {
 #ifdef INET
 	case AF_INET:
 		ip_output(m, NULL, NULL, 0, NULL, NULL);
 		break;
 #endif
 	default:
 		/* No output routine for this family: do not leak the mbuf. */
 		m_freem(m);
 		break;
 	}
 
 	pf_release_state(st);
 
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /*
  * Find the deferral that holds a packet for state 'st' and release it
  * through pfsync_undefer(), passing 'drop' along.
  *
  * The bucket lock is taken and released here.  Panics when no deferral for
  * 'st' is on the bucket's list.
  *
  * NOTE(review): because the lock is acquired here, a caller that already
  * holds the same bucket lock would acquire it a second time.
  */
 static void
 pfsync_undefer_state(struct pf_kstate *st, int drop)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK(b);
 
 	TAILQ_FOREACH(pd, &b->b_deferrals, pd_entry) {
 		 if (pd->pd_st == st) {
 			/*
 			 * Only undefer when the timeout was really cancelled;
 			 * otherwise leave the deferral alone.
 			 */
 			if (callout_stop(&pd->pd_tmo) > 0)
 				pfsync_undefer(pd, drop);
 
 			PFSYNC_BUCKET_UNLOCK(b);
 			return;
 		}
 	}
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	panic("%s: unable to find deferred state", __func__);
 }
 
 /*
  * Map state 'st' to its bucket in 'sc': the state's id hash modulo the
  * number of buckets selects the slot.
  */
 static struct pfsync_bucket*
 pfsync_get_bucket(struct pfsync_softc *sc, struct pf_kstate *st)
 {
 	int slot;
 
 	slot = PF_IDHASH(st) % pfsync_buckets;
 
 	return (sc->sc_buckets + slot);
 }
 
 static void
 pfsync_update_state(struct pf_kstate *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool sync = false, ref = true;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
 	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 0);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st, true, b);
 		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
 	if (b->b_len == PFSYNC_MINPKT)
 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_INS:
 		/* we're already handling it */
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP) {
 			st->sync_updates++;
 			if (st->sync_updates >= sc->sc_maxupdates)
 				sync = true;
 		}
 		break;
 
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD_C, ref);
 		st->sync_updates = 0;
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	if (sync || (time_uptime - st->pfsync_time) < 2)
 		pfsync_push(b);
 
 	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 /*
  * Queue an update request for the state identified by (creatorid, id) on
  * bucket 0 and request a push.
  *
  * The caller must hold the lock of bucket 0 (asserted below).  Nothing is
  * queued when an identical request is already pending or when the request
  * item cannot be allocated.
  */
 static void
 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = &sc->sc_buckets[0];
 	struct pfsync_upd_req_item *item;
 	size_t nlen = sizeof(struct pfsync_upd_req);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	/*
 	 * This code does a bit to prevent multiple update requests for the
 	 * same state being generated. It searches current subheader queue,
 	 * but it doesn't lookup into queue of already packed datagrams.
 	 */
 	TAILQ_FOREACH(item, &b->b_upd_req_list, ur_entry)
 		if (item->ur_msg.id == id &&
 		    item->ur_msg.creatorid == creatorid)
 			return;
 
 	item = malloc(sizeof(*item), M_PFSYNC, M_NOWAIT);
 	if (item == NULL)
 		return; /* XXX stats */
 
 	item->ur_msg.id = id;
 	item->ur_msg.creatorid = creatorid;
 
 	/* The first request in a packet also pays for the subheader. */
 	if (TAILQ_EMPTY(&b->b_upd_req_list))
 		nlen += sizeof(struct pfsync_subheader);
 
 	/* Would not fit: flush bucket 0 and start a fresh packet. */
 	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
 		pfsync_sendout(0, 0);
 
 		nlen = sizeof(struct pfsync_subheader) +
 		    sizeof(struct pfsync_upd_req);
 	}
 
 	TAILQ_INSERT_TAIL(&b->b_upd_req_list, item, ur_entry);
 	b->b_len += nlen;
 
 	pfsync_push(b);
 }
 
 /*
  * Make sure state 'st' is queued on its bucket's PFSYNC_S_UPD queue.
  *
  * Returns true when, after queueing, the room left in the bucket's packet
  * (interface MTU minus b_len) is smaller than a struct pfsync_state;
  * false otherwise, and always false for PFSTATE_NOSYNC states.
  *
  * The state lock must be held by the caller (asserted below); the bucket
  * lock is taken here.
  */
 static bool
 pfsync_update_state_req(struct pf_kstate *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool ref = true, full = false;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
 	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		/* Not to be synced: just make sure it is on no queue. */
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st, true, b);
 		PFSYNC_BUCKET_UNLOCK(b);
 		return (full);
 	}
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_IACK:
 		/* Move to the UPD queue, reusing the queue's reference. */
 		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD, ref);
 		pfsync_push(b);
 		break;
 
 	case PFSYNC_S_INS:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_DEL:
 		/* we're already handling it */
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	if ((sc->sc_ifp->if_mtu - b->b_len) < sizeof(struct pfsync_state))
 		full = true;
 
 	PFSYNC_BUCKET_UNLOCK(b);
 
 	return (full);
 }
 
 static void
 pfsync_delete_state(struct pf_kstate *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 	bool ref = true;
 
 	PFSYNC_BUCKET_LOCK(b);
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 1);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
 			pfsync_q_del(st, true, b);
 		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
 	if (b->b_len == PFSYNC_MINPKT)
 		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_INS:
 		/* We never got to tell the world so just forget about it. */
 		pfsync_q_del(st, true, b);
 		break;
 
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_IACK:
 		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_DEL, ref);
 		break;
 
 	default:
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
 	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 /*
  * Send a PFSYNC_ACT_CLR message carrying 'creatorid' and 'ifname' through
  * pfsync_send_plus().
  */
 static void
 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
 {
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_clr clr;
 	} __packed msg;
 
 	bzero(&msg, sizeof(msg));
 
 	/* Payload: who is clearing, and on which interface. */
 	msg.clr.creatorid = creatorid;
 	strlcpy(msg.clr.ifname, ifname, sizeof(msg.clr.ifname));
 
 	/* Subheader announcing exactly one CLR record. */
 	msg.subh.count = htons(1);
 	msg.subh.action = PFSYNC_ACT_CLR;
 
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_CLR]++;
 	pfsync_send_plus(&msg, sizeof(msg));
 }
 
 /*
  * Append state 'st' to queue 'q' of its bucket and account for the space
  * its record takes in the pending packet.
  *
  * ref: when true a reference on the state is taken for the queue.
  *
  * The bucket lock must be held, and the state must not be on any queue
  * (both asserted below).  When the record would not fit within the
  * interface MTU the bucket is flushed with pfsync_sendout() first.
  */
 static void
 pfsync_q_ins(struct pf_kstate *st, int q, bool ref)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	size_t nlen = pfsync_qs[q].len;
 	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 	KASSERT(b->b_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
 	    b->b_len));
 
 	/* The first record of a queue also pays for the subheader. */
 	if (TAILQ_EMPTY(&b->b_qs[q]))
 		nlen += sizeof(struct pfsync_subheader);
 
 	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
 		pfsync_sendout(1, b->b_id);
 
 		/* The queue is empty again, so a subheader is needed. */
 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
 	}
 
 	b->b_len += nlen;
 	TAILQ_INSERT_TAIL(&b->b_qs[q], st, sync_list);
 	st->sync_state = q;
 	if (ref)
 		pf_ref_state(st);
 }
 
 /*
  * Take state 'st' off the queue of bucket 'b' it currently sits on and give
  * back the packet space that was accounted for it.
  *
  * unref: when true the reference held for the queue is released.
  *
  * The bucket lock must be held and the state must be on a queue (both
  * asserted below).
  */
 static void
 pfsync_q_del(struct pf_kstate *st, bool unref, struct pfsync_bucket *b)
 {
 	int qid;
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 	KASSERT(st->sync_state != PFSYNC_S_NONE,
 		("%s: st->sync_state != PFSYNC_S_NONE", __func__));
 
 	qid = st->sync_state;
 
 	TAILQ_REMOVE(&b->b_qs[qid], st, sync_list);
 	st->sync_state = PFSYNC_S_NONE;
 
 	/* Return the record's space, and the subheader's with the last one. */
 	b->b_len -= pfsync_qs[qid].len;
 	if (TAILQ_EMPTY(&b->b_qs[qid]))
 		b->b_len -= sizeof(struct pfsync_subheader);
 
 	if (unref)
 		pf_release_state(st);
 }
 
 /*
  * Begin a bulk update: record when the request was received, rewind the
  * bulk cursor, send a PFSYNC_BUS_START status message and schedule
  * pfsync_bulk_update() on the bulk callout.
  */
 static void
 pfsync_bulk_start(void)
 {
 	struct pfsync_softc *softc;
 
 	softc = V_pfsyncif;
 
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync: received bulk update request\n");
 
 	PFSYNC_BLOCK(softc);
 
 	/* Rewind the cursor used by pfsync_bulk_update(). */
 	softc->sc_bulk_stateid = 0;
 	softc->sc_bulk_hashid = 0;
 
 	/* Must be set before the status message, which reports against it. */
 	softc->sc_ureq_received = time_uptime;
 	pfsync_bulk_status(PFSYNC_BUS_START);
 
 	callout_reset(&softc->sc_bulk_tmo, 1, pfsync_bulk_update, softc);
 	PFSYNC_BUNLOCK(softc);
 }
 
 /*
  * Callout handler that performs one step of a bulk update ('arg' is the
  * pfsync softc).
  *
  * Walks the state id hash starting at the saved cursor and queues every
  * state that is on no sync queue, has a timeout below PFTM_MAX and whose
  * pfsync_time is not later than the time the bulk request was received.
  * As soon as pfsync_update_state_req() reports a full packet, the cursor
  * is saved and the callout rescheduled; when the whole hash has been
  * walked a PFSYNC_BUS_END status message is sent.
  *
  * The bulk lock must be held (asserted below).
  */
 static void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct pf_kstate *s;
 	int i;
 
 	PFSYNC_BLOCK_ASSERT(sc);
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	/*
 	 * Start with last state from previous invocation.
 	 * It may had gone, in this case start from the
 	 * hash slot.
 	 */
 	s = pf_find_state_byid(sc->sc_bulk_stateid, sc->sc_bulk_creatorid);
 
 	if (s != NULL)
 		i = PF_IDHASH(s);
 	else
 		i = sc->sc_bulk_hashid;
 
 	for (; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 
 		/*
 		 * A state found by the lookup above is expected to come with
 		 * its hash row already locked (asserted); otherwise lock the
 		 * row and start at its head.
 		 */
 		if (s != NULL)
 			PF_HASHROW_ASSERT(ih);
 		else {
 			PF_HASHROW_LOCK(ih);
 			s = LIST_FIRST(&ih->states);
 		}
 
 		for (; s; s = LIST_NEXT(s, entry)) {
 			if (s->sync_state == PFSYNC_S_NONE &&
 			    s->timeout < PFTM_MAX &&
 			    s->pfsync_time <= sc->sc_ureq_received) {
 				if (pfsync_update_state_req(s)) {
 					/* We've filled a packet. */
 					sc->sc_bulk_hashid = i;
 					sc->sc_bulk_stateid = s->id;
 					sc->sc_bulk_creatorid = s->creatorid;
 					PF_HASHROW_UNLOCK(ih);
 					callout_reset(&sc->sc_bulk_tmo, 1,
 					    pfsync_bulk_update, sc);
 					goto full;
 				}
 			}
 		}
 		/* 's' is NULL here, so the next row gets locked above. */
 		PF_HASHROW_UNLOCK(ih);
 	}
 
 	/* We're done. */
 	pfsync_bulk_status(PFSYNC_BUS_END);
 full:
 	CURVNET_RESTORE();
 }
 
 /*
  * Send a PFSYNC_ACT_BUS message with the given 'status' through
  * pfsync_send_plus().  The message carries the local host id and the time
  * elapsed since the bulk request was received.
  */
 static void
 pfsync_bulk_status(u_int8_t status)
 {
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_bus bus;
 	} __packed msg;
 	struct pfsync_softc *softc;
 
 	softc = V_pfsyncif;
 	bzero(&msg, sizeof(msg));
 
 	/* Payload. */
 	msg.bus.status = status;
 	msg.bus.creatorid = V_pf_status.hostid;
 	msg.bus.endtime = htonl(time_uptime - softc->sc_ureq_received);
 
 	/* Subheader announcing exactly one BUS record. */
 	msg.subh.count = htons(1);
 	msg.subh.action = PFSYNC_ACT_BUS;
 
 	V_pfsyncstats.pfsyncs_oacts[PFSYNC_ACT_BUS]++;
 	pfsync_send_plus(&msg, sizeof(msg));
 }
 
 /*
  * Callout handler that runs when a requested bulk update did not complete
  * in time ('arg' is the pfsync softc).
  *
  * While fewer than PFSYNC_MAX_BULKTRIES attempts were made, the timer is
  * re-armed for five seconds and the request is sent again.  After that the
  * transfer is treated as if it had succeeded: the counters are reset,
  * PFSYNCF_OK is set and, if it was not set before, the carp demotion is
  * undone.
  *
  * The bulk lock must be held (asserted below).
  */
 static void
 pfsync_bulk_fail(void *arg)
 {
 	struct pfsync_softc *sc = arg;
 	struct pfsync_bucket *b = &sc->sc_buckets[0];
 
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	PFSYNC_BLOCK_ASSERT(sc);
 
 	if (sc->sc_bulk_tries++ < PFSYNC_MAX_BULKTRIES) {
 		/* Try again */
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz,
 		    pfsync_bulk_fail, V_pfsyncif);
 		/* pfsync_request_update() works on bucket 0 and needs its lock. */
 		PFSYNC_BUCKET_LOCK(b);
 		pfsync_request_update(0, 0);
 		PFSYNC_BUCKET_UNLOCK(b);
 	} else {
 		/* Pretend like the transfer was ok. */
 		sc->sc_ureq_sent = 0;
 		sc->sc_bulk_tries = 0;
 		PFSYNC_LOCK(sc);
 		if (!(sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 			(*carp_demote_adj_p)(-V_pfsync_carp_adj,
 			    "pfsync bulk fail");
 		sc->sc_flags |= PFSYNCF_OK;
 		PFSYNC_UNLOCK(sc);
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: failed to receive bulk update\n");
 	}
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Send the caller-built region 'plus' of 'pluslen' bytes in a packet of its
  * own, through bucket 0.
  *
  * If the region does not fit in the packet currently being assembled, that
  * packet is flushed first.  The region is only borrowed for the duration of
  * this call: 'plus' may live on the caller's stack, so the bucket must not
  * keep pointing at it once this function returns.
  */
 static void
 pfsync_send_plus(void *plus, size_t pluslen)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_bucket *b = &sc->sc_buckets[0];
 
 	PFSYNC_BUCKET_LOCK(b);
 
 	if (b->b_len + pluslen > sc->sc_ifp->if_mtu)
 		pfsync_sendout(1, b->b_id);
 
 	b->b_plus = plus;
 	b->b_len += (b->b_pluslen = pluslen);
 
 	pfsync_sendout(1, b->b_id);
 
 	/*
 	 * pfsync_sendout() clears b_plus once it has copied the region.  If
 	 * it is still set, the send was abandoned before that point (for
 	 * example because no mbuf could be allocated): detach the region
 	 * and undo its length accounting, so that a later send does not
 	 * copy from memory the caller has long since reused.
 	 */
 	if (b->b_plus != NULL) {
 		b->b_plus = NULL;
 		b->b_len -= b->b_pluslen;
 	}
 	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 /*
  * Callout handler for a bucket's b_tmo timer ('arg' is the bucket):
  * request a push of the bucket, in the vnet of its pfsync interface.
  */
 static void
 pfsync_timeout(void *arg)
 {
 	struct pfsync_bucket *bucket = arg;
 
 	CURVNET_SET(bucket->b_sc->sc_ifp->if_vnet);
 
 	PFSYNC_BUCKET_LOCK(bucket);
 	pfsync_push(bucket);
 	PFSYNC_BUCKET_UNLOCK(bucket);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Mark bucket 'b' as wanting to be sent and schedule the pfsync software
  * interrupt, which does the actual work (see pfsyncintr()).
  *
  * The bucket lock must be held (asserted below).
  */
 static void
 pfsync_push(struct pfsync_bucket *b)
 {
 
 	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	b->b_flags |= PFSYNCF_BUCKET_PUSH;
 	swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 /*
  * Request a push of every bucket of 'sc', taking each bucket's lock in
  * turn.
  */
 static void
 pfsync_push_all(struct pfsync_softc *sc)
 {
 	struct pfsync_bucket *bucket;
 	int idx;
 
 	idx = 0;
 	while (idx < pfsync_buckets) {
 		bucket = &sc->sc_buckets[idx];
 
 		PFSYNC_BUCKET_LOCK(bucket);
 		pfsync_push(bucket);
 		PFSYNC_BUCKET_UNLOCK(bucket);
 
 		idx++;
 	}
 }
 
 static void
 pfsyncintr(void *arg)
 {
 	struct epoch_tracker et;
 	struct pfsync_softc *sc = arg;
 	struct pfsync_bucket *b;
 	struct mbuf *m, *n;
 	int c, error;
 
 	NET_EPOCH_ENTER(et);
 	CURVNET_SET(sc->sc_ifp->if_vnet);
 
 	for (c = 0; c < pfsync_buckets; c++) {
 		b = &sc->sc_buckets[c];
 
 		PFSYNC_BUCKET_LOCK(b);
 		if ((b->b_flags & PFSYNCF_BUCKET_PUSH) && b->b_len > PFSYNC_MINPKT) {
 			pfsync_sendout(0, b->b_id);
 			b->b_flags &= ~PFSYNCF_BUCKET_PUSH;
 		}
 		_IF_DEQUEUE_ALL(&b->b_snd, m);
 		PFSYNC_BUCKET_UNLOCK(b);
 
 		for (; m != NULL; m = n) {
 			n = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 
 			/*
 			 * We distinguish between a deferral packet and our
 			 * own pfsync packet based on M_SKIP_FIREWALL
 			 * flag. This is XXX.
 			 */
 			switch (sc->sc_sync_peer.ss_family) {
 #ifdef INET
 			case AF_INET:
 				if (m->m_flags & M_SKIP_FIREWALL) {
 					error = ip_output(m, NULL, NULL, 0,
 					    NULL, NULL);
 				} else {
 					error = ip_output(m, NULL, NULL,
 					    IP_RAWOUTPUT, &sc->sc_imo, NULL);
 				}
 				break;
 #endif
 			}
 
 			if (error == 0)
 				V_pfsyncstats.pfsyncs_opackets++;
 			else
 				V_pfsyncstats.pfsyncs_oerrors++;
 		}
 	}
 	CURVNET_RESTORE();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Join the sync peer's multicast group on 'ifp' and set up the softc's
  * multicast options (sc_imo) for sending on that interface.
  *
  * imf: multicast filter supplied by the caller; it is inserted into
  *      sc_imo once the group has been joined.
  *
  * Returns 0 on success, EADDRNOTAVAIL when 'ifp' is not multicast capable,
  * or the error from in_joingroup().  On error 'imf' has not been inserted.
  */
 static int
 pfsync_multicast_setup(struct pfsync_softc *sc, struct ifnet *ifp,
     struct in_mfilter *imf)
 {
 	struct ip_moptions *imo = &sc->sc_imo;
 	int error;
 
 	if (!(ifp->if_flags & IFF_MULTICAST))
 		return (EADDRNOTAVAIL);
 
 	switch (sc->sc_sync_peer.ss_family) {
 #ifdef INET
 	case AF_INET:
 	    {
 		ip_mfilter_init(&imo->imo_head);
 		imo->imo_multicast_vif = -1;
 		if ((error = in_joingroup(ifp, &((struct sockaddr_in *)&sc->sc_sync_peer)->sin_addr, NULL,
 		    &imf->imf_inm)) != 0)
 			return (error);
 
 		ip_mfilter_insert(&imo->imo_head, imf);
 		imo->imo_multicast_ifp = ifp;
 		imo->imo_multicast_ttl = PFSYNC_DFLTTL;
 		/* imo_multicast_loop is explicitly zeroed. */
 		imo->imo_multicast_loop = 0;
 		break;
 	    }
 #endif
 	}
 
 	return (0);
 }
 
 /*
  * Undo pfsync_multicast_setup(): for every multicast filter in the softc's
  * multicast options, unlink it, leave its group and free it; then forget
  * the multicast interface.
  */
 static void
 pfsync_multicast_cleanup(struct pfsync_softc *sc)
 {
 	struct ip_moptions *mopts;
 	struct in_mfilter *filt;
 
 	mopts = &sc->sc_imo;
 
 	for (;;) {
 		filt = ip_mfilter_first(&mopts->imo_head);
 		if (filt == NULL)
 			break;
 
 		ip_mfilter_remove(&mopts->imo_head, filt);
 		in_leavegroup(filt->imf_inm, NULL);
 		ip_mfilter_free(filt);
 	}
 
 	mopts->imo_multicast_ifp = NULL;
 }
 
 /*
  * Hook called when interface 'ifp' is being detached (installed as
  * pfsync_detach_ifnet_ptr in pfsync_init()).  If 'ifp' is the current sync
  * interface, stop using it.  Does nothing when there is no pfsync softc.
  */
 void
 pfsync_detach_ifnet(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 
 	if (sc == NULL)
 		return;
 
 	PFSYNC_LOCK(sc);
 
 	if (sc->sc_sync_if == ifp) {
 		/* We don't need multicast cleanup here, because the interface
 		 * is going away. We do need to ensure we don't try to do
 		 * cleanup later.
 		 */
 		ip_mfilter_init(&sc->sc_imo.imo_head);
 		sc->sc_imo.imo_multicast_ifp = NULL;
 		sc->sc_sync_if = NULL;
 	}
 
 	PFSYNC_UNLOCK(sc);
 }
 
 /*
  * Convert the pfsyncreq 'pfsyncr' into the kernel status structure
  * 'status'.  The sync peer becomes an AF_INET sockaddr when an address is
  * given and stays all-zero otherwise.  Always returns 0.
  */
 static int
 pfsync_pfsyncreq_to_kstatus(struct pfsyncreq *pfsyncr, struct pfsync_kstatus *status)
 {
 	struct sockaddr_in *peer;
 
 	status->maxupdates = pfsyncr->pfsyncr_maxupdates;
 	status->flags = pfsyncr->pfsyncr_defer;
 	strlcpy(status->syncdev, pfsyncr->pfsyncr_syncdev, IFNAMSIZ);
 
 	/* Build the peer address in place, starting from all zeroes. */
 	memset(&status->syncpeer, 0, sizeof(status->syncpeer));
 	if (pfsyncr->pfsyncr_syncpeer.s_addr != 0) {
 		peer = (struct sockaddr_in *)&status->syncpeer;
 		peer->sin_len = sizeof(*peer);
 		peer->sin_family = AF_INET;
 		peer->sin_addr.s_addr = pfsyncr->pfsyncr_syncpeer.s_addr;
 	}
 
 	return (0);
 }
 
 /*
  * Apply the configuration in 'status' to the softc 'sc': sync peer,
  * maxupdates, deferral flag and sync interface.  When a sync interface is
  * configured, multicast membership and the IP header template are set up
  * again and a bulk update is requested from the peer.
  *
  * Returns 0 on success, EINVAL for an out-of-range maxupdates or an unknown
  * sync device name, or the error from pfsync_multicast_setup().
  */
 static int
 pfsync_kstatus_to_softc(struct pfsync_kstatus *status, struct pfsync_softc *sc)
 {
 	struct in_mfilter *imf = NULL;
 	struct ifnet *sifp;
 	struct ip *ip;
 	int error;
 	int c;
 
 	if ((status->maxupdates < 0) || (status->maxupdates > 255))
 		return (EINVAL);
 
 	/* An empty device name means "no sync interface". */
 	if (status->syncdev[0] == '\0')
 		sifp = NULL;
 	else if ((sifp = ifunit_ref(status->syncdev)) == NULL)
 		return (EINVAL);
 
 	/*
 	 * A multicast filter is needed when the peer is unset or is the
 	 * pfsync group.  It is allocated with M_WAITOK before the softc
 	 * lock is taken.
 	 */
 	struct sockaddr_in *status_sin =
 	    (struct sockaddr_in *)&(status->syncpeer);
 	if (sifp != NULL && (status_sin->sin_addr.s_addr == 0 ||
 				status_sin->sin_addr.s_addr ==
 				    htonl(INADDR_PFSYNC_GROUP)))
 		imf = ip_mfilter_alloc(M_WAITOK, 0, 0);
 
 	PFSYNC_LOCK(sc);
 	struct sockaddr_in *sc_sin = (struct sockaddr_in *)&sc->sc_sync_peer;
 	sc_sin->sin_family = AF_INET;
 	sc_sin->sin_len = sizeof(*sc_sin);
 	/* An unset peer defaults to the pfsync multicast group. */
 	if (status_sin->sin_addr.s_addr == 0) {
 		sc_sin->sin_addr.s_addr = htonl(INADDR_PFSYNC_GROUP);
 	} else {
 		sc_sin->sin_addr.s_addr = status_sin->sin_addr.s_addr;
 	}
 
 	sc->sc_maxupdates = status->maxupdates;
 	if (status->flags & PFSYNCF_DEFER) {
 		sc->sc_flags |= PFSYNCF_DEFER;
 		V_pfsync_defer_ptr = pfsync_defer;
 	} else {
 		sc->sc_flags &= ~PFSYNCF_DEFER;
 		V_pfsync_defer_ptr = NULL;
 	}
 
 	/* No sync interface: drop the old one and its multicast state. */
 	if (sifp == NULL) {
 		if (sc->sc_sync_if)
 			if_rele(sc->sc_sync_if);
 		sc->sc_sync_if = NULL;
 		pfsync_multicast_cleanup(sc);
 		PFSYNC_UNLOCK(sc);
 		return (0);
 	}
 
 	/*
 	 * Flush buckets with pending data when the new interface has a
 	 * smaller MTU than the pfsync interface, the previous sync
 	 * interface, or MCLBYTES - sizeof(struct ip).
 	 */
 	for (c = 0; c < pfsync_buckets; c++) {
 		PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
 		if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT &&
 		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
 			(sc->sc_sync_if != NULL &&
 			    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
 			sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
 			pfsync_sendout(1, c);
 		PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
 	}
 
 	pfsync_multicast_cleanup(sc);
 
 	if (sc_sin->sin_addr.s_addr == htonl(INADDR_PFSYNC_GROUP)) {
 		error = pfsync_multicast_setup(sc, sifp, imf);
 		if (error) {
 			/* Give back the new interface reference and filter. */
 			if_rele(sifp);
 			ip_mfilter_free(imf);
 			PFSYNC_UNLOCK(sc);
 			return (error);
 		}
 	}
 	/* Swap in the new sync interface, releasing the previous one. */
 	if (sc->sc_sync_if)
 		if_rele(sc->sc_sync_if);
 	sc->sc_sync_if = sifp;
 
 	/* Template IP header that pfsync_sendout() copies into each packet. */
 	ip = &sc->sc_template.ipv4;
 	bzero(ip, sizeof(*ip));
 	ip->ip_v = IPVERSION;
 	ip->ip_hl = sizeof(sc->sc_template.ipv4) >> 2;
 	ip->ip_tos = IPTOS_LOWDELAY;
 	/* len and id are set later. */
 	ip->ip_off = htons(IP_DF);
 	ip->ip_ttl = PFSYNC_DFLTTL;
 	ip->ip_p = IPPROTO_PFSYNC;
 	ip->ip_src.s_addr = INADDR_ANY;
 	ip->ip_dst.s_addr = sc_sin->sin_addr.s_addr;
 
 	/* Request a full state table update. */
 	if ((sc->sc_flags & PFSYNCF_OK) && carp_demote_adj_p)
 		(*carp_demote_adj_p)(V_pfsync_carp_adj,
 		    "pfsync bulk start");
 	sc->sc_flags &= ~PFSYNCF_OK;
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("pfsync: requesting bulk update\n");
 	PFSYNC_UNLOCK(sc);
 	/* pfsync_request_update() works on bucket 0 and needs its lock. */
 	PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 	pfsync_request_update(0, 0);
 	PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 	/* Arm the five second bulk-failure timer under the bulk lock. */
 	PFSYNC_BLOCK(sc);
 	sc->sc_ureq_sent = time_uptime;
 	callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail, sc);
 	PFSYNC_BUNLOCK(sc);
 	return (0);
 }
 
 /*
  * Install the pfsync entry points into the per-vnet hook pointers, under
  * the pf rules write lock.
  */
 static void
 pfsync_pointers_init(void)
 {
 
 	PF_RULES_WLOCK();
 
 	/* State life-cycle hooks. */
 	V_pfsync_insert_state_ptr = pfsync_insert_state;
 	V_pfsync_update_state_ptr = pfsync_update_state;
 	V_pfsync_delete_state_ptr = pfsync_delete_state;
 	V_pfsync_clear_states_ptr = pfsync_clear_states;
 
 	/* Import and deferral hooks. */
 	V_pfsync_state_import_ptr = pfsync_state_import;
 	V_pfsync_defer_ptr = pfsync_defer;
 
 	PF_RULES_WUNLOCK();
 }
 
 /*
  * Clear every hook pointer installed by pfsync_pointers_init(), under the
  * pf rules write lock.
  */
 static void
 pfsync_pointers_uninit(void)
 {
 
 	PF_RULES_WLOCK();
 
 	/* State life-cycle hooks. */
 	V_pfsync_insert_state_ptr = NULL;
 	V_pfsync_update_state_ptr = NULL;
 	V_pfsync_delete_state_ptr = NULL;
 	V_pfsync_clear_states_ptr = NULL;
 
 	/* Import and deferral hooks. */
 	V_pfsync_state_import_ptr = NULL;
 	V_pfsync_defer_ptr = NULL;
 
 	PF_RULES_WUNLOCK();
 }
 
 /*
  * Per-vnet initialisation: register the pfsync interface cloner, add the
  * pfsync software interrupt handler and install the hook pointers.
  *
  * NOTE(review): when swi_add() fails the cloner is detached and a message
  * is logged, but the hook pointers are still installed afterwards.
  * NOTE(review): the handler argument passed to swi_add() is the value
  * V_pfsyncif has at the time of this call - confirm this is what
  * pfsyncintr() expects.
  */
 static void
 vnet_pfsync_init(const void *unused __unused)
 {
 	int error;
 
 	V_pfsync_cloner = if_clone_simple(pfsyncname,
 	    pfsync_clone_create, pfsync_clone_destroy, 1);
 	error = swi_add(&V_pfsync_swi_ie, pfsyncname, pfsyncintr, V_pfsyncif,
 	    SWI_NET, INTR_MPSAFE, &V_pfsync_swi_cookie);
 	if (error) {
 		if_clone_detach(V_pfsync_cloner);
 		log(LOG_INFO, "swi_add() failed in %s\n", __func__);
 	}
 
 	pfsync_pointers_init();
 }
 /* Run vnet_pfsync_init() for each vnet at SI_SUB_PROTO_FIREWALL. */
 VNET_SYSINIT(vnet_pfsync_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY,
     vnet_pfsync_init, NULL);
 
 /*
  * Per-vnet teardown, undoing vnet_pfsync_init(): clear the hook pointers,
  * detach the interface cloner, then remove the software interrupt handler
  * and destroy its interrupt event.  Both removals are expected to succeed
  * (checked with MPASS).
  */
 static void
 vnet_pfsync_uninit(const void *unused __unused)
 {
 	int ret __diagused;
 
 	/* Clear the hooks first, before the rest is torn down. */
 	pfsync_pointers_uninit();
 
 	if_clone_detach(V_pfsync_cloner);
 	ret = swi_remove(V_pfsync_swi_cookie);
 	MPASS(ret == 0);
 	ret = intr_event_destroy(V_pfsync_swi_ie);
 	MPASS(ret == 0);
 }
 
 /* Run vnet_pfsync_uninit() for each vnet at SI_SUB_PROTO_FIREWALL. */
 VNET_SYSUNINIT(vnet_pfsync_uninit, SI_SUB_PROTO_FIREWALL, SI_ORDER_FOURTH,
     vnet_pfsync_uninit, NULL);
 
 static int
 pfsync_init(void)
 {
 #ifdef INET
 	int error;
 
 	pfsync_detach_ifnet_ptr = pfsync_detach_ifnet;
 
 	error = ipproto_register(IPPROTO_PFSYNC, pfsync_input, NULL);
 	if (error)
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * Module unload, undoing pfsync_init(): clear the interface-detach hook and
  * unregister the pfsync protocol input handler.
  */
 static void
 pfsync_uninit(void)
 {
 	pfsync_detach_ifnet_ptr = NULL;
 
 #ifdef INET
 	ipproto_unregister(IPPROTO_PFSYNC);
 #endif
 }
 
 /*
  * Module event handler.  MOD_LOAD runs pfsync_init() and returns its
  * result, MOD_UNLOAD runs pfsync_uninit() and returns 0, and every other
  * event type is rejected with EINVAL.
  */
 static int
 pfsync_modevent(module_t mod, int type, void *data)
 {
 
 	switch (type) {
 	case MOD_LOAD:
 		return (pfsync_init());
 	case MOD_UNLOAD:
 		pfsync_uninit();
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 /* Module description: name, event handler, and an unused third field. */
 static moduledata_t pfsync_mod = {
 	pfsyncname,
 	pfsync_modevent,
 	0
 };
 
 /* Version number this module advertises through MODULE_VERSION below. */
 #define PFSYNC_MODVER 1
 
 /* Stay on FIREWALL as we depend on pf being initialized and on inetdomain. */
 DECLARE_MODULE(pfsync, pfsync_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
 MODULE_VERSION(pfsync, PFSYNC_MODVER);
 /* pfsync depends on the pf module at exactly PF_MODVER. */
 MODULE_DEPEND(pfsync, pf, PF_MODVER, PF_MODVER, PF_MODVER);
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index ca08023ea05d..0ccf1fcfc693 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -1,7961 +1,7962 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2001 Daniel Hartmeier
  * Copyright (c) 2002 - 2008 Henning Brauer
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  *    - Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *    - Redistributions in binary form must reproduce the above
  *      copyright notice, this list of conditions and the following
  *      disclaimer in the documentation and/or other materials provided
  *      with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Effort sponsored in part by the Defense Advanced Research Projects
  * Agency (DARPA) and Air Force Research Laboratory, Air Force
  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
  *
  *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pf.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/gsb_crc32.h>
 #include <sys/hash.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/mbuf.h>
 #include <sys/md5.h>
 #include <sys/random.h>
 #include <sys/refcount.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/vnet.h>
 
 #include <net/pfil.h>
 #include <net/pfvar.h>
 #include <net/if_pflog.h>
 #include <net/if_pfsync.h>
 
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 #include <netinet/ip.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 /* dummynet */
 #include <netinet/ip_dummynet.h>
 #include <netinet/ip_fw.h>
 #include <netpfil/ipfw/dn_heap.h>
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/ip_dn_private.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #endif /* INET6 */
 
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <machine/in_cksum.h>
 #include <security/mac/mac_framework.h>
 
 /*
  * Debug printf gated on V_pf_status.debug: prints when the configured level
  * is at least n.  x must be a fully parenthesized printf argument list,
  * e.g. DPFPRINTF(lvl, ("fmt %d\n", v)).
  * NOTE(review): the expansion is a bare "if" statement, so using it as the
  * body of an if/else without braces would capture the else -- confirm no
  * caller relies on that before changing its shape.
  */
 #define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x
 
 SDT_PROVIDER_DEFINE(pf);
 SDT_PROBE_DEFINE4(pf, ip, test, done, "int", "int", "struct pf_krule *",
     "struct pf_kstate *");
 SDT_PROBE_DEFINE4(pf, ip, test6, done, "int", "int", "struct pf_krule *",
     "struct pf_kstate *");
 SDT_PROBE_DEFINE5(pf, ip, state, lookup, "struct pfi_kkif *",
     "struct pf_state_key_cmp *", "int", "struct pf_pdesc *",
     "struct pf_kstate *");
 
 SDT_PROBE_DEFINE3(pf, eth, test_rule, entry, "int", "struct ifnet *",
     "struct mbuf *");
 SDT_PROBE_DEFINE2(pf, eth, test_rule, test, "int", "struct pf_keth_rule *");
 SDT_PROBE_DEFINE3(pf, eth, test_rule, mismatch,
     "int", "struct pf_keth_rule *", "char *");
 SDT_PROBE_DEFINE2(pf, eth, test_rule, match, "int", "struct pf_keth_rule *");
 SDT_PROBE_DEFINE2(pf, eth, test_rule, final_match,
     "int", "struct pf_keth_rule *");
 
 /*
  * Global variables
  */
 
 /* state tables */
 VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[4]);
 VNET_DEFINE(struct pf_kpalist,		 pf_pabuf);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altq_ifs_active);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altq_ifs_inactive);
 VNET_DEFINE(struct pf_kstatus,		 pf_status);
 
 VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
 VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
 VNET_DEFINE(int,			 altqs_inactive_open);
 VNET_DEFINE(u_int32_t,			 ticket_pabuf);
 
 VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
 #define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
 VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
 #define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
 VNET_DEFINE(int,			 pf_tcp_secret_init);
 #define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
 VNET_DEFINE(int,			 pf_tcp_iss_off);
 #define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)
 VNET_DECLARE(int,			 pf_vnet_active);
 #define	V_pf_vnet_active		 VNET(pf_vnet_active)
 
 VNET_DEFINE_STATIC(uint32_t, pf_purge_idx);
 #define V_pf_purge_idx	VNET(pf_purge_idx)
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 VNET_DEFINE_STATIC(uint32_t, pf_counter_periodic_iter);
 #define	V_pf_counter_periodic_iter	VNET(pf_counter_periodic_iter)
 
 VNET_DEFINE(struct allrulelist_head, pf_allrulelist);
 VNET_DEFINE(size_t, pf_allrulecount);
 VNET_DEFINE(struct pf_krule *, pf_rulemarker);
 #endif
 
 /*
  * Queue for pf_intr() sends.
  */
 static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
 /* One deferred packet on the per-vnet send queue (V_pf_sendqueue). */
 struct pf_send_entry {
 	STAILQ_ENTRY(pf_send_entry)	pfse_next;	/* queue linkage */
 	struct mbuf			*pfse_m;	/* packet; released with m_freem() in pf_cleanup() */
 	enum {
 		PFSE_IP,
 		PFSE_IP6,
 		PFSE_ICMP,
 		PFSE_ICMP6,
 	}				pfse_type;	/* kind of send requested */
 	/* presumably only meaningful for the PFSE_ICMP* kinds -- TODO confirm */
 	struct {
 		int		type;
 		int		code;
 		int		mtu;
 	} icmpopts;
 };
 
 STAILQ_HEAD(pf_send_head, pf_send_entry);
 VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue);
 #define	V_pf_sendqueue	VNET(pf_sendqueue)
 
 static struct mtx_padalign pf_sendqueue_mtx;
 MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
 #define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
 #define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)
 
 /*
  * Queue for pf_overload_task() tasks.
  */
 /*
  * One pending overload request, queued by pf_src_connlimit() and consumed
  * by pf_overload_task().
  */
 struct pf_overload_entry {
 	SLIST_ENTRY(pf_overload_entry)	next;	/* queue linkage */
 	struct pf_addr  		addr;	/* copied from the state's source node */
 	sa_family_t			af;	/* family of the state's wire key */
 	uint8_t				dir;	/* direction of the offending state */
 	struct pf_krule  		*rule;	/* rule supplying overload_tbl and flush */
 };
 
 SLIST_HEAD(pf_overload_head, pf_overload_entry);
 VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue);
 #define V_pf_overloadqueue	VNET(pf_overloadqueue)
 VNET_DEFINE_STATIC(struct task, pf_overloadtask);
 #define	V_pf_overloadtask	VNET(pf_overloadtask)
 
 static struct mtx_padalign pf_overloadqueue_mtx;
 MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
     "pf overload/flush queue", MTX_DEF);
 #define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
 #define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)
 
 VNET_DEFINE(struct pf_krulequeue, pf_unlinked_rules);
 struct mtx_padalign pf_unlnkdrules_mtx;
 MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
     MTX_DEF);
 
 struct sx pf_config_lock;
 SX_SYSINIT(pf_config_lock, &pf_config_lock, "pf config");
 
 struct mtx_padalign pf_table_stats_lock;
 MTX_SYSINIT(pf_table_stats_lock, &pf_table_stats_lock, "pf table stats",
     MTX_DEF);
 
 VNET_DEFINE_STATIC(uma_zone_t,	pf_sources_z);
 #define	V_pf_sources_z	VNET(pf_sources_z)
 uma_zone_t		pf_mtag_z;
 VNET_DEFINE(uma_zone_t,	 pf_state_z);
 VNET_DEFINE(uma_zone_t,	 pf_state_key_z);
 
 VNET_DEFINE(struct unrhdr64, pf_stateid);
 
 static void		 pf_src_tree_remove_state(struct pf_kstate *);
 static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
 			    u_int32_t);
 static void		 pf_add_threshold(struct pf_threshold *);
 static int		 pf_check_threshold(struct pf_threshold *);
 
 static void		 pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *,
 			    u_int16_t *, u_int16_t *, struct pf_addr *,
 			    u_int16_t, u_int8_t, sa_family_t);
 static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
 			    struct tcphdr *, struct pf_state_peer *);
 static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
 			    struct pf_addr *, struct pf_addr *, u_int16_t,
 			    u_int16_t *, u_int16_t *, u_int16_t *,
 			    u_int16_t *, u_int8_t, sa_family_t);
 static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
 			    sa_family_t, struct pf_krule *);
 static void		 pf_detach_state(struct pf_kstate *);
 static int		 pf_state_key_attach(struct pf_state_key *,
 			    struct pf_state_key *, struct pf_kstate *);
 static void		 pf_state_key_detach(struct pf_kstate *, int);
 static int		 pf_state_key_ctor(void *, int, void *, int);
 static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
 void			 pf_rule_to_actions(struct pf_krule *,
 			    struct pf_rule_actions *);
 static int		 pf_dummynet(struct pf_pdesc *, int, struct pf_kstate *,
 			    struct pf_krule *, struct mbuf **);
 static int		 pf_dummynet_route(struct pf_pdesc *, int,
 			    struct pf_kstate *, struct pf_krule *,
 			    struct ifnet *, struct sockaddr *, struct mbuf **);
 static int		 pf_test_eth_rule(int, struct pfi_kkif *,
 			    struct mbuf **);
 static int		 pf_test_rule(struct pf_krule **, struct pf_kstate **,
 			    int, struct pfi_kkif *, struct mbuf *, int,
 			    struct pf_pdesc *, struct pf_krule **,
 			    struct pf_kruleset **, struct inpcb *);
 static int		 pf_create_state(struct pf_krule *, struct pf_krule *,
 			    struct pf_krule *, struct pf_pdesc *,
 			    struct pf_ksrc_node *, struct pf_state_key *,
 			    struct pf_state_key *, struct mbuf *, int,
 			    u_int16_t, u_int16_t, int *, struct pfi_kkif *,
 			    struct pf_kstate **, int, u_int16_t, u_int16_t,
 			    int);
 static int		 pf_test_fragment(struct pf_krule **, int,
 			    struct pfi_kkif *, struct mbuf *, void *,
 			    struct pf_pdesc *, struct pf_krule **,
 			    struct pf_kruleset **);
 static int		 pf_tcp_track_full(struct pf_kstate **,
 			    struct pfi_kkif *, struct mbuf *, int,
 			    struct pf_pdesc *, u_short *, int *);
 static int		 pf_tcp_track_sloppy(struct pf_kstate **,
 			    struct pf_pdesc *, u_short *);
 static int		 pf_test_state_tcp(struct pf_kstate **, int,
 			    struct pfi_kkif *, struct mbuf *, int,
 			    void *, struct pf_pdesc *, u_short *);
 static int		 pf_test_state_udp(struct pf_kstate **, int,
 			    struct pfi_kkif *, struct mbuf *, int,
 			    void *, struct pf_pdesc *);
 static int		 pf_test_state_icmp(struct pf_kstate **, int,
 			    struct pfi_kkif *, struct mbuf *, int,
 			    void *, struct pf_pdesc *, u_short *);
 static int		 pf_test_state_other(struct pf_kstate **, int,
 			    struct pfi_kkif *, struct mbuf *, struct pf_pdesc *);
 static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
 				int, u_int16_t);
 static int		 pf_check_proto_cksum(struct mbuf *, int, int,
 			    u_int8_t, sa_family_t);
 static void		 pf_print_state_parts(struct pf_kstate *,
 			    struct pf_state_key *, struct pf_state_key *);
 static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
 			    struct pf_addr_wrap *);
 static void		 pf_patch_8(struct mbuf *, u_int16_t *, u_int8_t *, u_int8_t,
 			    bool, u_int8_t);
 static struct pf_kstate	*pf_find_state(struct pfi_kkif *,
 			    struct pf_state_key_cmp *, u_int);
 static int		 pf_src_connlimit(struct pf_kstate **);
 static void		 pf_overload_task(void *v, int pending);
 static int		 pf_insert_src_node(struct pf_ksrc_node **,
 			    struct pf_krule *, struct pf_addr *, sa_family_t);
 static u_int		 pf_purge_expired_states(u_int, int);
 static void		 pf_purge_unlinked_rules(void);
 static int		 pf_mtag_uminit(void *, int, int);
 static void		 pf_mtag_free(struct m_tag *);
 static void		 pf_packet_rework_nat(struct mbuf *, struct pf_pdesc *,
 			    int, struct pf_state_key *);
 #ifdef INET
 static void		 pf_route(struct mbuf **, struct pf_krule *, int,
 			    struct ifnet *, struct pf_kstate *,
 			    struct pf_pdesc *, struct inpcb *);
 #endif /* INET */
 #ifdef INET6
 static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
 			    struct pf_addr *, u_int8_t);
 static void		 pf_route6(struct mbuf **, struct pf_krule *, int,
 			    struct ifnet *, struct pf_kstate *,
 			    struct pf_pdesc *, struct inpcb *);
 #endif /* INET6 */
 static __inline void pf_set_protostate(struct pf_kstate *, int, u_int8_t);
 
 int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
 
 extern int pf_end_threads;
 extern struct proc *pf_purge_proc;
 
 VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
 
 /*
  * Rewrite the packet using one of the state's keys: the PF_SK_STACK key
  * when _dir is PF_OUT, the PF_SK_WIRE key otherwise, handing it to
  * pf_packet_rework_nat().  The macro declares a local named "nk", so the
  * arguments must not themselves refer to a caller variable of that name.
  */
 #define	PACKET_UNDO_NAT(_m, _pd, _off, _s, _dir)		\
 	do {								\
 		struct pf_state_key *nk;				\
 		if ((_dir) == PF_OUT)					\
 			nk = (_s)->key[PF_SK_STACK];			\
 		else							\
 			nk = (_s)->key[PF_SK_WIRE];			\
 		pf_packet_rework_nat(_m, _pd, _off, nk);		\
 	} while (0)
 
 /*
  * Non-zero when the packet descriptor has a pf mbuf tag and that tag has
  * PF_PACKET_LOOPED set.
  */
 #define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
 				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)
 
 /*
  * Look up a state with pf_find_state() and store it in s, firing the
  * pf:ip:state:lookup probe.
  *
  * WARNING: this macro returns from the *calling* function: PF_DROP when no
  * state is found, PF_PASS when the packet is marked as looped.  Execution
  * only continues past it with a non-NULL s and a non-looped packet.
  */
 #define	STATE_LOOKUP(i, k, d, s, pd)					\
 	do {								\
 		(s) = pf_find_state((i), (k), (d));			\
 		SDT_PROBE5(pf, ip, state, lookup, i, k, d, pd, (s));	\
 		if ((s) == NULL)					\
 			return (PF_DROP);				\
 		if (PACKET_LOOPED(pd))					\
 			return (PF_PASS);				\
 	} while (0)
 
 /*
  * Interface a new state should be bound to: k when rule r has
  * PFRULE_IFBOUND set, V_pfi_all otherwise.  The whole conditional is
  * parenthesized so the macro behaves as a single operand inside larger
  * expressions.
  */
 #define	BOUND_IFACE(r, k) \
 	(((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all)
 
 /*
  * Account a new state against its rule: bump states_cur and states_tot on
  * the matching rule and, when present, on the anchor and NAT rules.
  * The argument is parenthesized so any pointer expression may be passed;
  * it is still evaluated several times, so it must be free of side effects.
  */
 #define	STATE_INC_COUNTERS(s)						\
 	do {								\
 		counter_u64_add((s)->rule.ptr->states_cur, 1);		\
 		counter_u64_add((s)->rule.ptr->states_tot, 1);		\
 		if ((s)->anchor.ptr != NULL) {				\
 			counter_u64_add((s)->anchor.ptr->states_cur, 1);\
 			counter_u64_add((s)->anchor.ptr->states_tot, 1);\
 		}							\
 		if ((s)->nat_rule.ptr != NULL) {			\
 			counter_u64_add((s)->nat_rule.ptr->states_cur, 1);\
 			counter_u64_add((s)->nat_rule.ptr->states_tot, 1);\
 		}							\
 	} while (0)
 
 /*
  * Undo the states_cur accounting for a state: decrement it on the NAT rule
  * and the anchor when present, and on the matching rule.  states_tot is
  * left untouched.  The argument is parenthesized so any pointer expression
  * may be passed; it is evaluated several times, so it must be free of side
  * effects.
  */
 #define	STATE_DEC_COUNTERS(s)						\
 	do {								\
 		if ((s)->nat_rule.ptr != NULL)				\
 			counter_u64_add((s)->nat_rule.ptr->states_cur, -1);\
 		if ((s)->anchor.ptr != NULL)				\
 			counter_u64_add((s)->anchor.ptr->states_cur, -1);\
 		counter_u64_add((s)->rule.ptr->states_cur, -1);		\
 	} while (0)
 
 MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
 VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
 VNET_DEFINE(struct pf_idhash *, pf_idhash);
 VNET_DEFINE(struct pf_srchash *, pf_srchash);
 
 SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "pf(4)");
 
 u_long	pf_hashmask;
 u_long	pf_srchashmask;
 static u_long	pf_hashsize;
 static u_long	pf_srchashsize;
 u_long	pf_ioctl_maxcount = 65535;
 
 SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
     &pf_hashsize, 0, "Size of pf(4) states hashtable");
 SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
     &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable");
 SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN,
     &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call");
 
 VNET_DEFINE(void *, pf_swi_cookie);
 VNET_DEFINE(struct intr_event *, pf_swi_ie);
 
 VNET_DEFINE(uint32_t, pf_hashseed);
 #define	V_pf_hashseed	VNET(pf_hashseed)
 
 /*
  * Three-way comparison of two pf addresses of family af.
  *
  * AF_INET compares addr32[0] only; AF_INET6 compares addr32[3] down to
  * addr32[0], the first differing word deciding.  Returns 1 when a is
  * greater, -1 when it is smaller and 0 when equal.  Panics on any other
  * address family.
  */
 int
 pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
 	int word;
 
 	/* Pick the highest-indexed word that takes part in the comparison. */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		word = 0;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		word = 3;
 		break;
 #endif /* INET6 */
 	default:
 		panic("%s: unknown address family %u", __func__, af);
 	}
 
 	for (; word >= 0; word--) {
 		if (a->addr32[word] != b->addr32[word])
 			return (a->addr32[word] > b->addr32[word] ? 1 : -1);
 	}
 	return (0);
 }
 
 /*
  * Rewrite the addresses (and ports / ICMP id) of a packet so they match
  * state key nk.
  *
  * m   - packet whose transport header is written back with m_copyback()
  * pd  - packet descriptor; supplies the parsed header copy, src/dst
  *       address pointers, checksum pointers and the sidx/didx indices
  * off - offset handed to m_copyback() for the transport header
  * nk  - state key holding the addresses/ports to install
  *
  * Each side is only rewritten when PF_ANEQ() reports that it differs from
  * the key.
  */
 static void
 pf_packet_rework_nat(struct mbuf *m, struct pf_pdesc *pd, int off,
 	struct pf_state_key *nk)
 {
 
 	switch (pd->proto) {
 	case IPPROTO_TCP: {
 		struct tcphdr *th = &pd->hdr.tcp;
 
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af))
 			pf_change_ap(m, pd->src, &th->th_sport, pd->ip_sum,
 			    &th->th_sum, &nk->addr[pd->sidx],
 			    nk->port[pd->sidx], 0, pd->af);
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af))
 			pf_change_ap(m, pd->dst, &th->th_dport, pd->ip_sum,
 			    &th->th_sum, &nk->addr[pd->didx],
 			    nk->port[pd->didx], 0, pd->af);
 		/* The header copy in pd is written back unconditionally. */
 		m_copyback(m, off, sizeof(*th), (caddr_t)th);
 		break;
 	}
 	case IPPROTO_UDP: {
 		struct udphdr *uh = &pd->hdr.udp;
 
 		/* Same as TCP, but pf_change_ap() is passed 1 instead of 0. */
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af))
 			pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
 			    &uh->uh_sum, &nk->addr[pd->sidx],
 			    nk->port[pd->sidx], 1, pd->af);
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af))
 			pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
 			    &uh->uh_sum, &nk->addr[pd->didx],
 			    nk->port[pd->didx], 1, pd->af);
 		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
 		break;
 	}
 	case IPPROTO_ICMP: {
 		struct icmp *ih = &pd->hdr.icmp;
 
 		/* The ICMP id is compared against the key's source-side port. */
 		if (nk->port[pd->sidx] != ih->icmp_id) {
 			pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
 			    ih->icmp_cksum, ih->icmp_id,
 			    nk->port[pd->sidx], 0);
 			ih->icmp_id = nk->port[pd->sidx];
 			pd->sport = &ih->icmp_id;
 
 			m_copyback(m, off, ICMP_MINLEN, (caddr_t)ih);
 		}
 		/* Addresses are handled by the generic code below. */
 		/* FALLTHROUGH */
 	}
 	default:
 		/*
 		 * Address-only rewrite: AF_INET goes through pf_change_a()
 		 * with pd->ip_sum, AF_INET6 is a plain PF_ACPY().
 		 */
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af)) {
 			switch (pd->af) {
 			case AF_INET:
 				pf_change_a(&pd->src->v4.s_addr,
 				    pd->ip_sum, nk->addr[pd->sidx].v4.s_addr,
 				    0);
 				break;
 			case AF_INET6:
 				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
 				break;
 			}
 		}
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af)) {
 			switch (pd->af) {
 			case AF_INET:
 				pf_change_a(&pd->dst->v4.s_addr,
 				    pd->ip_sum, nk->addr[pd->didx].v4.s_addr,
 				    0);
 				break;
 			case AF_INET6:
 				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
 				break;
 			}
 		}
 		break;
 	}
 }
 
 /*
  * Hash row index for a state key: murmur3 over the leading
  * sizeof(struct pf_state_key_cmp) bytes of the key, seeded with the
  * per-vnet hash seed and masked with pf_hashmask.
  */
 static __inline uint32_t
 pf_hashkey(struct pf_state_key *sk)
 {
 
 	return (murmur3_32_hash32((uint32_t *)sk,
 	    sizeof(struct pf_state_key_cmp) / sizeof(uint32_t),
 	    V_pf_hashseed) & pf_hashmask);
 }
 
 /*
  * Hash row index for a source node address: murmur3 over the v4 or v6
  * address words, seeded with the per-vnet hash seed and masked with
  * pf_srchashmask.  Panics on any family other than AF_INET / AF_INET6.
  */
 static __inline uint32_t
 pf_hashsrc(struct pf_addr *addr, sa_family_t af)
 {
 	uint32_t *words;
 	size_t nwords;
 
 	if (af == AF_INET) {
 		words = (uint32_t *)&addr->v4;
 		nwords = sizeof(addr->v4) / sizeof(uint32_t);
 	} else if (af == AF_INET6) {
 		words = (uint32_t *)&addr->v6;
 		nwords = sizeof(addr->v6) / sizeof(uint32_t);
 	} else
 		panic("%s: unknown address family %u", __func__, af);
 
 	return (murmur3_32_hash32(words, nwords, V_pf_hashseed) &
 	    pf_srchashmask);
 }
 
 #ifdef ALTQ
 /*
  * Non-zero hash of a state: the state's address divided by its size,
  * XORed with the CRC32 of both peers.  A result of 0 is replaced by 1.
  */
 static int
 pf_state_hash(struct pf_kstate *s)
 {
 	u_int32_t mixed;
 
 	mixed = (intptr_t)s / sizeof(*s);
 	mixed ^= crc32(&s->src, sizeof(s->src));
 	mixed ^= crc32(&s->dst, sizeof(s->dst));
 
 	return (mixed != 0 ? mixed : 1);
 }
 #endif
 
 /*
  * Set the protocol state of one or both peers of a state.
  *
  * which selects the peer(s): PF_PEER_DST updates only dst, PF_PEER_BOTH
  * updates dst and src, anything else updates only src.  When the src state
  * of a locally created TCP state moves from "neither established nor
  * closed" to "established or closed", the half-open state counter is
  * decremented.
  */
 static __inline void
 pf_set_protostate(struct pf_kstate *s, int which, u_int8_t newstate)
 {
 	bool was_settled, now_settled;
 
 	if (which == PF_PEER_DST) {
 		s->dst.state = newstate;
 		return;
 	}
 	if (which == PF_PEER_BOTH)
 		s->dst.state = newstate;
 
 	/* Nothing to account for when src does not actually change. */
 	if (s->src.state == newstate)
 		return;
 
 	was_settled = TCPS_HAVEESTABLISHED(s->src.state) ||
 	    s->src.state == TCPS_CLOSED;
 	now_settled = TCPS_HAVEESTABLISHED(newstate) ||
 	    newstate == TCPS_CLOSED;
 
 	if (s->creatorid == V_pf_status.hostid &&
 	    s->key[PF_SK_STACK] != NULL &&
 	    s->key[PF_SK_STACK]->proto == IPPROTO_TCP &&
 	    !was_settled && now_settled)
 		atomic_add_32(&V_pf_status.states_halfopen, -1);
 
 	s->src.state = newstate;
 }
 
 #ifdef INET6
 /*
  * Copy a pf address word by word: one 32-bit word for AF_INET (when INET
  * is compiled in), four for AF_INET6.  Any other family leaves dst
  * untouched.
  */
 void
 pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
 {
 	int nwords, i;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		nwords = 1;
 		break;
 #endif /* INET */
 	case AF_INET6:
 		nwords = 4;
 		break;
 	default:
 		return;
 	}
 
 	for (i = 0; i < nwords; i++)
 		dst->addr32[i] = src->addr32[i];
 }
 #endif /* INET6 */
 
 /*
  * Reset a rate threshold: zero the running count, stamp it with the
  * current uptime and store the window length and the limit (scaled by
  * PF_THRESHOLD_MULT).
  */
 static void
 pf_init_threshold(struct pf_threshold *threshold,
     u_int32_t limit, u_int32_t seconds)
 {
 
 	threshold->count = 0;
 	threshold->last = time_uptime;
 	threshold->seconds = seconds;
 	threshold->limit = limit * PF_THRESHOLD_MULT;
 }
 
 /*
  * Record one event against a rate threshold.
  *
  * The running count is first aged: it is cleared when at least a full
  * window has elapsed since the last event, otherwise it is reduced in
  * proportion to the elapsed part of the window.  PF_THRESHOLD_MULT is then
  * added and the timestamp refreshed.
  */
 static void
 pf_add_threshold(struct pf_threshold *threshold)
 {
 	u_int32_t now, elapsed;
 
 	now = time_uptime;
 	elapsed = now - threshold->last;
 
 	if (elapsed < threshold->seconds)
 		threshold->count -= threshold->count * elapsed /
 		    threshold->seconds;
 	else
 		threshold->count = 0;
 
 	threshold->count += PF_THRESHOLD_MULT;
 	threshold->last = now;
 }
 
 /* Returns non-zero when the running count has exceeded the limit. */
 static int
 pf_check_threshold(struct pf_threshold *threshold)
 {
 
 	return (threshold->limit < threshold->count);
 }
 
 static int
 pf_src_connlimit(struct pf_kstate **state)
 {
 	struct pf_overload_entry *pfoe;
 	int bad = 0;
 
 	PF_STATE_LOCK_ASSERT(*state);
 
 	(*state)->src_node->conn++;
 	(*state)->src.tcp_est = 1;
 	pf_add_threshold(&(*state)->src_node->conn_rate);
 
 	if ((*state)->rule.ptr->max_src_conn &&
 	    (*state)->rule.ptr->max_src_conn <
 	    (*state)->src_node->conn) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
 		bad++;
 	}
 
 	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
 	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
 		bad++;
 	}
 
 	if (!bad)
 		return (0);
 
 	/* Kill this state. */
 	(*state)->timeout = PFTM_PURGE;
 	pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED);
 
 	if ((*state)->rule.ptr->overload_tbl == NULL)
 		return (1);
 
 	/* Schedule overloading and flushing task. */
 	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
 	if (pfoe == NULL)
 		return (1);	/* too bad :( */
 
 	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
 	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
 	pfoe->rule = (*state)->rule.ptr;
 	pfoe->dir = (*state)->direction;
 	PF_OVERLOADQ_LOCK();
 	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
 	PF_OVERLOADQ_UNLOCK();
 	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);
 
 	return (1);
 }
 
 static void
 pf_overload_task(void *v, int pending)
 {
 	struct pf_overload_head queue;
 	struct pfr_addr p;
 	struct pf_overload_entry *pfoe, *pfoe1;
 	uint32_t killed = 0;
 
 	CURVNET_SET((struct vnet *)v);
 
 	PF_OVERLOADQ_LOCK();
 	queue = V_pf_overloadqueue;
 	SLIST_INIT(&V_pf_overloadqueue);
 	PF_OVERLOADQ_UNLOCK();
 
 	bzero(&p, sizeof(p));
 	SLIST_FOREACH(pfoe, &queue, next) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("%s: blocking address ", __func__);
 			pf_print_host(&pfoe->addr, 0, pfoe->af);
 			printf("\n");
 		}
 
 		p.pfra_af = pfoe->af;
 		switch (pfoe->af) {
 #ifdef INET
 		case AF_INET:
 			p.pfra_net = 32;
 			p.pfra_ip4addr = pfoe->addr.v4;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			p.pfra_net = 128;
 			p.pfra_ip6addr = pfoe->addr.v6;
 			break;
 #endif
 		}
 
 		PF_RULES_WLOCK();
 		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
 		PF_RULES_WUNLOCK();
 	}
 
 	/*
 	 * Remove those entries, that don't need flushing.
 	 */
 	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
 		if (pfoe->rule->flush == 0) {
 			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
 			free(pfoe, M_PFTEMP);
 		} else
 			counter_u64_add(
 			    V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);
 
 	/* If nothing to flush, return. */
 	if (SLIST_EMPTY(&queue)) {
 		CURVNET_RESTORE();
 		return;
 	}
 
 	for (int i = 0; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 		struct pf_state_key *sk;
 		struct pf_kstate *s;
 
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
 		    sk = s->key[PF_SK_WIRE];
 		    SLIST_FOREACH(pfoe, &queue, next)
 			if (sk->af == pfoe->af &&
 			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
 			    pfoe->rule == s->rule.ptr) &&
 			    ((pfoe->dir == PF_OUT &&
 			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
 			    (pfoe->dir == PF_IN &&
 			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
 				s->timeout = PFTM_PURGE;
 				pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED);
 				killed++;
 			}
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
 		free(pfoe, M_PFTEMP);
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("%s: %u states killed", __func__, killed);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Look up the source node for (src, rule, af).
  *
  * On a hit the node's states count is incremented and the hash row is
  * unlocked before returning the node.  On a miss NULL is returned; when
  * returnlocked is non-zero the hash row is left locked, so that the
  * caller can consistently allocate and insert a new node.
  */
 struct pf_ksrc_node *
 pf_find_src_node(struct pf_addr *src, struct pf_krule *rule, sa_family_t af,
 	int returnlocked)
 {
 	struct pf_srchash *sh;
 	struct pf_ksrc_node *n;
 
 	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
 
 	sh = &V_pf_srchash[pf_hashsrc(src, af)];
 	PF_HASHROW_LOCK(sh);
 	LIST_FOREACH(n, &sh->nodes, entry) {
 		if (n->rule.ptr != rule || n->af != af)
 			continue;
 		if (af == AF_INET) {
 			if (n->addr.v4.s_addr == src->v4.s_addr)
 				break;
 		} else if (af == AF_INET6) {
 			if (bcmp(&n->addr, src, sizeof(*src)) == 0)
 				break;
 		}
 	}
 
 	if (n != NULL)
 		n->states++;
 	if (n != NULL || returnlocked == 0)
 		PF_HASHROW_UNLOCK(sh);
 
 	return (n);
 }
 
 /*
  * Release a source node: free both bytes[] and both packets[] counters,
  * then return the node itself to the source node zone.
  */
 static void
 pf_free_src_node(struct pf_ksrc_node *sn)
 {
 	int idx = 0;
 
 	while (idx < 2) {
 		counter_u64_free(sn->bytes[idx]);
 		counter_u64_free(sn->packets[idx]);
 		idx++;
 	}
 	uma_zfree(V_pf_sources_z, sn);
 }
 
 /*
  * Find or create the source node for (rule, src, af) and hand it back
  * through *sn.
  *
  * sn   - in/out; when *sn is NULL the node is looked up and, if missing,
  *        allocated and inserted into the source hash
  * rule - rule doing the tracking; must have PFRULE_SRCTRACK or a
  *        sticky-address pool
  * src  - address to track
  * af   - address family of src
  *
  * Returns 0 on success.  Returns -1 when the rule's max_src_nodes or
  * max_src_states limit is hit or when an allocation fails.  On every
  * failure of the creation path *sn is NULL, so the caller is never left
  * holding a pointer to a node that has already been freed.
  */
 static int
 pf_insert_src_node(struct pf_ksrc_node **sn, struct pf_krule *rule,
     struct pf_addr *src, sa_family_t af)
 {
 
 	KASSERT((rule->rule_flag & PFRULE_SRCTRACK ||
 	    rule->rpool.opts & PF_POOL_STICKYADDR),
 	    ("%s for non-tracking rule %p", __func__, rule));
 
 	/* A miss returns with the hash row locked (returnlocked == 1). */
 	if (*sn == NULL)
 		*sn = pf_find_src_node(src, rule, af, 1);
 
 	if (*sn == NULL) {
 		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];
 
 		PF_HASHROW_ASSERT(sh);
 
 		if (!rule->max_src_nodes ||
 		    counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
 			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
 		else
 			counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES],
 			    1);
 		if ((*sn) == NULL) {
 			PF_HASHROW_UNLOCK(sh);
 			return (-1);
 		}
 
 		for (int i = 0; i < 2; i++) {
 			(*sn)->bytes[i] = counter_u64_alloc(M_NOWAIT);
 			(*sn)->packets[i] = counter_u64_alloc(M_NOWAIT);
 
 			if ((*sn)->bytes[i] == NULL || (*sn)->packets[i] == NULL) {
 				pf_free_src_node(*sn);
 				/*
 				 * The node is gone; do not leave a dangling
 				 * pointer behind for the caller's cleanup.
 				 */
 				*sn = NULL;
 				PF_HASHROW_UNLOCK(sh);
 				return (-1);
 			}
 		}
 
 		pf_init_threshold(&(*sn)->conn_rate,
 		    rule->max_src_conn_rate.limit,
 		    rule->max_src_conn_rate.seconds);
 
 		(*sn)->af = af;
 		(*sn)->rule.ptr = rule;
 		PF_ACPY(&(*sn)->addr, src, af);
 		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
 		(*sn)->creation = time_uptime;
 		(*sn)->ruletype = rule->action;
 		/* The new node starts out referenced by one state. */
 		(*sn)->states = 1;
 		if ((*sn)->rule.ptr != NULL)
 			counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
 		PF_HASHROW_UNLOCK(sh);
 		counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
 	} else {
 		/* Existing node: only the per-source state limit applies. */
 		if (rule->max_src_states &&
 		    (*sn)->states >= rule->max_src_states) {
 			counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
 			    1);
 			return (-1);
 		}
 	}
 	return (0);
 }
 
 void
 pf_unlink_src_node(struct pf_ksrc_node *src)
 {
 
 	PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]);
 	LIST_REMOVE(src, entry);
 	if (src->rule.ptr)
 		counter_u64_add(src->rule.ptr->src_nodes, -1);
 }
 
 u_int
 pf_free_src_nodes(struct pf_ksrc_node_list *head)
 {
 	struct pf_ksrc_node *sn, *tmp;
 	u_int count = 0;
 
 	LIST_FOREACH_SAFE(sn, head, entry, tmp) {
 		pf_free_src_node(sn);
 		count++;
 	}
 
 	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], count);
 
 	return (count);
 }
 
 void
 pf_mtag_initialize(void)
 {
 
 	pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
 	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
 	    UMA_ALIGN_PTR, 0);
 }
 
 /*
  * Per-vnet data storage structures initialization.
  *
  * Sets up the state and state-key zones, the key/id and source-node hash
  * tables with their per-row mutexes, the ALTQ lists, and the send and
  * overload queues.  The hash sizes fall back to the compiled-in defaults
  * when unset or not a power of two, and again when the non-sleeping
  * allocation of a table fails.
  */
 void
 pf_initialize(void)
 {
 	struct pf_keyhash	*kh;
 	struct pf_idhash	*ih;
 	struct pf_srchash	*sh;
 	u_int i;
 
 	/* The sizes are turned into masks below, so they must be 2^n. */
 	if (pf_hashsize == 0 || !powerof2(pf_hashsize))
 		pf_hashsize = PF_HASHSIZ;
 	if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
 		pf_srchashsize = PF_SRCHASHSIZ;
 
 	V_pf_hashseed = arc4random();
 
 	/* States and state keys storage. */
 	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_kstate),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
 	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
 	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");
 
 	V_pf_state_key_z = uma_zcreate("pf state keys",
 	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 
 	/* First try without sleeping at the requested size ... */
 	V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash),
 	    M_PFHASH, M_NOWAIT | M_ZERO);
 	V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash),
 	    M_PFHASH, M_NOWAIT | M_ZERO);
 	if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
 		printf("pf: Unable to allocate memory for "
 		    "state_hashsize %lu.\n", pf_hashsize);
 
 		free(V_pf_keyhash, M_PFHASH);
 		free(V_pf_idhash, M_PFHASH);
 
 		/* ... then retry at the default size with M_WAITOK. */
 		pf_hashsize = PF_HASHSIZ;
 		V_pf_keyhash = mallocarray(pf_hashsize,
 		    sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
 		V_pf_idhash = mallocarray(pf_hashsize,
 		    sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
 	}
 
 	pf_hashmask = pf_hashsize - 1;
 	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
 	    i++, kh++, ih++) {
 		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
 		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
 	}
 
 	/* Source nodes. */
 	V_pf_sources_z = uma_zcreate("pf source nodes",
 	    sizeof(struct pf_ksrc_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    0);
 	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
 	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
 	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
 
 	/* Same two-step allocation strategy as for the state hashes. */
 	V_pf_srchash = mallocarray(pf_srchashsize,
 	    sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
 	if (V_pf_srchash == NULL) {
 		printf("pf: Unable to allocate memory for "
 		    "source_hashsize %lu.\n", pf_srchashsize);
 
 		pf_srchashsize = PF_SRCHASHSIZ;
 		V_pf_srchash = mallocarray(pf_srchashsize,
 		    sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
 	}
 
 	pf_srchashmask = pf_srchashsize - 1;
 	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
 		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
 
 	/* ALTQ */
 	TAILQ_INIT(&V_pf_altqs[0]);
 	TAILQ_INIT(&V_pf_altqs[1]);
 	TAILQ_INIT(&V_pf_altqs[2]);
 	TAILQ_INIT(&V_pf_altqs[3]);
 	TAILQ_INIT(&V_pf_pabuf);
 	V_pf_altqs_active = &V_pf_altqs[0];
 	V_pf_altq_ifs_active = &V_pf_altqs[1];
 	V_pf_altqs_inactive = &V_pf_altqs[2];
 	V_pf_altq_ifs_inactive = &V_pf_altqs[3];
 
 	/* Send & overload+flush queues. */
 	STAILQ_INIT(&V_pf_sendqueue);
 	SLIST_INIT(&V_pf_overloadqueue);
 	/* The task carries curvnet so pf_overload_task() can restore it. */
 	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
 
 	/* Unlinked, but may be referenced rules. */
 	TAILQ_INIT(&V_pf_unlinked_rules);
 }
 
 /*
  * Destroy the UMA zone that pf mbuf tags are allocated from.
  */
 void
 pf_mtag_cleanup(void)
 {
 	uma_zdestroy(pf_mtag_z);
 }
 
 void
 pf_cleanup(void)
 {
 	struct pf_keyhash	*kh;
 	struct pf_idhash	*ih;
 	struct pf_srchash	*sh;
 	struct pf_send_entry	*pfse, *next;
 	u_int i;
 
 	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
 	    i++, kh++, ih++) {
 		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
 		    __func__));
 		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
 		    __func__));
 		mtx_destroy(&kh->lock);
 		mtx_destroy(&ih->lock);
 	}
 	free(V_pf_keyhash, M_PFHASH);
 	free(V_pf_idhash, M_PFHASH);
 
 	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
 		KASSERT(LIST_EMPTY(&sh->nodes),
 		    ("%s: source node hash not empty", __func__));
 		mtx_destroy(&sh->lock);
 	}
 	free(V_pf_srchash, M_PFHASH);
 
 	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
 		m_freem(pfse->pfse_m);
 		free(pfse, M_PFTEMP);
 	}
 
 	uma_zdestroy(V_pf_sources_z);
 	uma_zdestroy(V_pf_state_z);
 	uma_zdestroy(V_pf_state_key_z);
 }
 
 /*
  * Fill in the m_tag header located at the start of 'mem' so that the item
  * is a ready-to-use PACKET_TAG_PF tag whose free routine is pf_mtag_free().
  * 'size' and 'how' are not used.  Always returns 0.
  */
 static int
 pf_mtag_uminit(void *mem, int size, int how)
 {
 	struct m_tag *tag = mem;
 
 	tag->m_tag_cookie = MTAG_ABI_COMPAT;
 	tag->m_tag_id = PACKET_TAG_PF;
 	tag->m_tag_len = sizeof(struct pf_mtag);
 	tag->m_tag_free = pf_mtag_free;
 
 	return (0);
 }
 
 /*
  * m_tag free routine for pf tags: hand the tag back to the pf mtag zone.
  */
 static void
 pf_mtag_free(struct m_tag *t)
 {
 	uma_zfree(pf_mtag_z, t);
 }
 
 /*
  * Return the pf tag payload attached to mbuf 'm', creating it if needed.
  *
  * An existing PACKET_TAG_PF tag is reused as is.  Otherwise a new tag is
  * allocated without sleeping, its payload is zeroed and the tag is
  * prepended to the mbuf.  Returns NULL if the allocation fails.
  */
 struct pf_mtag *
 pf_get_mtag(struct mbuf *m)
 {
 	struct m_tag *tag;
 
 	tag = m_tag_find(m, PACKET_TAG_PF, NULL);
 	if (tag == NULL) {
 		tag = uma_zalloc(pf_mtag_z, M_NOWAIT);
 		if (tag == NULL)
 			return (NULL);
 		/* The payload lives right behind the m_tag header. */
 		bzero(tag + 1, sizeof(struct pf_mtag));
 		m_tag_prepend(m, tag);
 	}
 
 	return ((struct pf_mtag *)(tag + 1));
 }
 
 /*
  * Link state 's' to its wire key 'skw' and its stack key 'sks' in the key
  * hash.
  *
  * For each key the hash row is searched for an entry whose
  * pf_state_key_cmp part is identical.  If one is found, the passed-in key
  * is freed and the state is attached to the existing key; otherwise the
  * passed-in key itself is inserted into the row.  skw and sks may be the
  * same pointer, in which case the state is put on both per-index lists of
  * that single key.
  *
  * Returns 0 with the key rows unlocked and the state locked (see the
  * locking comment below), or EEXIST when a state with the same key, kif
  * and direction already exists and cannot be pushed aside.
  */
 static int
 pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
     struct pf_kstate *s)
 {
 	struct pf_keyhash	*khs, *khw, *kh;
 	struct pf_state_key	*sk, *cur;
 	struct pf_kstate	*si, *olds = NULL;
 	int idx;
 
 	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
 	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
 	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
 
 	/*
 	 * We need to lock hash slots of both keys. To avoid deadlock
 	 * we always lock the slot with lower address first. Unlock order
 	 * isn't important.
 	 *
 	 * We also need to lock ID hash slot before dropping key
 	 * locks. On success we return with ID hash slot locked.
 	 */
 
 	if (skw == sks) {
 		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
 		PF_HASHROW_LOCK(khs);
 	} else {
 		khs = &V_pf_keyhash[pf_hashkey(sks)];
 		khw = &V_pf_keyhash[pf_hashkey(skw)];
 		if (khs == khw) {
 			/* Distinct keys that hash to the same row: one lock. */
 			PF_HASHROW_LOCK(khs);
 		} else if (khs < khw) {
 			PF_HASHROW_LOCK(khs);
 			PF_HASHROW_LOCK(khw);
 		} else {
 			PF_HASHROW_LOCK(khw);
 			PF_HASHROW_LOCK(khs);
 		}
 	}
 
 /* Drop the key row lock(s) taken above; a shared row is unlocked once. */
 #define	KEYS_UNLOCK()	do {			\
 	if (khs != khw) {			\
 		PF_HASHROW_UNLOCK(khs);		\
 		PF_HASHROW_UNLOCK(khw);		\
 	} else					\
 		PF_HASHROW_UNLOCK(khs);		\
 } while (0)
 
 	/*
 	 * First run: start with wire key.
 	 */
 	sk = skw;
 	kh = khw;
 	idx = PF_SK_WIRE;
 
 	/* Remember which ID hash row lock protects this state. */
 	MPASS(s->lock == NULL);
 	s->lock = &V_pf_idhash[PF_IDHASH(s)].lock;
 
 keyattach:
 	/* Look for a key with the same comparable part in this row. */
 	LIST_FOREACH(cur, &kh->keys, entry)
 		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
 
 	if (cur != NULL) {
 		/* Key exists. Check for same kif, if none, add to key. */
 		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
 			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
 
 			PF_HASHROW_LOCK(ih);
 			if (si->kif == s->kif &&
 			    si->direction == s->direction) {
 				if (sk->proto == IPPROTO_TCP &&
 				    si->src.state >= TCPS_FIN_WAIT_2 &&
 				    si->dst.state >= TCPS_FIN_WAIT_2) {
 					/*
 					 * New state matches an old >FIN_WAIT_2
 					 * state. We can't drop key hash locks,
 					 * thus we can't unlink it properly.
 					 *
 					 * As a workaround we drop it into
 					 * TCPS_CLOSED state, schedule purge
 					 * ASAP and push it into the very end
 					 * of the slot TAILQ, so that it won't
 					 * conflict with our new state.
 					 */
 					pf_set_protostate(si, PF_PEER_BOTH,
 					    TCPS_CLOSED);
 					si->timeout = PFTM_PURGE;
 					olds = si;
 				} else {
 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
 						printf("pf: %s key attach "
 						    "failed on %s: ",
 						    (idx == PF_SK_WIRE) ?
 						    "wire" : "stack",
 						    s->kif->pfik_name);
 						pf_print_state_parts(s,
 						    (idx == PF_SK_WIRE) ?
 						    sk : NULL,
 						    (idx == PF_SK_STACK) ?
 						    sk : NULL);
 						printf(", existing: ");
 						pf_print_state_parts(si,
 						    (idx == PF_SK_WIRE) ?
 						    sk : NULL,
 						    (idx == PF_SK_STACK) ?
 						    sk : NULL);
 						printf("\n");
 					}
 					/*
 					 * Collision: free the key being
 					 * attached.  If the wire key was
 					 * already attached (we are on the
 					 * stack run), undo that as well.
 					 *
 					 * NOTE(review): when the collision
 					 * happens on the wire run with
 					 * sks != skw, sks is neither attached
 					 * nor freed in this function --
 					 * confirm the caller releases it.
 					 */
 					PF_HASHROW_UNLOCK(ih);
 					KEYS_UNLOCK();
 					uma_zfree(V_pf_state_key_z, sk);
 					if (idx == PF_SK_STACK)
 						pf_detach_state(s);
 					return (EEXIST); /* collision! */
 				}
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 		/* Reuse the existing key, the passed-in copy is redundant. */
 		uma_zfree(V_pf_state_key_z, sk);
 		s->key[idx] = cur;
 	} else {
 		LIST_INSERT_HEAD(&kh->keys, sk, entry);
 		s->key[idx] = sk;
 	}
 
 stateattach:
 	/* List is sorted, if-bound states before floating. */
 	if (s->kif == V_pfi_all)
 		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
 	else
 		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
 
 	/* Move the closed-out old state behind the one just inserted. */
 	if (olds) {
 		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
 		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
 		    key_list[idx]);
 		olds = NULL;
 	}
 
 	/*
 	 * Attach done. See how should we (or should not?)
 	 * attach a second key.
 	 */
 	if (sks == skw) {
 		/* Single shared key: only the stack list is left to do. */
 		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
 		idx = PF_SK_STACK;
 		sks = NULL;
 		goto stateattach;
 	} else if (sks != NULL) {
 		/*
 		 * Continue attaching with stack key.
 		 */
 		sk = sks;
 		kh = khs;
 		idx = PF_SK_STACK;
 		sks = NULL;
 		goto keyattach;
 	}
 
 	/* Take the state lock before the key rows are released. */
 	PF_STATE_LOCK(s);
 	KEYS_UNLOCK();
 
 	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
 	    ("%s failure", __func__));
 
 	return (0);
 #undef	KEYS_UNLOCK
 }
 
 /*
  * Detach state 's' from whichever of its stack and wire keys are still
  * set.  Each detach is done with the key hash row of the respective key
  * locked; the locks are taken and dropped inside this function.
  */
 static void
 pf_detach_state(struct pf_kstate *s)
 {
 	struct pf_state_key *sks = s->key[PF_SK_STACK];
 	struct pf_keyhash *kh;
 
 	if (sks != NULL) {
 		kh = &V_pf_keyhash[pf_hashkey(sks)];
 		PF_HASHROW_LOCK(kh);
 		/* Re-check now that the row lock is held. */
 		if (s->key[PF_SK_STACK] != NULL)
 			pf_state_key_detach(s, PF_SK_STACK);
 		/*
 		 * If both point to same key, then we are done.
 		 * (sks is only compared by address here, never dereferenced.)
 		 */
 		if (sks == s->key[PF_SK_WIRE]) {
 			pf_state_key_detach(s, PF_SK_WIRE);
 			PF_HASHROW_UNLOCK(kh);
 			return;
 		}
 		PF_HASHROW_UNLOCK(kh);
 	}
 
 	/* Separate wire key: it may live in a different hash row. */
 	if (s->key[PF_SK_WIRE] != NULL) {
 		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
 		PF_HASHROW_LOCK(kh);
 		if (s->key[PF_SK_WIRE] != NULL)
 			pf_state_key_detach(s, PF_SK_WIRE);
 		PF_HASHROW_UNLOCK(kh);
 	}
 }
 
 /*
  * Take state 's' off the states[idx] list of its key and clear
  * s->key[idx].  Once both state lists of the key are empty the key is
  * removed from its hash row and freed.  With INVARIANTS the key's hash
  * row is asserted via PF_HASHROW_ASSERT().
  */
 static void
 pf_state_key_detach(struct pf_kstate *s, int idx)
 {
 	struct pf_state_key *key = s->key[idx];
 #ifdef INVARIANTS
 	struct pf_keyhash *row = &V_pf_keyhash[pf_hashkey(key)];
 
 	PF_HASHROW_ASSERT(row);
 #endif
 	TAILQ_REMOVE(&key->states[idx], s, key_list[idx]);
 	s->key[idx] = NULL;
 
 	/* Still referenced by some state on either list: keep the key. */
 	if (!TAILQ_EMPTY(&key->states[0]) || !TAILQ_EMPTY(&key->states[1]))
 		return;
 
 	LIST_REMOVE(key, entry);
 	uma_zfree(V_pf_state_key_z, key);
 }
 
 /*
  * Constructor for state key items: zero the comparable
  * (struct pf_state_key_cmp sized) head of the key and initialize both
  * per-index state lists.  'size', 'arg' and 'flags' are not used.
  * Always returns 0.
  */
 static int
 pf_state_key_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct pf_state_key *key = mem;
 
 	bzero(key, sizeof(struct pf_state_key_cmp));
 	TAILQ_INIT(&key->states[PF_SK_WIRE]);
 	TAILQ_INIT(&key->states[PF_SK_STACK]);
 
 	return (0);
 }
 
 /*
  * Allocate a state key (without sleeping) and fill it from the packet
  * descriptor: protocol and address family come from 'pd', the source
  * address/port go into slot pd->sidx and the destination address/port
  * into slot pd->didx.  Returns NULL if the allocation fails.
  */
 struct pf_state_key *
 pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
 	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
 {
 	struct pf_state_key *key;
 
 	key = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 	if (key != NULL) {
 		key->af = pd->af;
 		key->proto = pd->proto;
 
 		PF_ACPY(&key->addr[pd->sidx], saddr, pd->af);
 		key->port[pd->sidx] = sport;
 
 		PF_ACPY(&key->addr[pd->didx], daddr, pd->af);
 		key->port[pd->didx] = dport;
 	}
 
 	return (key);
 }
 
 /*
  * Allocate a new state key (without sleeping) and copy the comparable
  * part (struct pf_state_key_cmp) of 'orig' into it.
  * Returns NULL if the allocation fails.
  */
 struct pf_state_key *
 pf_state_key_clone(struct pf_state_key *orig)
 {
 	struct pf_state_key *copy;
 
 	copy = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 	if (copy != NULL)
 		bcopy(orig, copy, sizeof(struct pf_state_key_cmp));
 
 	return (copy);
 }
 
 /*
  * Insert a freshly created state into the key hash and the ID hash.
  *
  * kif/orig_kif are recorded in the state, a state ID and creator ID are
  * assigned if both are still zero, the keys are attached via
  * pf_state_key_attach() and finally the state is linked into its ID hash
  * row with an initial reference count of 2.
  *
  * Returns 0 with the state's ID hash row locked, or an error (EEXIST on a
  * key or ID collision) with no locks held.
  */
 int
 pf_state_insert(struct pfi_kkif *kif, struct pfi_kkif *orig_kif,
     struct pf_state_key *skw, struct pf_state_key *sks, struct pf_kstate *s)
 {
 	struct pf_idhash *ih;
 	struct pf_kstate *cur;
 	int error;
 
 	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
 	    ("%s: sks not pristine", __func__));
 	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
 	    ("%s: skw not pristine", __func__));
 	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
 
 	s->kif = kif;
 	s->orig_kif = orig_kif;
 
 	/* No ID supplied by the caller: allocate one, stored big-endian. */
 	if (s->id == 0 && s->creatorid == 0) {
 		s->id = alloc_unr64(&V_pf_stateid);
 		s->id = htobe64(s->id);
 		s->creatorid = V_pf_status.hostid;
 	}
 
 	/* Returns with ID locked on success. */
 	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
 		return (error);
 
 	ih = &V_pf_idhash[PF_IDHASH(s)];
 	PF_HASHROW_ASSERT(ih);
 	/* Make sure the (id, creatorid) pair is not already in the row. */
 	LIST_FOREACH(cur, &ih->states, entry)
 		if (cur->id == s->id && cur->creatorid == s->creatorid)
 			break;
 
 	if (cur != NULL) {
 		PF_HASHROW_UNLOCK(ih);
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: state ID collision: "
 			    "id: %016llx creatorid: %08x\n",
 			    (unsigned long long)be64toh(s->id),
 			    ntohl(s->creatorid));
 		}
 		/* Undo the key attachment done above. */
 		pf_detach_state(s);
 		return (EEXIST);
 	}
 	LIST_INSERT_HEAD(&ih->states, s, entry);
 	/* One for keys, one for ID hash. */
 	refcount_init(&s->refs, 2);
 
 	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
 	/* Let pfsync know about the new state if its hook is installed. */
 	if (V_pfsync_insert_state_ptr != NULL)
 		V_pfsync_insert_state_ptr(s);
 
 	/* Returns locked. */
 	return (0);
 }
 
 /*
  * Look a state up by its (id, creatorid) pair.
  *
  * On a hit the state is returned and its ID hash row is left locked; on a
  * miss the row is unlocked and NULL is returned.  Every call bumps the
  * state search counter.
  */
 struct pf_kstate *
 pf_find_state_byid(uint64_t id, uint32_t creatorid)
 {
 	struct pf_idhash *row;
 	struct pf_kstate *st;
 
 	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
 
 	row = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))];
 
 	PF_HASHROW_LOCK(row);
 	LIST_FOREACH(st, &row->states, entry) {
 		if (st->id == id && st->creatorid == creatorid)
 			return (st);	/* row stays locked */
 	}
 	PF_HASHROW_UNLOCK(row);
 
 	return (NULL);
 }
 
 /*
  * Find state by key.
  *
  * The key hash row is searched for a key equal to 'key'; the wire list is
  * used for PF_IN and the stack list otherwise.  The first state that is
  * either floating (bound to V_pfi_all) or bound to 'kif' is picked.
  * Returns with ID hash slot locked on success, NULL otherwise.  A state
  * whose timeout is >= PFTM_MAX is treated as not found.
  */
 static struct pf_kstate *
 pf_find_state(struct pfi_kkif *kif, struct pf_state_key_cmp *key, u_int dir)
 {
 	struct pf_keyhash	*row;
 	struct pf_state_key	*match;
 	struct pf_kstate	*st;
 	int			 list;
 
 	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
 
 	row = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
 
 	PF_HASHROW_LOCK(row);
 	LIST_FOREACH(match, &row->keys, entry) {
 		if (bcmp(match, key, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
 	}
 	if (match == NULL) {
 		PF_HASHROW_UNLOCK(row);
 		return (NULL);
 	}
 
 	if (dir == PF_IN)
 		list = PF_SK_WIRE;
 	else
 		list = PF_SK_STACK;
 
 	/* List is sorted, if-bound states before floating ones. */
 	TAILQ_FOREACH(st, &match->states[list], key_list[list]) {
 		if (st->kif != V_pfi_all && st->kif != kif)
 			continue;
 
 		PF_STATE_LOCK(st);
 		PF_HASHROW_UNLOCK(row);
 		if (__predict_false(st->timeout >= PFTM_MAX)) {
 			/*
 			 * State is either being processed by
 			 * pf_unlink_state() in an other thread, or
 			 * is scheduled for immediate expiry.
 			 */
 			PF_STATE_UNLOCK(st);
 			return (NULL);
 		}
 		return (st);
 	}
 	PF_HASHROW_UNLOCK(row);
 
 	return (NULL);
 }
 
 /*
  * Find a state by key regardless of the interface it is bound to.
  *
  * 'dir' selects the list to search: wire for PF_IN, stack for PF_OUT and
  * wire followed by stack for PF_INOUT; any other value panics.
  *
  * If 'more' is NULL the first state found is returned right away.
  * Otherwise the first state found is returned after the whole list (both
  * lists for PF_INOUT) has been walked, and *more is incremented once for
  * every further state encountered; *more is not reset here.
  *
  * Returns with ID hash slot locked on success.
  */
 struct pf_kstate *
 pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
 {
 	struct pf_keyhash	*kh;
 	struct pf_state_key	*sk;
 	struct pf_kstate	*s, *ret = NULL;
 	int			 idx, inout = 0;
 
 	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
 
 	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
 
 	PF_HASHROW_LOCK(kh);
 	LIST_FOREACH(sk, &kh->keys, entry)
 		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
 	if (sk == NULL) {
 		PF_HASHROW_UNLOCK(kh);
 		return (NULL);
 	}
 	switch (dir) {
 	case PF_IN:
 		idx = PF_SK_WIRE;
 		break;
 	case PF_OUT:
 		idx = PF_SK_STACK;
 		break;
 	case PF_INOUT:
 		/* Wire list first; 'inout' requests a second pass below. */
 		idx = PF_SK_WIRE;
 		inout = 1;
 		break;
 	default:
 		panic("%s: dir %u", __func__, dir);
 	}
 second_run:
 	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
 		if (more == NULL) {
 			/* Caller does not want a count: first hit wins. */
 			PF_STATE_LOCK(s);
 			PF_HASHROW_UNLOCK(kh);
 			return (s);
 		}
 
 		if (ret)
 			(*more)++;
 		else {
 			/* Remember (and lock) only the first state found. */
 			ret = s;
 			PF_STATE_LOCK(s);
 		}
 	}
 	if (inout == 1) {
 		inout = 0;
 		idx = PF_SK_STACK;
 		goto second_run;
 	}
 	PF_HASHROW_UNLOCK(kh);
 
 	return (ret);
 }
 
 /*
  * Report whether at least one state matches 'key' in direction 'dir'.
  *
  * FIXME
  * This routine is inefficient -- the state is locked by the lookup only to
  * be unlocked again right here.
  * It is also racy -- once the state is unlocked nothing stops other
  * threads from removing it, so the answer may be stale on return.
  */
 bool
 pf_find_state_all_exists(struct pf_state_key_cmp *key, u_int dir)
 {
 	struct pf_kstate *found;
 
 	found = pf_find_state_all(key, dir, NULL);
 	if (found == NULL)
 		return (false);
 
 	PF_STATE_UNLOCK(found);
 	return (true);
 }
 
 /* END state table stuff */
 
 /*
  * Append 'pfse' to the send queue under the send queue lock and schedule
  * the pf software interrupt.
  */
 static void
 pf_send(struct pf_send_entry *pfse)
 {
 	PF_SENDQ_LOCK();
 	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
 	PF_SENDQ_UNLOCK();
 
 	swi_sched(V_pf_swi_cookie, 0);
 }
 
 /*
  * Tell whether the packet in 'm' is addressed to this host.
  *
  * AF_INET:  the IPv4 destination is checked with in_localip().
  * AF_INET6: the IPv6 destination must belong to an interface address that
  *           does not have any of the IN6_IFF_NOTREADY flags set.
  * Any other address family panics.
  */
 static bool
 pf_isforlocal(struct mbuf *m, int af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		struct ip *hdr = mtod(m, struct ip *);
 
 		return (in_localip(hdr->ip_dst));
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct ip6_hdr *hdr6 = mtod(m, struct ip6_hdr *);
 		struct in6_ifaddr *ifa6;
 
 		ifa6 = in6ifa_ifwithaddr(&hdr6->ip6_dst, 0 /* XXX */, false);
 		if (ifa6 == NULL)
 			return (false);
 		return ((ifa6->ia6_flags & IN6_IFF_NOTREADY) == 0);
 	}
 #endif
 	default:
 		panic("Unsupported af %d", af);
 	}
 
 	return (false);
 }
 
 void
 pf_intr(void *v)
 {
 	struct epoch_tracker et;
 	struct pf_send_head queue;
 	struct pf_send_entry *pfse, *next;
 
 	CURVNET_SET((struct vnet *)v);
 
 	PF_SENDQ_LOCK();
 	queue = V_pf_sendqueue;
 	STAILQ_INIT(&V_pf_sendqueue);
 	PF_SENDQ_UNLOCK();
 
 	NET_EPOCH_ENTER(et);
 
 	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
 		switch (pfse->pfse_type) {
 #ifdef INET
 		case PFSE_IP: {
 			if (pf_isforlocal(pfse->pfse_m, AF_INET)) {
 				pfse->pfse_m->m_flags |= M_SKIP_FIREWALL;
 				pfse->pfse_m->m_pkthdr.csum_flags |=
 				    CSUM_IP_VALID | CSUM_IP_CHECKED;
 				ip_input(pfse->pfse_m);
 			} else {
 				ip_output(pfse->pfse_m, NULL, NULL, 0, NULL,
 				    NULL);
 			}
 			break;
 		}
 		case PFSE_ICMP:
 			icmp_error(pfse->pfse_m, pfse->icmpopts.type,
 			    pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case PFSE_IP6:
 			if (pf_isforlocal(pfse->pfse_m, AF_INET6)) {
 				pfse->pfse_m->m_flags |= M_SKIP_FIREWALL;
 				ip6_input(pfse->pfse_m);
 			} else {
 				ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL,
 				    NULL, NULL);
 			}
 			break;
 		case PFSE_ICMP6:
 			icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
 			    pfse->icmpopts.code, pfse->icmpopts.mtu);
 			break;
 #endif /* INET6 */
 		default:
 			panic("%s: unknown type", __func__);
 		}
 		free(pfse, M_PFTEMP);
 	}
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 /* Sleep timeout of the purge thread between two runs: hz / 10. */
 #define	pf_purge_thread_period	(hz / 10)
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 /*
  * Run pf_counter_u64_periodic() on every fcounters[] entry of the pf
  * status, but only on iterations that are a multiple of
  * pf_purge_thread_period * 10 * 60.  Rules read lock must be held.
  */
 static void
 pf_status_counter_u64_periodic(void)
 {
 	PF_RULES_RASSERT();
 
 	if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 60)) == 0) {
 		for (int idx = 0; idx < FCNT_MAX; idx++)
 			pf_counter_u64_periodic(&V_pf_status.fcounters[idx]);
 	}
 }
 
 /*
  * Run pf_counter_u64_periodic() on the packet and byte counters of a
  * batch of kifs.
  *
  * Work is only done on iterations that are a multiple of
  * pf_purge_thread_period * 10 * 300.  A marker entry in the all-kifs list
  * records where the previous batch stopped; each batch handles a tenth of
  * the kifs, but at least 5, and the marker is moved back to the list head
  * once the end of the list is reached.  Rules read lock must be held.
  */
 static void
 pf_kif_counter_u64_periodic(void)
 {
 	struct pfi_kkif *kif;
 	size_t done, batch;
 
 	PF_RULES_RASSERT();
 
 	if (__predict_false(V_pf_allkifcount == 0))
 		return;
 
 	if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0)
 		return;
 
 	batch = V_pf_allkifcount / 10;
 	batch = (batch < 5) ? 5 : batch;
 
 	for (done = 0; done < batch; done++) {
 		kif = LIST_NEXT(V_pf_kifmarker, pfik_allkiflist);
 
 		/* The marker moves in either case, so unlink it up front. */
 		LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist);
 		if (kif == NULL) {
 			/* End of list: restart from the head next time. */
 			LIST_INSERT_HEAD(&V_pf_allkiflist, V_pf_kifmarker, pfik_allkiflist);
 			break;
 		}
 		LIST_INSERT_AFTER(kif, V_pf_kifmarker, pfik_allkiflist);
 
 		for (int a = 0; a < 2; a++) {
 			for (int b = 0; b < 2; b++) {
 				for (int c = 0; c < 2; c++) {
 					pf_counter_u64_periodic(&kif->pfik_packets[a][b][c]);
 					pf_counter_u64_periodic(&kif->pfik_bytes[a][b][c]);
 				}
 			}
 		}
 	}
 }
 
 /*
  * Run pf_counter_u64_periodic() on the evaluation, packet and byte
  * counters of a batch of rules.
  *
  * Work is only done on iterations that are a multiple of
  * pf_purge_thread_period * 10 * 300.  A marker entry in the all-rules
  * list records where the previous batch stopped; each batch handles a
  * tenth of the rules, but at least 5, and the marker is moved back to the
  * list head once the end of the list is reached.  Rules read lock must be
  * held.
  */
 static void
 pf_rule_counter_u64_periodic(void)
 {
 	struct pf_krule *rule;
 	size_t done, batch;
 
 	PF_RULES_RASSERT();
 
 	if (__predict_false(V_pf_allrulecount == 0))
 		return;
 
 	if ((V_pf_counter_periodic_iter % (pf_purge_thread_period * 10 * 300)) != 0)
 		return;
 
 	batch = V_pf_allrulecount / 10;
 	batch = (batch < 5) ? 5 : batch;
 
 	for (done = 0; done < batch; done++) {
 		rule = LIST_NEXT(V_pf_rulemarker, allrulelist);
 
 		/* The marker moves in either case, so unlink it up front. */
 		LIST_REMOVE(V_pf_rulemarker, allrulelist);
 		if (rule == NULL) {
 			/* End of list: restart from the head next time. */
 			LIST_INSERT_HEAD(&V_pf_allrulelist, V_pf_rulemarker, allrulelist);
 			break;
 		}
 		LIST_INSERT_AFTER(rule, V_pf_rulemarker, allrulelist);
 
 		pf_counter_u64_periodic(&rule->evaluations);
 		for (int d = 0; d < 2; d++) {
 			pf_counter_u64_periodic(&rule->packets[d]);
 			pf_counter_u64_periodic(&rule->bytes[d]);
 		}
 	}
 }
 
 /*
  * Advance the periodic iteration counter and run the status, kif and rule
  * counter passes.  The passes run with the rules read lock held and
  * between pf_counter_u64_critical_enter()/exit().
  */
 static void
 pf_counter_u64_periodic_main(void)
 {
 	PF_RULES_RLOCK_TRACKER;
 
 	/* Each pass uses this counter to decide whether it is due. */
 	V_pf_counter_periodic_iter++;
 
 	PF_RULES_RLOCK();
 	pf_counter_u64_critical_enter();
 	pf_status_counter_u64_periodic();
 	pf_kif_counter_u64_periodic();
 	pf_rule_counter_u64_periodic();
 	pf_counter_u64_critical_exit();
 	PF_RULES_RUNLOCK();
 }
 #else
 #define	pf_counter_u64_periodic_main()	do { } while (0)
 #endif
 
 /*
  * Body of the pf purge kernel process.
  *
  * Until pf_end_threads becomes non-zero, the thread sleeps for
  * pf_purge_thread_period on pf_end_lock and then, for every active vnet,
  * runs the periodic counter pass and purges a slice of the state table.
  * Each time the state scan wraps around to row 0, fragments, source
  * nodes, unlinked rules and kifs are purged as well.  On exit
  * pf_end_threads is incremented before the process terminates.
  */
 void
 pf_purge_thread(void *unused __unused)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	sx_xlock(&pf_end_lock);
 	while (pf_end_threads == 0) {
 		sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", pf_purge_thread_period);
 
 		VNET_LIST_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 
 			/* Wait until V_pf_default_rule is initialized. */
 			if (V_pf_vnet_active == 0) {
 				CURVNET_RESTORE();
 				continue;
 			}
 
 			pf_counter_u64_periodic_main();
 
 			/*
 			 *  Process 1/interval fraction of the state
 			 * table every run.
 			 */
 			V_pf_purge_idx =
 			    pf_purge_expired_states(V_pf_purge_idx, pf_hashmask /
 			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
 
 			/*
 			 * Purge other expired types every
 			 * PFTM_INTERVAL seconds.
 			 */
 			if (V_pf_purge_idx == 0) {
 				/*
 				 * Order is important:
 				 * - states and src nodes reference rules
 				 * - states and rules reference kifs
 				 */
 				pf_purge_expired_fragments();
 				pf_purge_expired_src_nodes();
 				pf_purge_unlinked_rules();
 				pfi_kkif_purge();
 			}
 			CURVNET_RESTORE();
 		}
 		VNET_LIST_RUNLOCK();
 	}
 
 	/* NOTE(review): presumably signals the unloader that we are gone. */
 	pf_end_threads++;
 	sx_xunlock(&pf_end_lock);
 	kproc_exit(0);
 }
 
 /*
  * Purge all states, fragments, source nodes, unlinked rules and kifs of
  * the current vnet.  The rule/kif purge is run both before and after the
  * full state purge, see below.
  */
 void
 pf_unload_vnet_purge(void)
 {
 
 	/*
 	 * To clean up all kifs and rules we need
 	 * two runs: the first one clears the reference flags,
 	 * then pf_purge_expired_states() doesn't
 	 * raise them, and then the second run frees.
 	 */
 	pf_purge_unlinked_rules();
 	pfi_kkif_purge();
 
 	/*
 	 * Now purge everything.
 	 */
 	pf_purge_expired_states(0, pf_hashmask);
 	pf_purge_fragments(UINT_MAX);
 	pf_purge_expired_src_nodes();
 
 	/*
 	 * Now all kifs & rules should be unreferenced,
 	 * thus should be successfully freed.
 	 */
 	pf_purge_unlinked_rules();
 	pfi_kkif_purge();
 }
 
 /*
  * Compute the uptime at which 'state' expires.
  *
  * A PFTM_PURGE state expires right now.  Otherwise the timeout comes from
  * the state's rule, falling back to the default rule when the rule has
  * none.  If adaptive timeouts apply (end is set, start < end and the
  * state count is above start), the timeout is scaled down linearly
  * between start and end, and at or above end the state expires right now.
  * The adaptive limits and state count are the rule's own when the rule
  * defines a start value and is not the default rule; otherwise the
  * default rule's limits and the global state count are used.
  */
 u_int32_t
 pf_state_expires(const struct pf_kstate *state)
 {
 	u_int32_t	timeout;
 	u_int32_t	start;
 	u_int32_t	end;
 	u_int32_t	states;
 
 	/* handle all PFTM_* > PFTM_MAX here */
 	if (state->timeout == PFTM_PURGE)
 		return (time_uptime);
 	KASSERT(state->timeout != PFTM_UNLINKED,
 	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
 	KASSERT((state->timeout < PFTM_MAX),
 	    ("pf_state_expires: timeout > PFTM_MAX"));
 
 	timeout = state->rule.ptr->timeout[state->timeout];
 	if (timeout == 0)
 		timeout = V_pf_default_rule.timeout[state->timeout];
 
 	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
 	if (start != 0 && state->rule.ptr != &V_pf_default_rule) {
 		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
 		states = counter_u64_fetch(state->rule.ptr->states_cur);
 	} else {
 		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
 		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
 		states = V_pf_status.states;
 	}
 
 	/* Adaptive scaling does not apply: plain timeout. */
 	if (end == 0 || states <= start || start >= end)
 		return (state->expire + timeout);
 
 	/* At or beyond the upper limit: expire immediately. */
 	if (states >= end)
 		return (time_uptime);
 
 	timeout = (u_int64_t)timeout * (end - states) / (end - start);
 	return (state->expire + timeout);
 }
 
 void
 pf_purge_expired_src_nodes(void)
 {
 	struct pf_ksrc_node_list	 freelist;
 	struct pf_srchash	*sh;
 	struct pf_ksrc_node	*cur, *next;
 	int i;
 
 	LIST_INIT(&freelist);
 	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
 	    PF_HASHROW_LOCK(sh);
 	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
 		if (cur->states == 0 && cur->expire <= time_uptime) {
 			pf_unlink_src_node(cur);
 			LIST_INSERT_HEAD(&freelist, cur, entry);
 		} else if (cur->rule.ptr != NULL)
 			cur->rule.ptr->rule_ref |= PFRULE_REFS;
 	    PF_HASHROW_UNLOCK(sh);
 	}
 
 	pf_free_src_nodes(&freelist);
 
 	V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
 }
 
 /*
  * Drop the references state 's' holds on its source node and, if it is a
  * different one, on its NAT source node.
  *
  * Under the respective source hash row lock the node's state count is
  * decremented (and, for the plain source node, the connection count too
  * when s->src.tcp_est is set).  A node whose state count drops to zero
  * gets its expiry set to now plus the PFTM_SRC_NODE timeout of the
  * state's rule, or of the default rule if the rule has none.  Both node
  * pointers of the state are cleared afterwards.
  */
 static void
 pf_src_tree_remove_state(struct pf_kstate *s)
 {
 	struct pf_ksrc_node *node;
 	struct pf_srchash *row;
 	uint32_t timeout;
 
 	timeout = s->rule.ptr->timeout[PFTM_SRC_NODE];
 	if (timeout == 0)
 		timeout = V_pf_default_rule.timeout[PFTM_SRC_NODE];
 
 	node = s->src_node;
 	if (node != NULL) {
 		row = &V_pf_srchash[pf_hashsrc(&node->addr, node->af)];
 		PF_HASHROW_LOCK(row);
 		if (s->src.tcp_est)
 			--node->conn;
 		if (--node->states == 0)
 			node->expire = time_uptime + timeout;
 		PF_HASHROW_UNLOCK(row);
 	}
 
 	node = s->nat_src_node;
 	if (node != NULL && node != s->src_node) {
 		row = &V_pf_srchash[pf_hashsrc(&node->addr, node->af)];
 		PF_HASHROW_LOCK(row);
 		if (--node->states == 0)
 			node->expire = time_uptime + timeout;
 		PF_HASHROW_UNLOCK(row);
 	}
 
 	s->src_node = s->nat_src_node = NULL;
 }
 
 /*
  * Unlink and potentially free a state. The function must be
  * called with the ID hash row locked (this is asserted), but always
  * returns unlocked, since it needs to go through key hash locking.
  *
  * Returns 0 if the state was already being unlinked, otherwise the
  * result of pf_release_staten() for the two initial references.
  */
 int
 pf_unlink_state(struct pf_kstate *s)
 {
 	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
 
 	PF_HASHROW_ASSERT(ih);
 
 	if (s->timeout == PFTM_UNLINKED) {
 		/*
 		 * State is being processed
 		 * by pf_unlink_state() in
 		 * an other thread.
 		 */
 		PF_HASHROW_UNLOCK(ih);
 		return (0);	/* XXXGL: undefined actually */
 	}
 
 	/* Send an RST|ACK if the source side is in PF_TCPS_PROXY_DST. */
 	if (s->src.state == PF_TCPS_PROXY_DST) {
 		/* XXX wire key the right one? */
 		pf_send_tcp(s->rule.ptr, s->key[PF_SK_WIRE]->af,
 		    &s->key[PF_SK_WIRE]->addr[1],
 		    &s->key[PF_SK_WIRE]->addr[0],
 		    s->key[PF_SK_WIRE]->port[1],
 		    s->key[PF_SK_WIRE]->port[0],
 		    s->src.seqhi, s->src.seqlo + 1,
 		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag);
 	}
 
 	/* Take the state out of the ID hash and off its source nodes. */
 	LIST_REMOVE(s, entry);
 	pf_src_tree_remove_state(s);
 
 	if (V_pfsync_delete_state_ptr != NULL)
 		V_pfsync_delete_state_ptr(s);
 
 	STATE_DEC_COUNTERS(s);
 
 	/* From here on other threads see the state as being unlinked. */
 	s->timeout = PFTM_UNLINKED;
 
 	/* Ensure we remove it from the list of halfopen states, if needed. */
 	if (s->key[PF_SK_STACK] != NULL &&
 	    s->key[PF_SK_STACK]->proto == IPPROTO_TCP)
 		pf_set_protostate(s, PF_PEER_BOTH, TCPS_CLOSED);
 
 	PF_HASHROW_UNLOCK(ih);
 
 	/* Key detach takes the key hash row locks on its own. */
 	pf_detach_state(s);
 	/* pf_state_insert() initialises refs to 2 */
 	return (pf_release_staten(s, 2));
 }
 
 /*
  * Allocate a zero-filled state from the state zone.  'flags' is passed on
  * to uma_zalloc() with M_ZERO added; the result of uma_zalloc() is
  * returned unchanged.
  */
 struct pf_kstate *
 pf_alloc_state(int flags)
 {
 	struct pf_kstate *st;
 
 	st = uma_zalloc(V_pf_state_z, flags | M_ZERO);
 
 	return (st);
 }
 
 /*
  * Free an unlinked state that has no references left: run the TCP
  * normalization cleanup for it, return it to the state zone and account
  * for the removal in the pf status counters.
  */
 void
 pf_free_state(struct pf_kstate *cur)
 {
 	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
 	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
 	    cur->timeout));
 
 	pf_normalize_tcp_cleanup(cur);
 	uma_zfree(V_pf_state_z, cur);
 
 	pf_counter_u64_add(&V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
 }
 
 /*
  * Scan up to 'maxcheck' ID hash rows starting at row 'i' and unlink every
  * state that has expired.  States that stay get the reference flags of
  * their rules and kifs raised, which the rule and kif purge code uses as
  * its mark phase.
  *
  * Returns the row to continue with on the next call, or 0 once the end of
  * the hash has been reached.
  *
  * Called only from pf_purge_thread(), thus serialized.
  */
 static u_int
 pf_purge_expired_states(u_int i, int maxcheck)
 {
 	struct pf_idhash *ih;
 	struct pf_kstate *s;
 
 	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
 
 	/*
 	 * Go through hash and unlink states that expire now.
 	 */
 	while (maxcheck > 0) {
 		ih = &V_pf_idhash[i];
 
 		/* only take the lock if we expect to do work */
 		if (!LIST_EMPTY(&ih->states)) {
 relock:
 			PF_HASHROW_LOCK(ih);
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (pf_state_expires(s) <= time_uptime) {
 					/*
 					 * pf_unlink_state() drops the row
 					 * lock, so the row is relocked and
 					 * rescanned from its head.
 					 */
 					V_pf_status.states -=
 					    pf_unlink_state(s);
 					goto relock;
 				}
 				/* Still alive: mark what it references. */
 				s->rule.ptr->rule_ref |= PFRULE_REFS;
 				if (s->nat_rule.ptr != NULL)
 					s->nat_rule.ptr->rule_ref |= PFRULE_REFS;
 				if (s->anchor.ptr != NULL)
 					s->anchor.ptr->rule_ref |= PFRULE_REFS;
 				s->kif->pfik_flags |= PFI_IFLAG_REFS;
 				if (s->rt_kif)
 					s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 
 		/* Return when we hit end of hash. */
 		if (++i > pf_hashmask) {
 			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
 			return (0);
 		}
 
 		maxcheck--;
 	}
 
 	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
 
 	return (i);
 }
 
 static void
 pf_purge_unlinked_rules(void)
 {
 	struct pf_krulequeue tmpq;
 	struct pf_krule *r, *r1;
 
 	/*
 	 * If we have overloading task pending, then we'd
 	 * better skip purging this time. There is a tiny
 	 * probability that overloading task references
 	 * an already unlinked rule.
 	 */
 	PF_OVERLOADQ_LOCK();
 	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
 		PF_OVERLOADQ_UNLOCK();
 		return;
 	}
 	PF_OVERLOADQ_UNLOCK();
 
 	/*
 	 * Do naive mark-and-sweep garbage collecting of old rules.
 	 * Reference flag is raised by pf_purge_expired_states()
 	 * and pf_purge_expired_src_nodes().
 	 *
 	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
 	 * use a temporary queue.
 	 */
 	TAILQ_INIT(&tmpq);
 	PF_UNLNKDRULES_LOCK();
 	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
 		if (!(r->rule_ref & PFRULE_REFS)) {
 			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
 			TAILQ_INSERT_TAIL(&tmpq, r, entries);
 		} else
 			r->rule_ref &= ~PFRULE_REFS;
 	}
 	PF_UNLNKDRULES_UNLOCK();
 
 	if (!TAILQ_EMPTY(&tmpq)) {
 		PF_CONFIG_LOCK();
 		PF_RULES_WLOCK();
 		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
 			TAILQ_REMOVE(&tmpq, r, entries);
 			pf_free_rule(r);
 		}
 		PF_RULES_WUNLOCK();
 		PF_CONFIG_UNLOCK();
 	}
 }
 
 /*
  * Print an address and, if non-zero, its port with printf().
  *
  * IPv4 is printed as dotted quad followed by ":port".  IPv6 is printed as
  * colon separated hex groups with the longest run of at least two zero
  * groups collapsed to "::", followed by "[port]".  'p' is expected in
  * network byte order.  Other address families print nothing.
  */
 void
 pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		u_int32_t host = ntohl(addr->addr32[0]);
 
 		printf("%u.%u.%u.%u", (host>>24)&255, (host>>16)&255,
 		    (host>>8)&255, host&255);
 		if (p) {
 			p = ntohs(p);
 			printf(":%u", p);
 		}
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		u_int16_t group;
 		u_int8_t i, run_lo, run_hi, best_lo, best_hi;
 
 		/* 255 means "no run"; find the longest run of zero groups. */
 		run_lo = run_hi = best_lo = best_hi = 255;
 		for (i = 0; i < 8; i++) {
 			if (!addr->addr16[i]) {
 				if (run_lo == 255)
 					run_lo = i;
 				run_hi = i;
 				continue;
 			}
 			if ((run_hi - run_lo) >
 			    (best_hi - best_lo)) {
 				best_lo = run_lo;
 				best_hi = run_hi;
 			}
 			run_lo = run_hi = 255;
 		}
 		/* A run reaching the last group is still pending here. */
 		if ((run_hi - run_lo) >
 		    (best_hi - best_lo)) {
 			best_lo = run_lo;
 			best_hi = run_hi;
 		}
 
 		for (i = 0; i < 8; i++) {
 			if (i >= best_lo && i <= best_hi) {
 				/* Inside the collapsed run. */
 				if (i == 0)
 					printf(":");
 				if (i == best_hi)
 					printf(":");
 				continue;
 			}
 			group = ntohs(addr->addr16[i]);
 			printf("%x", group);
 			if (i < 7)
 				printf(":");
 		}
 		if (p) {
 			p = ntohs(p);
 			printf("[%u]", p);
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 }
 
 /*
  * Print state 's' using the keys attached to the state itself.
  */
 void
 pf_print_state(struct pf_kstate *s)
 {
 
 	pf_print_state_parts(s, NULL, NULL);
 }
 
 /*
  * Print a one-line description of a state with printf(): protocol,
  * direction, wire and stack key hosts and, when 's' is given, the TCP
  * sequence tracking details and the src:dst peer states.
  *
  * 'skwp' and 'sksp' override the wire and stack key taken from 's'; any
  * of the three arguments may be NULL and the parts that cannot be
  * determined are left out.
  */
 static void
 pf_print_state_parts(struct pf_kstate *s,
     struct pf_state_key *skwp, struct pf_state_key *sksp)
 {
 	struct pf_state_key *wire, *stack;
 	u_int8_t proto = 0, dir = 0;
 
 	/* Explicit keys win, otherwise fall back to the state's own keys. */
 	wire = skwp;
 	if (wire == NULL && s != NULL)
 		wire = s->key[PF_SK_WIRE];
 	stack = sksp;
 	if (stack == NULL && s != NULL)
 		stack = s->key[PF_SK_STACK];
 
 	if (wire != NULL)
 		proto = wire->proto;
 	else if (stack != NULL)
 		proto = stack->proto;
 	if (s != NULL)
 		dir = s->direction;
 
 	switch (proto) {
 	case IPPROTO_IPV4:
 		printf("IPv4");
 		break;
 	case IPPROTO_IPV6:
 		printf("IPv6");
 		break;
 	case IPPROTO_TCP:
 		printf("TCP");
 		break;
 	case IPPROTO_UDP:
 		printf("UDP");
 		break;
 	case IPPROTO_ICMP:
 		printf("ICMP");
 		break;
 	case IPPROTO_ICMPV6:
 		printf("ICMPv6");
 		break;
 	default:
 		printf("%u", proto);
 		break;
 	}
 
 	if (dir == PF_IN)
 		printf(" in");
 	else if (dir == PF_OUT)
 		printf(" out");
 
 	if (wire != NULL) {
 		printf(" wire: ");
 		pf_print_host(&wire->addr[0], wire->port[0], wire->af);
 		printf(" ");
 		pf_print_host(&wire->addr[1], wire->port[1], wire->af);
 	}
 	if (stack != NULL) {
 		printf(" stack: ");
 		if (stack == wire)
 			printf("-");
 		else {
 			pf_print_host(&stack->addr[0], stack->port[0],
 			    stack->af);
 			printf(" ");
 			pf_print_host(&stack->addr[1], stack->port[1],
 			    stack->af);
 		}
 	}
 
 	if (s == NULL)
 		return;
 
 	if (proto == IPPROTO_TCP) {
 		printf(" [lo=%u high=%u win=%u modulator=%u",
 		    s->src.seqlo, s->src.seqhi,
 		    s->src.max_win, s->src.seqdiff);
 		if (s->src.wscale && s->dst.wscale)
 			printf(" wscale=%u",
 			    s->src.wscale & PF_WSCALE_MASK);
 		printf("]");
 		printf(" [lo=%u high=%u win=%u modulator=%u",
 		    s->dst.seqlo, s->dst.seqhi,
 		    s->dst.max_win, s->dst.seqdiff);
 		if (s->src.wscale && s->dst.wscale)
 			printf(" wscale=%u",
 			    s->dst.wscale & PF_WSCALE_MASK);
 		printf("]");
 	}
 	printf(" %u:%u", s->src.state, s->dst.state);
 }
 
 /*
  * Print the TCP flags set in 'f' as a space followed by one letter per
  * flag, in the order F S R P A U E W.  Nothing is printed when no bit of
  * 'f' is set.
  */
 void
 pf_print_flags(u_int8_t f)
 {
 	if (f == 0)
 		return;
 
 	printf(" ");
 	if (f & TH_FIN) {
 		printf("F");
 	}
 	if (f & TH_SYN) {
 		printf("S");
 	}
 	if (f & TH_RST) {
 		printf("R");
 	}
 	if (f & TH_PUSH) {
 		printf("P");
 	}
 	if (f & TH_ACK) {
 		printf("A");
 	}
 	if (f & TH_URG) {
 		printf("U");
 	}
 	if (f & TH_ECE) {
 		printf("E");
 	}
 	if (f & TH_CWR) {
 		printf("W");
 	}
 }
 
 /*
  * Helper for pf_calc_skip_steps(): make skip[i] of every rule from
  * head[i] up to, but not including, 'cur' point at 'cur', leaving
  * head[i] == cur.  Uses the 'head' and 'cur' locals of the function it is
  * expanded in.
  */
 #define	PF_SET_SKIP_STEPS(i)					\
 	do {							\
 		while (head[i] != cur) {			\
 			head[i]->skip[i].ptr = cur;		\
 			head[i] = TAILQ_NEXT(head[i], entries);	\
 		}						\
 	} while (0)
 
 /*
  * Compute the skip pointers of every rule in 'rules'.
  *
  * For each of the PF_SKIP_COUNT criteria (interface, direction, address
  * family, protocol, source/destination address and port), skip[criterion]
  * of a rule ends up pointing at the next rule in the queue whose value
  * for that criterion differs from its predecessor's, or NULL if there is
  * none.
  */
 void
 pf_calc_skip_steps(struct pf_krulequeue *rules)
 {
 	struct pf_krule *cur, *prev, *head[PF_SKIP_COUNT];
 	int i;
 
 	cur = TAILQ_FIRST(rules);
 	prev = cur;
 	/* head[i]: first rule of the current run of equal values. */
 	for (i = 0; i < PF_SKIP_COUNT; ++i)
 		head[i] = cur;
 	while (cur != NULL) {
 		/* A change against the previous rule ends the run. */
 		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
 			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
 		if (cur->direction != prev->direction)
 			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
 		if (cur->af != prev->af)
 			PF_SET_SKIP_STEPS(PF_SKIP_AF);
 		if (cur->proto != prev->proto)
 			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
 		if (cur->src.neg != prev->src.neg ||
 		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
 			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
 		if (cur->src.port[0] != prev->src.port[0] ||
 		    cur->src.port[1] != prev->src.port[1] ||
 		    cur->src.port_op != prev->src.port_op)
 			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
 		if (cur->dst.neg != prev->dst.neg ||
 		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
 			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
 		if (cur->dst.port[0] != prev->dst.port[0] ||
 		    cur->dst.port[1] != prev->dst.port[1] ||
 		    cur->dst.port_op != prev->dst.port_op)
 			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
 
 		prev = cur;
 		cur = TAILQ_NEXT(cur, entries);
 	}
 	/* cur is NULL here: close the trailing run of every criterion. */
 	for (i = 0; i < PF_SKIP_COUNT; ++i)
 		PF_SET_SKIP_STEPS(i);
 }
 
 /*
  * Tell whether two address wrappers differ.  Used by pf_calc_skip_steps()
  * to decide where a run of rules with the same address ends.
  *
  * Returns 1 (non-zero) when the wrappers differ or the type is not one of
  * the handled ones, and 0 when they are considered equal.
  */
 static int
 pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
 {
 	int	differ;
 
 	/* Different kinds of wrapper can never be equal. */
 	if (aw1->type != aw2->type)
 		return (1);
 
 	switch (aw1->type) {
 	case PF_ADDR_ADDRMASK:
 	case PF_ADDR_RANGE:
 		/* Both the address and the mask words must agree. */
 		differ = PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6) ||
 		    PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6);
 		break;
 	case PF_ADDR_DYNIFTL:
 		differ = (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
 		break;
 	case PF_ADDR_NOROUTE:
 	case PF_ADDR_URPFFAILED:
 		/* No payload to compare: equal types means equal wrappers. */
 		differ = 0;
 		break;
 	case PF_ADDR_TABLE:
 		differ = (aw1->p.tbl != aw2->p.tbl);
 		break;
 	default:
 		printf("invalid address type: %d\n", aw1->type);
 		differ = 1;
 		break;
 	}
 
 	return (differ);
 }
 
 /**
  * Incrementally update a 16-bit checksum after one 16-bit word covered by
  * it changed from `old` to `new`, and return the updated checksum.
  *
  * Background on the callers: the checksum field of a TCP/UDP header is not
  * always a full checksum.  In some cases (i.e. output) it is a
  * pseudo-header checksum, a partial checksum over the src/dst IP addresses,
  * protocol number and length.  That leaves the following cases:
  *  * Input or forwarding: there is no TSO, the checksum fields are full
  *    checksums and must be updated whenever anything changes.
  *  * Output (i.e. the checksum is a pseudo-header checksum):
  *    x The field being updated is a src/dst address or affects the length
  *      of the packet: the pseudo-header checksum must be updated (note
  *      that this checksum is not ones' complement).
  *    x Some other field is being modified (e.g. src/dst port numbers):
  *      nothing has to be updated.
  *
  * When `udp` is non-zero a checksum of 0 is returned untouched, and a
  * computed result of 0 is replaced by 0xffff.
  **/
 u_int16_t
 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
 {
 	u_int32_t acc;
 
 	/* UDP with a zero checksum: leave the field exactly as it is. */
 	if (udp && cksum == 0x0000)
 		return cksum;
 
 	/* Add the old word, remove the new one, then fold the carry once. */
 	acc = cksum + old - new;
 	acc = (acc + (acc >> 16)) & 0xffff;
 
 	/* For UDP a result of zero is never produced; use 0xffff instead. */
 	if (udp && acc == 0x0000)
 		acc = 0xffff;
 
 	return (u_int16_t)(acc);
 }
 
 /*
  * Overwrite the byte at `f` with `v` and, unless the mbuf has a delayed
  * (offloaded) data checksum pending, fix `*cksum` up accordingly.
  *
  * `hi` selects which half of the 16-bit checksum word the byte occupies:
  * when set the byte value is shifted into the upper eight bits before it
  * is handed to pf_cksum_fixup().  `udp` is passed through unchanged.
  * Nothing is modified when the byte already has the requested value.
  */
 static void
 pf_patch_8(struct mbuf *m, u_int16_t *cksum, u_int8_t *f, u_int8_t v, bool hi,
     u_int8_t udp)
 {
 	const u_int8_t	prev = *f;
 	u_int16_t	oword, nword;
 
 	if (prev == v)
 		return;
 
 	*f = v;
 
 	/* The checksum will be computed later; do not touch it here. */
 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
 		return;
 
 	if (hi) {
 		oword = htons(prev << 8);
 		nword = htons(v << 8);
 	} else {
 		oword = htons(prev);
 		nword = htons(v);
 	}
 
 	*cksum = pf_cksum_fixup(*cksum, oword, nword, udp);
 }
 
 /*
  * Store the 16-bit value `v` at `f` one byte at a time (so `f` needs no
  * particular alignment), updating `*cksum` through pf_patch_8().  `hi`
  * describes the position of the first byte within its checksum word; the
  * second byte gets the opposite setting.
  */
 void
 pf_patch_16_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int16_t v,
     bool hi, u_int8_t udp)
 {
 	u_int8_t	*dst = (u_int8_t *)f;
 	u_int8_t	*src = (u_int8_t *)&v;
 
 	pf_patch_8(m, cksum, &dst[0], src[0], hi, udp);
 	pf_patch_8(m, cksum, &dst[1], src[1], !hi, udp);
 }
 
 /*
  * Store the 32-bit value `v` at `f` one byte at a time (so `f` needs no
  * particular alignment), updating `*cksum` through pf_patch_8().  `hi`
  * describes the position of the first byte within its checksum word; the
  * setting alternates for each following byte.
  */
 void
 pf_patch_32_unaligned(struct mbuf *m, u_int16_t *cksum, void *f, u_int32_t v,
     bool hi, u_int8_t udp)
 {
 	u_int8_t	*dst = (u_int8_t *)f;
 	u_int8_t	*src = (u_int8_t *)&v;
 	size_t		 k;
 
 	for (k = 0; k < sizeof(v); k++) {
 		/* Even bytes use `hi` as given, odd bytes its negation. */
 		pf_patch_8(m, cksum, &dst[k], src[k],
 		    (k & 1) != 0 ? !hi : hi, udp);
 	}
 }
 
 /*
  * Variant of pf_cksum_fixup() for protocol (TCP/UDP) checksums: when the
  * mbuf is flagged for delayed data checksumming the checksum is returned
  * unchanged, otherwise the regular incremental fixup is applied.
  */
 u_int16_t
 pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
         u_int16_t new, u_int8_t udp)
 {
 	const int	delayed = (m->m_pkthdr.csum_flags &
 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)) != 0;
 
 	if (!delayed)
 		cksum = pf_cksum_fixup(cksum, old, new, udp);
 
 	return (cksum);
 }
 
 /*
  * Rewrite an address/port pair in a packet and keep the checksums in sync.
  *
  *   a, an   address in the packet and the value replacing it
  *   p, pn   port in the packet and the value replacing it
  *   ic      IP header checksum (only updated for AF_INET)
  *   pc      protocol checksum
  *   u       "udp" flag handed to the checksum fixup routines
  *
  * When the mbuf is flagged for delayed data checksumming, *pc is
  * complemented before the address fixups and complemented back afterwards
  * (a resulting 0 is stored as 0xffff); the port change is then left out of
  * the checksum by pf_proto_cksum_fixup().
  */
 static void
 pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic,
         u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u,
         sa_family_t af)
 {
 	struct pf_addr	old_addr;
 	u_int16_t	old_port = *p;
 	u_int16_t	sum;
 	int		w;
 
 	PF_ACPY(&old_addr, a, af);
 	PF_ACPY(a, an, af);
 
 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
 		*pc = ~*pc;
 
 	*p = pn;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		/* The IPv4 header checksum covers the address ... */
 		sum = *ic;
 		for (w = 0; w < 2; w++)
 			sum = pf_cksum_fixup(sum, old_addr.addr16[w],
 			    an->addr16[w], 0);
 		*ic = sum;
 
 		/* ... and so does the protocol checksum. */
 		sum = *pc;
 		for (w = 0; w < 2; w++)
 			sum = pf_cksum_fixup(sum, old_addr.addr16[w],
 			    an->addr16[w], u);
 
 		*pc = pf_proto_cksum_fixup(m, sum, old_port, pn, u);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* Only the protocol checksum is adjusted for IPv6 here. */
 		sum = *pc;
 		for (w = 0; w < 8; w++)
 			sum = pf_cksum_fixup(sum, old_addr.addr16[w],
 			    an->addr16[w], u);
 
 		*pc = pf_proto_cksum_fixup(m, sum, old_port, pn, u);
 		break;
 #endif /* INET6 */
 	}
 
 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA |
 	    CSUM_DELAY_DATA_IPV6)) {
 		*pc = ~*pc;
 		if (*pc == 0)
 			*pc = 0xffff;
 	}
 }
 
 /*
  * Replace the 32-bit value stored at `a` with `an` and fix up checksum *c.
  * `a` is a void pointer and is accessed with memcpy(), so it carries no
  * alignment requirement.  The checksum is adjusted in two steps, one for
  * the upper and one for the lower 16 bits of the value as loaded into a
  * u_int32_t; `u` is passed through to pf_cksum_fixup().
  */
 void
 pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
 {
 	u_int32_t	prev;
 	u_int16_t	sum;
 
 	memcpy(&prev, a, sizeof(prev));
 	memcpy(a, &an, sizeof(an));
 
 	sum = pf_cksum_fixup(*c, prev >> 16, an >> 16, u);
 	*c = pf_cksum_fixup(sum, prev & 0xffff, an & 0xffff, u);
 }
 
 /*
  * Replace the 32-bit value stored at `a` with `an` and fix up the protocol
  * checksum *c through pf_proto_cksum_fixup(), which leaves the checksum
  * alone when the mbuf is flagged for delayed data checksumming.  `a` is
  * accessed with memcpy() and therefore needs no particular alignment.
  */
 void
 pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
 {
 	u_int32_t	prev;
 	u_int16_t	sum;
 
 	memcpy(&prev, a, sizeof(prev));
 	memcpy(a, &an, sizeof(an));
 
 	/* Upper 16 bits first, then the lower 16 bits. */
 	sum = pf_proto_cksum_fixup(m, *c, prev >> 16, an >> 16, udp);
 	*c = pf_proto_cksum_fixup(m, sum, prev & 0xffff, an & 0xffff, udp);
 }
 
 #ifdef INET6
 /*
  * Replace the IPv6 address at `a` with `an` and fix up checksum *c for
  * each of the eight 16-bit words of the address, in ascending order.
  * `u` is passed through to pf_cksum_fixup().
  */
 static void
 pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
 {
 	struct pf_addr	prev;
 	u_int16_t	sum;
 	int		w;
 
 	PF_ACPY(&prev, a, AF_INET6);
 	PF_ACPY(a, an, AF_INET6);
 
 	sum = *c;
 	for (w = 0; w < 8; w++)
 		sum = pf_cksum_fixup(sum, prev.addr16[w], an->addr16[w], u);
 	*c = sum;
 }
 #endif /* INET6 */
 
 /*
  * Rewrite the addresses (and optionally a port) carried by an ICMP error
  * message and keep every affected checksum consistent.
  *
  *   ia   address inside the embedded (inner) packet, replaced by *na
  *   ip   port inside the embedded packet, replaced by np; may be NULL
  *   oa   address in the outer header, replaced by *na; may be NULL
  *   na   new address
  *   np   new port (only used when ip != NULL)
  *   pc   checksum of the embedded protocol header; may be NULL
  *   h2c  checksum of the embedded IP header (read and written for AF_INET)
  *   ic   ICMP checksum
  *   hc   checksum of the outer IP header (written for AF_INET when oa set)
  *   u    "udp" flag handed to pf_cksum_fixup()
  *
  * The ICMP checksum covers the embedded packet, so every change made to
  * the embedded headers -- including changes to their own checksum fields
  * -- is also folded into *ic.
  */
 static void
 pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
     struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
     u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
 {
 	struct pf_addr	oia, ooa;
 
 	/* Remember the old values; they are needed for the fixups below. */
 	PF_ACPY(&oia, ia, af);
 	if (oa)
 		PF_ACPY(&ooa, oa, af);
 
 	/* Change inner protocol port, fix inner protocol checksum. */
 	if (ip != NULL) {
 		u_int16_t	oip = *ip;
 		u_int32_t	opc;
 
 		/* opc is only assigned, and only read, when pc != NULL. */
 		if (pc != NULL)
 			opc = *pc;
 		*ip = np;
 		if (pc != NULL)
 			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
 		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
 		/* The inner checksum field itself changed: account for it. */
 		if (pc != NULL)
 			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
 	}
 	/* Change inner ip address, fix inner ip and icmp checksums. */
 	PF_ACPY(ia, na, af);
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		u_int32_t	 oh2c = *h2c;
 
 		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
 		    oia.addr16[0], ia->addr16[0], 0),
 		    oia.addr16[1], ia->addr16[1], 0);
 		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
 		    oia.addr16[0], ia->addr16[0], 0),
 		    oia.addr16[1], ia->addr16[1], 0);
 		/* The inner IP checksum field changed as well. */
 		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* Only *ic is adjusted here; *h2c is not touched for IPv6. */
 		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 		    pf_cksum_fixup(pf_cksum_fixup(*ic,
 		    oia.addr16[0], ia->addr16[0], u),
 		    oia.addr16[1], ia->addr16[1], u),
 		    oia.addr16[2], ia->addr16[2], u),
 		    oia.addr16[3], ia->addr16[3], u),
 		    oia.addr16[4], ia->addr16[4], u),
 		    oia.addr16[5], ia->addr16[5], u),
 		    oia.addr16[6], ia->addr16[6], u),
 		    oia.addr16[7], ia->addr16[7], u);
 		break;
 #endif /* INET6 */
 	}
 	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
 	if (oa) {
 		PF_ACPY(oa, na, af);
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
 			    ooa.addr16[0], oa->addr16[0], 0),
 			    ooa.addr16[1], oa->addr16[1], 0);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			/* For IPv6 the outer address change is folded into *ic. */
 			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 			    pf_cksum_fixup(pf_cksum_fixup(*ic,
 			    ooa.addr16[0], oa->addr16[0], u),
 			    ooa.addr16[1], oa->addr16[1], u),
 			    ooa.addr16[2], oa->addr16[2], u),
 			    ooa.addr16[3], oa->addr16[3], u),
 			    ooa.addr16[4], oa->addr16[4], u),
 			    ooa.addr16[5], oa->addr16[5], u),
 			    ooa.addr16[6], oa->addr16[6], u),
 			    ooa.addr16[7], oa->addr16[7], u);
 			break;
 #endif /* INET6 */
 		}
 	}
 }
 
 /*
  * Need to modulate the sequence numbers in the TCP SACK option
  * (credits to Krzysztof Pfaff for report and patch)
  *
  * The TCP options following the header at offset `off` are copied into a
  * local buffer.  For every SACK option found, dst->seqdiff is subtracted
  * from the start and end of each SACK block and th->th_sum is patched
  * through pf_patch_32_unaligned().  If at least one SACK option of
  * TCPOLEN_SACKLEN bytes or more was seen, the option area is written back
  * to the mbuf with m_copyback().
  *
  * Returns 1 when the options were copied back, 0 otherwise (including when
  * the option area is too short or cannot be pulled from the mbuf).
  */
 static int
 pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
     struct tcphdr *th, struct pf_state_peer *dst)
 {
 	/* hlen: option bytes still to parse; thoptlen: total option bytes. */
 	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
 	/* NOTE(review): assumes hlen never exceeds TCP_MAXOLEN -- confirm. */
 	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
 	int copyback = 0, i, olen;
 	struct sackblk sack;
 
 	/* Smallest SACK option worth looking at: two header bytes + one block. */
 #define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
 	if (hlen < TCPOLEN_SACKLEN ||
 	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
 		return 0;
 
 	while (hlen >= TCPOLEN_SACKLEN) {
 		/* Offset of this option within the option area. */
 		size_t startoff = opt - opts;
 		/* Read up front, but ignored for the single-byte options. */
 		olen = opt[1];
 		switch (*opt) {
 		case TCPOPT_EOL:	/* FALLTHROUGH */
 		case TCPOPT_NOP:
 			/* Single-byte options: step over them. */
 			opt++;
 			hlen--;
 			break;
 		case TCPOPT_SACK:
 			/* Never trust a length that runs past the options. */
 			if (olen > hlen)
 				olen = hlen;
 			if (olen >= TCPOLEN_SACKLEN) {
 				/* Blocks start after the kind and length bytes. */
 				for (i = 2; i + TCPOLEN_SACK <= olen;
 				    i += TCPOLEN_SACK) {
 					/* Work on an aligned copy of the block. */
 					memcpy(&sack, &opt[i], sizeof(sack));
 					pf_patch_32_unaligned(m,
 					    &th->th_sum, &sack.start,
 					    htonl(ntohl(sack.start) - dst->seqdiff),
 					    PF_ALGNMNT(startoff),
 					    0);
 					pf_patch_32_unaligned(m, &th->th_sum,
 					    &sack.end,
 					    htonl(ntohl(sack.end) - dst->seqdiff),
 					    PF_ALGNMNT(startoff),
 					    0);
 					memcpy(&opt[i], &sack, sizeof(sack));
 				}
 				copyback = 1;
 			}
 			/* FALLTHROUGH */
 		default:
 			/* Always advance by at least two bytes to make progress. */
 			if (olen < 2)
 				olen = 2;
 			hlen -= olen;
 			opt += olen;
 		}
 	}
 
 	if (copyback)
 		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
 	return (copyback);
 }
 
 /*
  * Build a stand-alone TCP segment (IP or IPv6 header plus TCP header, and
  * a maximum segment size option when `mss` is non-zero) in a freshly
  * allocated mbuf.
  *
  *   r            rule supplying the routing table and (with ALTQ) queue id;
  *                may be NULL
  *   af           AF_INET or AF_INET6; anything else panics
  *   saddr/daddr  source and destination address
  *   sport/dport  stored into the header without conversion
  *   seq/ack/win  converted with htonl()/htons() before being stored
  *   flags        TCP flags
  *   mss          value for the MSS option, 0 for no option
  *   ttl          IPv4 TTL, 0 selects V_ip_defttl
  *   tag          when non-zero the mbuf is marked M_SKIP_FIREWALL
  *   rtag         stored as the pf mbuf tag value
  *
  * Returns the mbuf, or NULL when the mbuf or its pf tag cannot be
  * allocated.
  */
 struct mbuf *
 pf_build_tcp(const struct pf_krule *r, sa_family_t af,
     const struct pf_addr *saddr, const struct pf_addr *daddr,
     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
     u_int16_t rtag)
 {
 	struct mbuf	*m;
 	int		 len, tlen;
 #ifdef INET
 	struct ip	*h = NULL;
 #endif /* INET */
 #ifdef INET6
 	struct ip6_hdr	*h6 = NULL;
 #endif /* INET6 */
 	struct tcphdr	*th;
 	char		*opt;
 	struct pf_mtag  *pf_mtag;
 
 	len = 0;
 	th = NULL;
 
 	/* maximum segment size tcp option */
 	tlen = sizeof(struct tcphdr);
 	if (mss)
 		tlen += 4;
 
 	/* len: whole packet, tlen: TCP header including options. */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		len = sizeof(struct ip) + tlen;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		len = sizeof(struct ip6_hdr) + tlen;
 		break;
 #endif /* INET6 */
 	default:
 		panic("%s: unsupported af %d", __func__, af);
 	}
 
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 #ifdef MAC
 	mac_netinet_firewall_send(m);
 #endif
 	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
 		m_freem(m);
 		return (NULL);
 	}
 	if (tag)
 		m->m_flags |= M_SKIP_FIREWALL;
 	pf_mtag->tag = rtag;
 
 	if (r != NULL && r->rtableid >= 0)
 		M_SETFIB(m, r->rtableid);
 
 #ifdef ALTQ
 	if (r != NULL && r->qid) {
 		pf_mtag->qid = r->qid;
 
 		/* add hints for ecn */
 		/*
 		 * NOTE(review): this pointer is taken before m_data is
 		 * advanced by max_linkhdr below, so it does not point at the
 		 * IP header written later -- confirm whether intended.
 		 */
 		pf_mtag->hdr = mtod(m, struct ip *);
 	}
 #endif /* ALTQ */
 	/* Leave room in front of the packet for a link-layer header. */
 	m->m_data += max_linkhdr;
 	m->m_pkthdr.len = m->m_len = len;
 	/* The rest of the stack assumes a rcvif, so provide one.
 	 * This is a locally generated packet, so .. close enough. */
 	m->m_pkthdr.rcvif = V_loif;
 	bzero(m->m_data, len);
 	/*
 	 * First pass over the IP header: only the fields that take part in
 	 * the TCP checksum are filled in, everything else is still zero.
 	 */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		h = mtod(m, struct ip *);
 
 		/* IP header fields included in the TCP checksum */
 		h->ip_p = IPPROTO_TCP;
 		h->ip_len = htons(tlen);
 		h->ip_src.s_addr = saddr->v4.s_addr;
 		h->ip_dst.s_addr = daddr->v4.s_addr;
 
 		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		h6 = mtod(m, struct ip6_hdr *);
 
 		/* IP header fields included in the TCP checksum */
 		h6->ip6_nxt = IPPROTO_TCP;
 		h6->ip6_plen = htons(tlen);
 		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
 		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
 
 		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
 		break;
 #endif /* INET6 */
 	}
 
 	/* TCP header */
 	th->th_sport = sport;
 	th->th_dport = dport;
 	th->th_seq = htonl(seq);
 	th->th_ack = htonl(ack);
 	th->th_off = tlen >> 2;
 	th->th_flags = flags;
 	th->th_win = htons(win);
 
 	if (mss) {
 		/* Option bytes: kind, length (4), then the 16-bit MSS value. */
 		opt = (char *)(th + 1);
 		opt[0] = TCPOPT_MAXSEG;
 		opt[1] = 4;
 		HTONS(mss);
 		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
 	}
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		/* TCP checksum */
 		/*
 		 * Computed over the whole mbuf while the IP header holds
 		 * only the fields set above.
 		 */
 		th->th_sum = in_cksum(m, len);
 
 		/* Finish the IP header */
 		h->ip_v = 4;
 		h->ip_hl = sizeof(*h) >> 2;
 		h->ip_tos = IPTOS_LOWDELAY;
 		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
 		h->ip_len = htons(len);
 		h->ip_ttl = ttl ? ttl : V_ip_defttl;
 		/* Left at zero here; presumably filled in on output -- verify. */
 		h->ip_sum = 0;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* TCP checksum */
 		th->th_sum = in6_cksum(m, IPPROTO_TCP,
 		    sizeof(struct ip6_hdr), tlen);
 
 		h6->ip6_vfc |= IPV6_VERSION;
 		h6->ip6_hlim = IPV6_DEFHLIM;
 		break;
 #endif /* INET6 */
 	}
 
 	return (m);
 }
 
 /*
  * Build a TCP segment with pf_build_tcp() and hand it to pf_send() wrapped
  * in a send-queue entry.  All arguments are forwarded to pf_build_tcp()
  * unchanged.  The segment is silently dropped when either the mbuf or the
  * queue entry cannot be allocated.
  */
 void
 pf_send_tcp(const struct pf_krule *r, sa_family_t af,
     const struct pf_addr *saddr, const struct pf_addr *daddr,
     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
     u_int16_t rtag)
 {
 	struct pf_send_entry	*entry;
 	struct mbuf		*pkt;
 
 	pkt = pf_build_tcp(r, af, saddr, daddr, sport, dport, seq, ack, flags,
 	    win, mss, ttl, tag, rtag);
 	if (pkt == NULL)
 		return;
 
 	/* Allocate the outgoing queue entry; give the packet up on failure. */
 	entry = malloc(sizeof(*entry), M_PFTEMP, M_NOWAIT);
 	if (entry == NULL) {
 		m_freem(pkt);
 		return;
 	}
 
 	entry->pfse_m = pkt;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		entry->pfse_type = PFSE_IP;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		entry->pfse_type = PFSE_IP6;
 		break;
 #endif /* INET6 */
 	}
 
 	pf_send(entry);
 }
 
 /*
  * Answer a packet matched by a "return" style rule `r`.
  *
  * If a translation rule `nr` was applied, the packet's addresses, ports
  * and checksums are first restored from the state key `sk` and the saved
  * checksums bproto_sum/bip_sum, and the header is copied back into the
  * mbuf.  Then:
  *  - for TCP packets without RST, when the rule has PFRULE_RETURNRST or
  *    PFRULE_RETURN set, the protocol checksum is verified and a RST|ACK is
  *    sent back (on checksum failure only *reason is set);
  *  - otherwise an ICMP / ICMPv6 error is sent when the rule carries a
  *    return_icmp / return_icmp6 value for the packet's address family.
  */
 static void
 pf_return(struct pf_krule *r, struct pf_krule *nr, struct pf_pdesc *pd,
     struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th,
     struct pfi_kkif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen,
     u_short *reason)
 {
 	sa_family_t	 af = pd->af;
 
 	/* undo NAT changes, if they have taken place */
 	if (nr != NULL) {
 		PF_ACPY(pd->src, &sk->addr[pd->sidx], af);
 		PF_ACPY(pd->dst, &sk->addr[pd->didx], af);
 		if (pd->sport != NULL)
 			*pd->sport = sk->port[pd->sidx];
 		if (pd->dport != NULL)
 			*pd->dport = sk->port[pd->didx];
 		if (pd->proto_sum != NULL)
 			*pd->proto_sum = bproto_sum;
 		if (pd->ip_sum != NULL)
 			*pd->ip_sum = bip_sum;
 		m_copyback(m, off, hdrlen, pd->hdr.any);
 	}
 
 	if (pd->proto == IPPROTO_TCP &&
 	    (r->rule_flag & (PFRULE_RETURNRST | PFRULE_RETURN)) != 0 &&
 	    (th->th_flags & TH_RST) == 0) {
 		u_int32_t	 ack;
 		int		 len = 0;
 
 		/* Length of the TCP segment, derived from the IP header. */
 		switch (af) {
 #ifdef INET
 		case AF_INET: {
 			struct ip	*h4 = mtod(m, struct ip *);
 
 			len = ntohs(h4->ip_len) - off;
 			break;
 		}
 #endif
 #ifdef INET6
 		case AF_INET6: {
 			struct ip6_hdr	*h6 = mtod(m, struct ip6_hdr *);
 
 			len = ntohs(h6->ip6_plen) - (off - sizeof(*h6));
 			break;
 		}
 #endif
 		}
 
 		if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) {
 			REASON_SET(reason, PFRES_PROTCKSUM);
 		} else {
 			/* Acknowledge the payload plus SYN and FIN, if set. */
 			ack = ntohl(th->th_seq) + pd->p_len;
 			if (th->th_flags & TH_SYN)
 				ack++;
 			if (th->th_flags & TH_FIN)
 				ack++;
 			pf_send_tcp(r, af, pd->dst,
 				pd->src, th->th_dport, th->th_sport,
 				ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
 				r->return_ttl, 1, 0);
 		}
 	} else if (pd->proto != IPPROTO_ICMP && af == AF_INET &&
 		r->return_icmp) {
 		/* Upper byte is the ICMP type, lower byte the code. */
 		pf_send_icmp(m, r->return_icmp >> 8,
 			r->return_icmp & 255, af, r);
 	} else if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 &&
 		r->return_icmp6) {
 		pf_send_icmp(m, r->return_icmp6 >> 8,
 			r->return_icmp6 & 255, af, r);
 	}
 }
 
 /*
  * Compare the 802.1Q priority recorded on mbuf `m` (in its
  * MTAG_8021Q_PCP_IN tag) with `prio`.  PF_PRIO_ZERO stands for priority 0.
  * Returns 1 on a match, 0 when the priorities differ or the mbuf carries
  * no such tag.
  */
 static int
 pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
 {
 	struct m_tag	*pcp_tag;
 	u_int8_t	 wanted, found;
 
 	pcp_tag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
 	if (pcp_tag == NULL)
 		return (0);
 
 	wanted = (prio == PF_PRIO_ZERO) ? 0 : prio;
 
 	/* The priority byte is stored directly behind the tag header. */
 	found = *(uint8_t *)(pcp_tag + 1);
 
 	return (found == wanted);
 }
 
 /*
  * Map an ICMP message type to the BANDLIM_* class used for rate limiting.
  * Echo and timestamp messages (requests and replies) have their own
  * classes; every other type, ICMP_UNREACH included, is accounted as
  * BANDLIM_ICMP_UNREACH.
  */
 static int
 pf_icmp_to_bandlim(uint8_t type)
 {
 	if (type == ICMP_ECHO || type == ICMP_ECHOREPLY)
 		return (BANDLIM_ICMP_ECHO);
 
 	if (type == ICMP_TSTAMP || type == ICMP_TSTAMPREPLY)
 		return (BANDLIM_ICMP_TSTAMP);
 
 	return (BANDLIM_ICMP_UNREACH);
 }
 
 /*
  * Queue an ICMP (AF_INET) or ICMPv6 (AF_INET6) error of the given type and
  * code in response to packet `m`.
  *
  * The request is subject to the ICMP rate limiters and is silently dropped
  * when they say so or when an allocation fails.  The original mbuf `m` is
  * never consumed: a copy made with m_copypacket() is attached to the
  * send-queue entry handed to pf_send().  The copy is marked
  * M_SKIP_FIREWALL and inherits the routing table (and, with ALTQ, the
  * queue id) of rule `r`.
  */
 static void
 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
     struct pf_krule *r)
 {
 	struct pf_send_entry *pfse;
 	struct mbuf *m0;
 	struct pf_mtag *pf_mtag;
 
 	/* ICMP packet rate limitation. */
 #ifdef INET6
 	if (af == AF_INET6) {
 		if (icmp6_ratelimit(NULL, type, code))
 			return;
 	}
 #endif
 #ifdef INET
 	if (af == AF_INET) {
 		if (badport_bandlim(pf_icmp_to_bandlim(type)) != 0)
 			return;
 	}
 #endif
 
 	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
 	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
 	if (pfse == NULL)
 		return;
 
 	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
 		free(pfse, M_PFTEMP);
 		return;
 	}
 
 	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
 		/* The copy is ours: release it too, not just the entry. */
 		m_freem(m0);
 		free(pfse, M_PFTEMP);
 		return;
 	}
 	/* XXX: revisit */
 	m0->m_flags |= M_SKIP_FIREWALL;
 
 	if (r->rtableid >= 0)
 		M_SETFIB(m0, r->rtableid);
 
 #ifdef ALTQ
 	if (r->qid) {
 		pf_mtag->qid = r->qid;
 		/* add hints for ecn */
 		pf_mtag->hdr = mtod(m0, struct ip *);
 	}
 #endif /* ALTQ */
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		pfse->pfse_type = PFSE_ICMP;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		pfse->pfse_type = PFSE_ICMP6;
 		break;
 #endif /* INET6 */
 	}
 	pfse->pfse_m = m0;
 	pfse->icmpopts.type = type;
 	pfse->icmpopts.code = code;
 	pf_send(pfse);
 }
 
 /*
  * Compare addresses a and b under mask m.
  *
  * With n == 0 the result is 1 when the masked addresses are equal and 0
  * otherwise; with n != 0 the sense is inverted.  An address family that is
  * not handled counts as "not equal".
  */
 int
 pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
     struct pf_addr *b, sa_family_t af)
 {
 	int	equal = 0;
 #ifdef INET6
 	int	w;
 #endif
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		equal = ((a->addr32[0] & m->addr32[0]) ==
 		    (b->addr32[0] & m->addr32[0]));
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* Equal only if all four masked 32-bit words agree. */
 		equal = 1;
 		for (w = 0; w < 4; w++) {
 			if ((a->addr32[w] & m->addr32[w]) !=
 			    (b->addr32[w] & m->addr32[w])) {
 				equal = 0;
 				break;
 			}
 		}
 		break;
 #endif /* INET6 */
 	}
 
 	if (n)
 		return (equal ? 0 : 1);
 	return (equal ? 1 : 0);
 }
 
 /*
  * Range test on addresses: return 1 if b <= a <= e, otherwise 0.
  * Addresses are compared as numbers in host byte order, most significant
  * 32-bit word first.  An address family that is not handled yields 1.
  */
 int
 pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
     struct pf_addr *a, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		const u_int32_t	val = ntohl(a->addr32[0]);
 
 		if (val < ntohl(b->addr32[0]) || val > ntohl(e->addr32[0]))
 			return (0);
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		int	w;
 
 		/* Lower bound: the first differing word decides a vs. b. */
 		for (w = 0; w < 4; ++w) {
 			const u_int32_t	av = ntohl(a->addr32[w]);
 			const u_int32_t	bv = ntohl(b->addr32[w]);
 
 			if (av > bv)
 				break;
 			if (av < bv)
 				return (0);
 		}
 		/* Upper bound: the first differing word decides a vs. e. */
 		for (w = 0; w < 4; ++w) {
 			const u_int32_t	av = ntohl(a->addr32[w]);
 			const u_int32_t	ev = ntohl(e->addr32[w]);
 
 			if (av < ev)
 				break;
 			if (av > ev)
 				return (0);
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 	return (1);
 }
 
 /*
  * Evaluate the comparison `op` for value p against the operands a1 and a2.
  * The range operators use both operands (IRG: strictly inside, XRG:
  * strictly outside, RRG: inside including the bounds); all others compare
  * p with a1 only.  Returns 1 if the comparison holds, 0 if it does not or
  * if `op` is not a known operator.
  */
 static int
 pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
 {
 	int	hit = 0;
 
 	switch (op) {
 	case PF_OP_IRG:
 		hit = ((p > a1) && (p < a2));
 		break;
 	case PF_OP_XRG:
 		hit = ((p < a1) || (p > a2));
 		break;
 	case PF_OP_RRG:
 		hit = ((p >= a1) && (p <= a2));
 		break;
 	case PF_OP_EQ:
 		hit = (p == a1);
 		break;
 	case PF_OP_NE:
 		hit = (p != a1);
 		break;
 	case PF_OP_LT:
 		hit = (p < a1);
 		break;
 	case PF_OP_LE:
 		hit = (p <= a1);
 		break;
 	case PF_OP_GT:
 		hit = (p > a1);
 		break;
 	case PF_OP_GE:
 		hit = (p >= a1);
 		break;
 	default:
 		break;
 	}
 
 	return (hit);
 }
 
 /*
  * Port comparison: converts the operands a1/a2 and the port p with ntohs()
  * and evaluates `op` on the converted values through pf_match().
  */
 int
 pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
 {
 	const u_int16_t	lo = ntohs(a1);
 	const u_int16_t	hi = ntohs(a2);
 	const u_int16_t	port = ntohs(p);
 
 	return (pf_match(op, lo, hi, port));
 }
 
 /*
  * User id comparison.  When `u` is UID_MAX only the equality operators
  * (PF_OP_EQ / PF_OP_NE) are evaluated; every other operator yields 0.
  * Otherwise the result is that of pf_match().
  */
 static int
 pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
 {
 	if (u != UID_MAX || op == PF_OP_EQ || op == PF_OP_NE)
 		return (pf_match(op, a1, a2, u));
 
 	return (0);
 }
 
 /*
  * Group id comparison.  When `g` is GID_MAX only the equality operators
  * (PF_OP_EQ / PF_OP_NE) are evaluated; every other operator yields 0.
  * Otherwise the result is that of pf_match().
  */
 static int
 pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
 {
 	if (g != GID_MAX || op == PF_OP_EQ || op == PF_OP_NE)
 		return (pf_match(op, a1, a2, g));
 
 	return (0);
 }
 
 /*
  * Check rule r's tag condition.  If *tag is still -1 it is first set to
  * `mtag`.  Returns 1 when r->match_tag equals *tag, or, if the rule has
  * match_tag_not set, when it does not equal *tag; 0 otherwise.
  * The mbuf argument is not used.
  */
 int
 pf_match_tag(struct mbuf *m, struct pf_krule *r, int *tag, int mtag)
 {
 	int	same;
 
 	if (*tag == -1)
 		*tag = mtag;
 
 	same = (r->match_tag == *tag);
 
 	return (r->match_tag_not ? !same : same);
 }
 
 /*
  * Record `tag` (which must be positive) in the pf mbuf tag of packet `m`.
  * The tag structure cached in pd->pf_mtag is used when present; otherwise
  * it is obtained with pf_get_mtag() and cached.  Returns 0 on success or
  * ENOMEM when no tag structure can be obtained.
  */
 int
 pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
 {
 	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
 
 	if (pd->pf_mtag == NULL) {
 		pd->pf_mtag = pf_get_mtag(m);
 		if (pd->pf_mtag == NULL)
 			return (ENOMEM);
 	}
 
 	pd->pf_mtag->tag = tag;
 
 	return (0);
 }
 
 /* Maximum nesting depth of anchors during rule evaluation. */
 #define	PF_ANCHOR_STACKSIZE	32
 /*
  * One level of the anchor stack used by pf_step_into_anchor() and
  * pf_step_out_of_anchor().
  */
 struct pf_kanchor_stackframe {
 	struct pf_kruleset	*rs;	/* ruleset to return to */
 	struct pf_krule		*r;	/* XXX: + match bit */
 	struct pf_kanchor	*child;	/* current child of a wildcard anchor */
 };
 
 /*
  * XXX: We rely on malloc(9) returning pointer aligned addresses.
  *
  * The lowest bit of the frame's rule pointer doubles as a "matched" flag,
  * which only works while that bit is zero in every real rule address.
  */
 #define	PF_ANCHORSTACK_MATCH	0x00000001
 #define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
 
 /* Non-zero when the frame has been marked as matched. */
 #define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
 /* The frame's rule pointer with the flag bit stripped. */
 #define	PF_ANCHOR_RULE(f)	(struct pf_krule *)			\
 				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
 /* Mark the frame as matched by setting the flag bit in its rule pointer. */
 #define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
 				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
 } while (0)
 
 /*
  * Descend into the anchor referenced by rule *r.
  *
  * A frame recording the current ruleset and rule is pushed onto `stack`
  * and *depth is incremented.  *rs is switched to the anchor's ruleset (or,
  * for a wildcard anchor, to the ruleset of its first child) and *r to the
  * first active rule of list `n` in that ruleset.  For a wildcard anchor
  * without children *r becomes NULL.  When entering from depth 0 and `a` is
  * not NULL, *a is set to the anchor rule.  *match, if given, is cleared.
  *
  * If the stack is already full, a message is printed and the anchor is
  * skipped by advancing *r to the next rule.
  */
 void
 pf_step_into_anchor(struct pf_kanchor_stackframe *stack, int *depth,
     struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a,
     int *match)
 {
 	struct pf_kanchor_stackframe	*frame;
 
 	PF_RULES_RASSERT();
 
 	if (match != NULL)
 		*match = 0;
 
 	if (*depth >= PF_ANCHOR_STACKSIZE) {
 		printf("%s: anchor stack overflow on %s\n",
 		    __func__, (*r)->anchor->name);
 		*r = TAILQ_NEXT(*r, entries);
 		return;
 	}
 
 	if (*depth == 0 && a != NULL)
 		*a = *r;
 
 	/* Push a new frame remembering where to continue afterwards. */
 	frame = &stack[*depth];
 	(*depth)++;
 	frame->rs = *rs;
 	frame->r = *r;
 
 	if ((*r)->anchor_wildcard) {
 		struct pf_kanchor_node *kids = &(*r)->anchor->children;
 
 		frame->child = RB_MIN(pf_kanchor_node, kids);
 		if (frame->child == NULL) {
 			*r = NULL;
 			return;
 		}
 		*rs = &frame->child->ruleset;
 	} else {
 		frame->child = NULL;
 		*rs = &(*r)->anchor->ruleset;
 	}
 
 	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
 }
 
 /*
  * Leave the ruleset of the innermost anchor once its rules are exhausted.
  *
  * For a wildcard anchor the next child is tried first; the frame is only
  * popped when no child is left.  After popping, *rs is restored from the
  * frame and *r is set to the rule following the anchor rule.  The loop
  * keeps going while *r is NULL, so several levels may be left in one call;
  * it stops when the stack is empty.
  *
  * Returns the `quick` flag of an anchor rule popped during this call
  * whose frame was marked as matched or for which *match was set, and 0
  * when no such frame was popped.
  */
 int
 pf_step_out_of_anchor(struct pf_kanchor_stackframe *stack, int *depth,
     struct pf_kruleset **rs, int n, struct pf_krule **r, struct pf_krule **a,
     int *match)
 {
 	struct pf_kanchor_stackframe	*f;
 	struct pf_krule *fr;
 	int quick = 0;
 
 	PF_RULES_RASSERT();
 
 	do {
 		if (*depth <= 0)
 			break;
 		f = stack + *depth - 1;
 		/* Anchor rule of this frame, without the match flag bit. */
 		fr = PF_ANCHOR_RULE(f);
 		if (f->child != NULL) {
 			/*
 			 * This block traverses through
 			 * a wildcard anchor.
 			 */
 			if (match != NULL && *match) {
 				/*
 				 * If any of "*" matched, then
 				 * "foo/ *" matched, mark frame
 				 * appropriately.
 				 */
 				PF_ANCHOR_SET_MATCH(f);
 				*match = 0;
 			}
 			f->child = RB_NEXT(pf_kanchor_node,
 			    &fr->anchor->children, f->child);
 			if (f->child != NULL) {
 				*rs = &f->child->ruleset;
 				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
 				/*
 				 * An empty child: `continue` re-tests the
 				 * loop condition (*r == NULL), which moves
 				 * on to the next child.
 				 */
 				if (*r == NULL)
 					continue;
 				else
 					break;
 			}
 		}
 		/* Pop the frame and resume behind the anchor rule. */
 		(*depth)--;
 		if (*depth == 0 && a != NULL)
 			*a = NULL;
 		*rs = f->rs;
 		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
 			quick = fr->quick;
 		*r = TAILQ_NEXT(fr, entries);
 	} while (*r == NULL);
 
 	return (quick);
 }
 
 /*
  * One level of the anchor stack used by pf_step_into_keth_anchor() and
  * pf_step_out_of_keth_anchor() (Ethernet rules).
  */
 struct pf_keth_anchor_stackframe {
 	struct pf_keth_ruleset	*rs;	/* ruleset to return to */
 	struct pf_keth_rule	*r;	/* XXX: + match bit */
 	struct pf_keth_anchor	*child;	/* current child of a wildcard anchor */
 };
 
 /* Non-zero when the frame has been marked as matched. */
 #define	PF_ETH_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
 /* The frame's rule pointer with the flag bit stripped. */
 #define	PF_ETH_ANCHOR_RULE(f)	(struct pf_keth_rule *)			\
 				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
 /* Mark the frame as matched by setting the flag bit in its rule pointer. */
 #define	PF_ETH_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 		\
 				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
 } while (0)
 
 /*
  * Descend into the Ethernet anchor referenced by rule *r.
  *
  * A frame recording the current ruleset and rule is pushed onto `stack`
  * and *depth is incremented.  *rs is switched to the anchor's ruleset (or,
  * for a wildcard anchor, to the ruleset of its first child) and *r to the
  * first active rule of that ruleset.  For a wildcard anchor without
  * children *r becomes NULL.  When entering from depth 0 and `a` is not
  * NULL, *a is set to the anchor rule.  *match, if given, is cleared.
  *
  * If the stack is already full, a message is printed and the anchor is
  * skipped by advancing *r to the next rule.
  */
 void
 pf_step_into_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
     struct pf_keth_ruleset **rs, struct pf_keth_rule **r,
     struct pf_keth_rule **a, int *match)
 {
 	struct pf_keth_anchor_stackframe	*frame;
 
 	NET_EPOCH_ASSERT();
 
 	if (match != NULL)
 		*match = 0;
 
 	if (*depth >= PF_ANCHOR_STACKSIZE) {
 		printf("%s: anchor stack overflow on %s\n",
 		    __func__, (*r)->anchor->name);
 		*r = TAILQ_NEXT(*r, entries);
 		return;
 	}
 
 	if (*depth == 0 && a != NULL)
 		*a = *r;
 
 	/* Push a new frame remembering where to continue afterwards. */
 	frame = &stack[*depth];
 	(*depth)++;
 	frame->rs = *rs;
 	frame->r = *r;
 
 	if ((*r)->anchor_wildcard) {
 		struct pf_keth_anchor_node *kids = &(*r)->anchor->children;
 
 		frame->child = RB_MIN(pf_keth_anchor_node, kids);
 		if (frame->child == NULL) {
 			*r = NULL;
 			return;
 		}
 		*rs = &frame->child->ruleset;
 	} else {
 		frame->child = NULL;
 		*rs = &(*r)->anchor->ruleset;
 	}
 
 	*r = TAILQ_FIRST((*rs)->active.rules);
 }
 
 /*
  * Leave the ruleset of the innermost Ethernet anchor once its rules are
  * exhausted.
  *
  * For a wildcard anchor the next child is tried first; the frame is only
  * popped when no child is left.  After popping, *rs is restored from the
  * frame and *r is set to the rule following the anchor rule.  The loop
  * keeps going while *r is NULL, so several levels may be left in one call;
  * it stops when the stack is empty.
  *
  * Returns the `quick` flag of an anchor rule popped during this call
  * whose frame was marked as matched or for which *match was set, and 0
  * when no such frame was popped.
  */
 int
 pf_step_out_of_keth_anchor(struct pf_keth_anchor_stackframe *stack, int *depth,
     struct pf_keth_ruleset **rs, struct pf_keth_rule **r,
     struct pf_keth_rule **a, int *match)
 {
 	struct pf_keth_anchor_stackframe	*f;
 	struct pf_keth_rule *fr;
 	int quick = 0;
 
 	NET_EPOCH_ASSERT();
 
 	do {
 		if (*depth <= 0)
 			break;
 		f = stack + *depth - 1;
 		/* Anchor rule of this frame, without the match flag bit. */
 		fr = PF_ETH_ANCHOR_RULE(f);
 		if (f->child != NULL) {
 			/*
 			 * This block traverses through
 			 * a wildcard anchor.
 			 */
 			if (match != NULL && *match) {
 				/*
 				 * If any of "*" matched, then
 				 * "foo/ *" matched, mark frame
 				 * appropriately.
 				 */
 				PF_ETH_ANCHOR_SET_MATCH(f);
 				*match = 0;
 			}
 			f->child = RB_NEXT(pf_keth_anchor_node,
 			    &fr->anchor->children, f->child);
 			if (f->child != NULL) {
 				*rs = &f->child->ruleset;
 				*r = TAILQ_FIRST((*rs)->active.rules);
 				/*
 				 * An empty child: `continue` re-tests the
 				 * loop condition (*r == NULL), which moves
 				 * on to the next child.
 				 */
 				if (*r == NULL)
 					continue;
 				else
 					break;
 			}
 		}
 		/* Pop the frame and resume behind the anchor rule. */
 		(*depth)--;
 		if (*depth == 0 && a != NULL)
 			*a = NULL;
 		*rs = f->rs;
 		if (PF_ETH_ANCHOR_MATCH(f) || (match != NULL && *match))
 			quick = fr->quick;
 		*r = TAILQ_NEXT(fr, entries);
 	} while (*r == NULL);
 
 	return (quick);
 }
 
 #ifdef INET6
 /*
  * Combine a pool address with a source address under a mask: every bit of
  * *naddr is taken from raddr where the corresponding bit of rmask is set
  * and from saddr where it is clear.  One 32-bit word is processed for
  * AF_INET, four for AF_INET6; other families leave *naddr untouched.
  */
 void
 pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
     struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
 {
 	int	w, nwords;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		nwords = 1;
 		break;
 #endif /* INET */
 	case AF_INET6:
 		nwords = 4;
 		break;
 	default:
 		nwords = 0;
 		break;
 	}
 
 	for (w = 0; w < nwords; w++) {
 		naddr->addr32[w] = (raddr->addr32[w] & rmask->addr32[w]) |
 		    (~rmask->addr32[w] & saddr->addr32[w]);
 	}
 }
 
 /*
  * Increment the address in place by one.
  *
  * For AF_INET the single 32-bit word is incremented in host byte order.
  * For AF_INET6 the words are handled from the last towards the first: a
  * word equal to 0xffffffff is reset to 0 and the carry moves on to the
  * preceding word; the first word that is not 0xffffffff (or word 0 if the
  * carry gets that far) is incremented in host byte order.
  */
 void
 pf_addr_inc(struct pf_addr *addr, sa_family_t af)
 {
 	int	w;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
 		break;
 #endif /* INET */
 	case AF_INET6:
 		/* Propagate the carry through saturated trailing words. */
 		for (w = 3; w > 0 && addr->addr32[w] == 0xffffffff; w--)
 			addr->addr32[w] = 0;
 		addr->addr32[w] = htonl(ntohl(addr->addr32[w]) + 1);
 		break;
 	}
 }
 #endif /* INET6 */
 
 /*
  * Merge the queueing and dummynet settings of rule `r` into the action set
  * `a`.  Only values that are non-zero in the rule overwrite those in `a`.
  * When the rule names a dummynet pipe in either direction, the
  * PFRULE_DN_IS_PIPE bit of a->flags is set or cleared to mirror the same
  * bit in r->free_flags.
  */
 void
 pf_rule_to_actions(struct pf_krule *r, struct pf_rule_actions *a)
 {
 	if (r->qid != 0)
 		a->qid = r->qid;
 	if (r->pqid != 0)
 		a->pqid = r->pqid;
 	if (r->dnpipe != 0)
 		a->dnpipe = r->dnpipe;
 	if (r->dnrpipe != 0)
 		a->dnrpipe = r->dnrpipe;
 
 	/* Without any dummynet reference the flag is left as it is. */
 	if (r->dnpipe == 0 && r->dnrpipe == 0)
 		return;
 
 	if ((r->free_flags & PFRULE_DN_IS_PIPE) != 0)
 		a->flags |= PFRULE_DN_IS_PIPE;
 	else
 		a->flags &= ~PFRULE_DN_IS_PIPE;
 }
 
 /*
  * Find the local socket (inpcb) a TCP or UDP packet belongs to and record
  * its owner's uid and first group id in pd->lookup.
  *
  * For PF_IN the packet's source/destination are used as they are; for any
  * other direction they are swapped before the lookup.  An exact lookup is
  * tried first, then one with INPLOOKUP_WILDCARD.
  *
  * Returns 1 on success.  Returns -1 -- with pd->lookup.uid/gid left at
  * UID_MAX/GID_MAX -- when the protocol is neither TCP nor UDP, the address
  * family is not handled, or no socket is found.
  */
 int
 pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
 {
 	struct pf_addr		*saddr, *daddr;
 	u_int16_t		 sport, dport;
 	u_int16_t		 hdr_sport, hdr_dport;
 	struct inpcbinfo	*pi;
 	struct inpcb		*inp;
 
 	pd->lookup.uid = UID_MAX;
 	pd->lookup.gid = GID_MAX;
 
 	switch (pd->proto) {
 	case IPPROTO_TCP:
 		hdr_sport = pd->hdr.tcp.th_sport;
 		hdr_dport = pd->hdr.tcp.th_dport;
 		pi = &V_tcbinfo;
 		break;
 	case IPPROTO_UDP:
 		hdr_sport = pd->hdr.udp.uh_sport;
 		hdr_dport = pd->hdr.udp.uh_dport;
 		pi = &V_udbinfo;
 		break;
 	default:
 		return (-1);
 	}
 
 	if (direction == PF_IN) {
 		saddr = pd->src;
 		daddr = pd->dst;
 		sport = hdr_sport;
 		dport = hdr_dport;
 	} else {
 		/* Swap both the addresses and the ports. */
 		saddr = pd->dst;
 		daddr = pd->src;
 		sport = hdr_dport;
 		dport = hdr_sport;
 	}
 
 	switch (pd->af) {
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
 		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
 		if (inp == NULL)
 			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
 			   daddr->v4, dport, INPLOOKUP_WILDCARD |
 			   INPLOOKUP_RLOCKPCB, NULL, m);
 		if (inp == NULL)
 			return (-1);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
 		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
 		if (inp == NULL)
 			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
 			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, NULL, m);
 		if (inp == NULL)
 			return (-1);
 		break;
 #endif /* INET6 */
 
 	default:
 		return (-1);
 	}
 
 	/* The lookup returned the pcb read-locked; release it when done. */
 	INP_RLOCK_ASSERT(inp);
 	pd->lookup.uid = inp->inp_cred->cr_uid;
 	pd->lookup.gid = inp->inp_cred->cr_groups[0];
 	INP_RUNLOCK(inp);
 
 	return (1);
 }
 
 /*
  * Extract the window scale option from the TCP header located at offset
  * `off` in mbuf `m`; `th_off` is the header length in 32-bit words.
  *
  * Returns the shift count, capped at TCP_MAX_WINSHIFT and or-ed with
  * PF_WSCALE_FLAG, of the last window scale option found.  Returns 0 when
  * the header has no options, cannot be pulled from the mbuf, or contains
  * no window scale option.
  */
 u_int8_t
 pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
 {
 	u_int8_t	 buf[60];
 	u_int8_t	*cp, olen;
 	u_int8_t	 shift = 0;
 	int		 left;
 
 	left = th_off << 2;		/* assumed: left <= sizeof(buf) */
 	if (left <= sizeof(struct tcphdr))
 		return (0);
 	if (!pf_pull_hdr(m, off, buf, left, NULL, NULL, af))
 		return (0);
 
 	/* Walk the options that follow the fixed part of the header. */
 	cp = buf + sizeof(struct tcphdr);
 	left -= sizeof(struct tcphdr);
 	while (left >= 3) {
 		if (*cp == TCPOPT_EOL || *cp == TCPOPT_NOP) {
 			/* Single-byte options. */
 			++cp;
 			--left;
 			continue;
 		}
 
 		if (*cp == TCPOPT_WINDOW) {
 			shift = cp[2];
 			if (shift > TCP_MAX_WINSHIFT)
 				shift = TCP_MAX_WINSHIFT;
 			shift |= PF_WSCALE_FLAG;
 		}
 
 		/* Skip the option; never advance by less than two bytes. */
 		olen = cp[1];
 		if (olen < 2)
 			olen = 2;
 		left -= olen;
 		cp += olen;
 	}
 
 	return (shift);
 }
 
 /*
  * Extract the maximum segment size option from the TCP header located at
  * offset `off` in mbuf `m`; `th_off` is the header length in 32-bit words.
  *
  * Returns the value of the last MSS option found, converted with ntohs(),
  * or V_tcp_mssdflt when the options contain none.  Returns 0 when the
  * header has no options at all or cannot be pulled from the mbuf.
  */
 u_int16_t
 pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
 {
 	u_int8_t	 buf[60];
 	u_int8_t	*cp, olen;
 	u_int16_t	 mss = V_tcp_mssdflt;
 	int		 left;
 
 	left = th_off << 2;	/* assumed: left <= sizeof(buf) */
 	if (left <= sizeof(struct tcphdr))
 		return (0);
 	if (!pf_pull_hdr(m, off, buf, left, NULL, NULL, af))
 		return (0);
 
 	/* Walk the options that follow the fixed part of the header. */
 	cp = buf + sizeof(struct tcphdr);
 	left -= sizeof(struct tcphdr);
 	while (left >= TCPOLEN_MAXSEG) {
 		if (*cp == TCPOPT_EOL || *cp == TCPOPT_NOP) {
 			/* Single-byte options. */
 			++cp;
 			--left;
 			continue;
 		}
 
 		if (*cp == TCPOPT_MAXSEG) {
 			/* The value may be unaligned: copy it out bytewise. */
 			bcopy((caddr_t)(cp + 2), (caddr_t)&mss, 2);
 			mss = ntohs(mss);
 		}
 
 		/* Skip the option; never advance by less than two bytes. */
 		olen = cp[1];
 		if (olen < 2)
 			olen = 2;
 		left -= olen;
 		cp += olen;
 	}
 
 	return (mss);
 }
 
 /*
  * Derive an MSS for traffic towards 'addr'.
  *
  * The route for 'addr' is looked up in FIB 'rtableid'; when a next hop is
  * found its MTU minus the IP and TCP header sizes becomes the candidate.
  * The candidate is then raised to at least V_tcp_mssdflt, capped at
  * 'offer', and finally floored at 64.
  *
  * Must be called inside the network epoch.
  */
 static u_int16_t
 pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
 {
 	struct nhop_object	*nhop = NULL;
 #ifdef INET6
 	struct in6_addr		 unscoped;
 	uint32_t		 zone;
 #endif /* INET6 */
 	int			 iphlen = 0;
 	uint16_t		 result = 0;
 
 	NET_EPOCH_ASSERT();
 
 	/* Resolve the next hop; unsupported families leave nhop NULL. */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		iphlen = sizeof(struct ip);
 		nhop = fib4_lookup(rtableid, addr->v4, 0, 0, 0);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		iphlen = sizeof(struct ip6_hdr);
 		in6_splitscope(&addr->v6, &unscoped, &zone);
 		nhop = fib6_lookup(rtableid, &unscoped, zone, 0, 0);
 		break;
 #endif /* INET6 */
 	}
 
 	/* Path MTU less the network and transport headers. */
 	if (nhop != NULL)
 		result = nhop->nh_mtu - iphlen - sizeof(struct tcphdr);
 
 	result = max(V_tcp_mssdflt, result);
 	result = min(result, offer);
 	/* sanity - at least max opt space */
 	result = max(result, 64);
 
 	return (result);
 }
 
 /*
  * Produce an initial sequence number for the TCP connection described by
  * 'pd'.
  *
  * On first use a random secret is generated and an MD5 context that has
  * already absorbed it is cached.  Each call copies that context, mixes in
  * the source and destination ports and addresses, and takes the first
  * 32-bit word of the digest.  A random value below 4096 and a running
  * offset (advanced by 4096 on every call) are added to that word.
  */
 static u_int32_t
 pf_tcp_iss(struct pf_pdesc *pd)
 {
 	MD5_CTX		hash;
 	u_int32_t	md[4];
 	u_int32_t	iss;
 
 	/* One-time setup of the secret and the pre-seeded MD5 state. */
 	if (V_pf_tcp_secret_init == 0) {
 		arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
 		MD5Init(&V_pf_tcp_secret_ctx);
 		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
 		    sizeof(V_pf_tcp_secret));
 		V_pf_tcp_secret_init = 1;
 	}
 
 	/* Work on a private copy so the cached state stays reusable. */
 	hash = V_pf_tcp_secret_ctx;
 
 	MD5Update(&hash, (char *)&pd->hdr.tcp.th_sport, sizeof(u_short));
 	MD5Update(&hash, (char *)&pd->hdr.tcp.th_dport, sizeof(u_short));
 	if (pd->af != AF_INET6) {
 		MD5Update(&hash, (char *)&pd->src->v4, sizeof(struct in_addr));
 		MD5Update(&hash, (char *)&pd->dst->v4, sizeof(struct in_addr));
 	} else {
 		MD5Update(&hash, (char *)&pd->src->v6, sizeof(struct in6_addr));
 		MD5Update(&hash, (char *)&pd->dst->v6, sizeof(struct in6_addr));
 	}
 	MD5Final((u_char *)md, &hash);
 
 	V_pf_tcp_iss_off += 4096;
 #define	ISN_RANDOM_INCREMENT (4096 - 1)
 	iss = md[0];
 	iss += arc4random() & ISN_RANDOM_INCREMENT;
 	iss += V_pf_tcp_iss_off;
 #undef	ISN_RANDOM_INCREMENT
 
 	return (iss);
 }
 
 /*
  * Compare Ethernet address 'a' with the rule address 'r'.
  *
  * Only the bits selected by r->mask take part in the comparison.  The
  * result is inverted when r->neg is set.  An address that is not set in
  * the rule matches everything (again subject to r->neg).
  */
 static bool
 pf_match_eth_addr(const uint8_t *a, const struct pf_keth_rule_addr *r)
 {
 	bool	equal;
 	int	idx;
 
 	/* Always matches if not set */
 	if (!r->isset)
 		return (!r->neg);
 
 	/* Stop at the first byte whose masked bits differ. */
 	equal = true;
 	for (idx = 0; equal && idx < ETHER_ADDR_LEN; idx++)
 		equal = ((a[idx] ^ r->addr[idx]) & r->mask[idx]) == 0;
 
 	return (equal ^ r->neg);
 }
 
 /*
  * Evaluate the tag condition of Ethernet rule 'r'.
  *
  * While *tag is still -1 it is seeded from 'mtag'; the comparison is then
  * made against *tag.  Returns 1 if the rule's tag condition holds (taking
  * r->match_tag_not into account) and 0 otherwise.  'm' is not used.
  */
 static int
 pf_match_eth_tag(struct mbuf *m, struct pf_keth_rule *r, int *tag, int mtag)
 {
 	int	same;
 
 	if (*tag == -1)
 		*tag = mtag;
 
 	same = (r->match_tag == *tag);
 	if (r->match_tag_not)
 		return (!same);
 	return (same);
 }
 
 /*
  * Hand mbuf 'm' to the transmit routine of the interface behind 'kif'.
  *
  * The mbuf is freed instead when no interface is attached to 'kif' or
  * when the interface is not one of the accepted link types (Ethernet,
  * VLAN, bridge or LAGG variants).  Either way the caller no longer owns
  * 'm' on return.
  */
 static void
 pf_bridge_to(struct pfi_kkif *kif, struct mbuf *m)
 {
 	struct ifnet	*out = kif->pfik_ifp;
 
 	/* If we don't have the interface drop the packet. */
 	if (out == NULL)
 		goto drop;
 
 	if (out->if_type != IFT_ETHER &&
 	    out->if_type != IFT_XETHER &&
 	    out->if_type != IFT_L2VLAN &&
 	    out->if_type != IFT_BRIDGE &&
 	    out->if_type != IFT_IEEE8023ADLAG)
 		goto drop;
 
 	kif->pfik_ifp->if_transmit(kif->pfik_ifp, m);
 	return;
 
 drop:
 	m_freem(m);
 }
 
 /*
  * Run an Ethernet frame through the active Ethernet (layer 2) ruleset.
  *
  *   dir - direction of the frame; compared with each rule's direction and
  *         used to pick the per-direction packet/byte counters.
  *   kif - interface the frame is being processed on.
  *   m0  - in/out pointer to the frame.  It is set to NULL whenever this
  *         function (or dummynet) has taken the mbuf away from the caller.
  *
  * Rules are evaluated in order; the last matching rule wins unless a
  * matching rule has 'quick' set.  A frame that matches no rule passes.
  * For a matching non-drop rule the frame may be tagged, assigned a queue,
  * pushed through dummynet and/or redirected to another interface.
  *
  * Returns PF_PASS, PF_DROP, or the action of the matching rule.
  * Must be called inside the network epoch.
  */
 static int
 pf_test_eth_rule(int dir, struct pfi_kkif *kif, struct mbuf **m0)
 {
 #ifdef INET
 	struct ip ip;
 #endif
 #ifdef INET6
 	struct ip6_hdr ip6;
 #endif
 	struct mbuf *m = *m0;
 	struct ether_header *e;
 	struct pf_keth_rule *r, *rm, *a = NULL;
 	struct pf_keth_ruleset *ruleset = NULL;
 	struct pf_mtag *mtag;
 	struct pf_keth_ruleq *rules;
 	struct pf_addr *src = NULL, *dst = NULL;
 	sa_family_t af = 0;
 	uint16_t proto;
 	int asd = 0, match = 0;
 	int tag = -1;
 	uint8_t action;
 	struct pf_keth_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
 
 	MPASS(kif->pfik_ifp->if_vnet == curvnet);
 	NET_EPOCH_ASSERT();
 
 	PF_RULES_RLOCK_TRACKER;
 
 	SDT_PROBE3(pf, eth, test_rule, entry, dir, kif->pfik_ifp, m);
 
 	mtag = pf_find_mtag(m);
 	if (mtag != NULL && mtag->flags & PF_TAG_DUMMYNET) {
 		/* Dummynet re-injects packets after they've
 		 * completed their delay. We've already
 		 * processed them, so pass unconditionally. */
 
 		/* But only once. We may see the packet multiple times (e.g.
 		 * PFIL_IN/PFIL_OUT). */
 		mtag->flags &= ~PF_TAG_DUMMYNET;
 
 		return (PF_PASS);
 	}
 
 	ruleset = V_pf_keth;
 	rules = ck_pr_load_ptr(&ruleset->active.rules);
 	r = TAILQ_FIRST(rules);
 	rm = NULL;
 
 	e = mtod(m, struct ether_header *);
 	proto = ntohs(e->ether_type);
 
 	/*
 	 * For IP payloads copy out the IP header so that rules can also
 	 * match on layer 3 addresses.  Frames too short to hold the header
 	 * are dropped.
 	 */
 	switch (proto) {
 #ifdef INET
 	case ETHERTYPE_IP: {
 		if (m_length(m, NULL) < (sizeof(struct ether_header) +
 		    sizeof(ip)))
 			return (PF_DROP);
 
 		af = AF_INET;
 		m_copydata(m, sizeof(struct ether_header), sizeof(ip),
 		    (caddr_t)&ip);
 		src = (struct pf_addr *)&ip.ip_src;
 		dst = (struct pf_addr *)&ip.ip_dst;
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case ETHERTYPE_IPV6: {
 		if (m_length(m, NULL) < (sizeof(struct ether_header) +
 		    sizeof(ip6)))
 			return (PF_DROP);
 
 		af = AF_INET6;
 		m_copydata(m, sizeof(struct ether_header), sizeof(ip6),
 		    (caddr_t)&ip6);
 		src = (struct pf_addr *)&ip6.ip6_src;
 		dst = (struct pf_addr *)&ip6.ip6_dst;
 		break;
 	}
 #endif /* INET6 */
 	}
 
 	PF_RULES_RLOCK();
 
 	/*
 	 * Walk the rules.  The first four criteria jump via the rule's skip
 	 * pointers; the remaining ones simply advance to the next rule.
 	 */
 	while (r != NULL) {
 		counter_u64_add(r->evaluations, 1);
 		SDT_PROBE2(pf, eth, test_rule, test, r->nr, r);
 
 		if (pfi_kkif_match(r->kif, kif) == r->ifnot) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "kif");
 			r = r->skip[PFE_SKIP_IFP].ptr;
 		}
 		else if (r->direction && r->direction != dir) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "dir");
 			r = r->skip[PFE_SKIP_DIR].ptr;
 		}
 		else if (r->proto && r->proto != proto) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "proto");
 			r = r->skip[PFE_SKIP_PROTO].ptr;
 		}
 		else if (! pf_match_eth_addr(e->ether_shost, &r->src)) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "src");
 			r = r->skip[PFE_SKIP_SRC_ADDR].ptr;
 		}
 		else if (! pf_match_eth_addr(e->ether_dhost, &r->dst)) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "dst");
 			r = TAILQ_NEXT(r, entries);
 		}
 		else if (src != NULL && PF_MISMATCHAW(&r->ipsrc.addr, src, af,
 		    r->ipsrc.neg, kif, M_GETFIB(m))) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "ip_src");
 			r = TAILQ_NEXT(r, entries);
 		}
 		else if (dst != NULL && PF_MISMATCHAW(&r->ipdst.addr, dst, af,
 		    r->ipdst.neg, kif, M_GETFIB(m))) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "ip_dst");
 			r = TAILQ_NEXT(r, entries);
 		}
 		else if (r->match_tag && !pf_match_eth_tag(m, r, &tag,
 		    mtag ? mtag->tag : 0)) {
 			SDT_PROBE3(pf, eth, test_rule, mismatch, r->nr, r,
 			    "match_tag");
 			r = TAILQ_NEXT(r, entries);
 		}
 		else {
 			if (r->tag)
 				tag = r->tag;
 			if (r->anchor == NULL) {
 				/* Rule matches */
 				rm = r;
 
 				SDT_PROBE2(pf, eth, test_rule, match, r->nr, r);
 
 				if (r->quick)
 					break;
 
 				r = TAILQ_NEXT(r, entries);
 			} else {
 				pf_step_into_keth_anchor(anchor_stack, &asd,
 				    &ruleset, &r, &a, &match);
 			}
 		}
 		if (r == NULL && pf_step_out_of_keth_anchor(anchor_stack, &asd,
 		    &ruleset, &r, &a, &match))
 			break;
 	}
 
 	r = rm;
 
 	SDT_PROBE2(pf, eth, test_rule, final_match, (r != NULL ? r->nr : -1), r);
 
 	/* Default to pass. */
 	if (r == NULL) {
 		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	/* Execute action. */
 	counter_u64_add(r->packets[dir == PF_OUT], 1);
 	counter_u64_add(r->bytes[dir == PF_OUT], m_length(m, NULL));
 	pf_update_timestamp(r);
 
 	/* Shortcut. Don't tag if we're just going to drop anyway. */
 	if (r->action == PF_DROP) {
 		PF_RULES_RUNLOCK();
 		return (PF_DROP);
 	}
 
 	if (tag > 0) {
 		if (mtag == NULL)
 			mtag = pf_get_mtag(m);
 		if (mtag == NULL) {
 			PF_RULES_RUNLOCK();
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
 			return (PF_DROP);
 		}
 		mtag->tag = tag;
 	}
 
 	if (r->qid != 0) {
 		if (mtag == NULL)
 			mtag = pf_get_mtag(m);
 		if (mtag == NULL) {
 			PF_RULES_RUNLOCK();
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
 			return (PF_DROP);
 		}
 		mtag->qid = r->qid;
 	}
 
 	/* Dummynet */
 	if (r->dnpipe) {
 		struct ip_fw_args dnflow;
 
 		/* Drop packet if dummynet is not loaded. */
 		if (ip_dn_io_ptr == NULL) {
 			PF_RULES_RUNLOCK();
 			m_freem(m);
 			/*
 			 * The mbuf is gone.  Clear the caller's pointer so
 			 * that the PF_DROP return cannot lead to the same
 			 * mbuf being freed a second time; every other drop
 			 * path here leaves the mbuf for the caller.
 			 */
 			*m0 = NULL;
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
 			return (PF_DROP);
 		}
 		if (mtag == NULL)
 			mtag = pf_get_mtag(m);
 		if (mtag == NULL) {
 			PF_RULES_RUNLOCK();
 			counter_u64_add(V_pf_status.counters[PFRES_MEMORY], 1);
 			return (PF_DROP);
 		}
 
 		bzero(&dnflow, sizeof(dnflow));
 
 		/* We don't have port numbers here, so we set 0.  That means
 		 * that we'll be somewhat limited in distinguishing flows (i.e.
 		 * only based on IP addresses, not based on port numbers), but
 		 * it's better than nothing. */
 		dnflow.f_id.dst_port = 0;
 		dnflow.f_id.src_port = 0;
 		dnflow.f_id.proto = 0;
 
 		dnflow.rule.info = r->dnpipe;
 		dnflow.rule.info |= IPFW_IS_DUMMYNET;
 		if (r->dnflags & PFRULE_DN_IS_PIPE)
 			dnflow.rule.info |= IPFW_IS_PIPE;
 
 		dnflow.f_id.extra = dnflow.rule.info;
 
 		dnflow.flags = dir == PF_IN ? IPFW_ARGS_IN : IPFW_ARGS_OUT;
 		dnflow.flags |= IPFW_ARGS_ETHER;
 		dnflow.ifp = kif->pfik_ifp;
 
 		switch (af) {
 		case AF_INET:
 			dnflow.f_id.addr_type = 4;
 			dnflow.f_id.src_ip = src->v4.s_addr;
 			dnflow.f_id.dst_ip = dst->v4.s_addr;
 			break;
 		case AF_INET6:
 			dnflow.flags |= IPFW_ARGS_IP6;
 			dnflow.f_id.addr_type = 6;
 			dnflow.f_id.src_ip6 = src->v6;
 			dnflow.f_id.dst_ip6 = dst->v6;
 			break;
 		}
 
 		/*
 		 * Mark the packet before handing it over so that it is
 		 * recognised when dummynet re-injects it.  If dummynet gave
 		 * the packet straight back (*m0 still set) the mark is
 		 * removed again.
 		 */
 		mtag->flags |= PF_TAG_DUMMYNET;
 		ip_dn_io_ptr(m0, &dnflow);
 		if (*m0 != NULL)
 			mtag->flags &= ~PF_TAG_DUMMYNET;
 	}
 
 	action = r->action;
 
 	/*
 	 * Redirect to another interface.  Skip this when dummynet has kept
 	 * the packet (*m0 == NULL): there is nothing left to transmit and
 	 * a NULL mbuf must not reach the interface's transmit routine.
 	 */
 	if (action == PF_PASS && r->bridge_to && *m0 != NULL) {
 		pf_bridge_to(r->bridge_to, *m0);
 		*m0 = NULL; /* We've eaten the packet. */
 	}
 
 	PF_RULES_RUNLOCK();
 
 	return (action);
 }
 
 /*
  * Evaluate the translation and filter rulesets for the packet described
  * by 'pd' and, when the winning rule asks for it, create state.
  *
  *   rm, am, rsm - in/out: matching rule, its anchor and its ruleset.  They
  *                 are only written when a non-"match" rule matches and are
  *                 read back unconditionally after the loop, so the caller
  *                 must pre-populate them (presumably with the default
  *                 rule - verify against the caller).
  *   sm          - out: the state created by pf_create_state(), if any.
  *   direction   - packet direction, compared with each rule's direction.
  *   kif         - interface the packet is being processed on.
  *   m, off      - the packet and the offset of its transport header.
  *   pd          - packet descriptor; headers in it may be rewritten.
  *   inp         - optional locked inpcb used to fill in uid/gid for
  *                 user/group matching.
  *
  * Returns PF_PASS, PF_DROP, PF_DEFER, or the non-PF_PASS result of
  * pf_create_state().  The rules read lock must be held.
  */
 static int
 pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, int direction,
     struct pfi_kkif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
     struct pf_krule **am, struct pf_kruleset **rsm, struct inpcb *inp)
 {
 	struct pf_krule		*nr = NULL;
 	struct pf_addr		* const saddr = pd->src;
 	struct pf_addr		* const daddr = pd->dst;
 	sa_family_t		 af = pd->af;
 	struct pf_krule		*r, *a = NULL;
 	struct pf_kruleset	*ruleset = NULL;
 	struct pf_ksrc_node	*nsn = NULL;
 	struct tcphdr		*th = &pd->hdr.tcp;
 	struct pf_state_key	*sk = NULL, *nk = NULL;
 	u_short			 reason;
 	int			 rewrite = 0, hdrlen = 0;
 	int			 tag = -1, rtableid = -1;
 	int			 asd = 0;
 	int			 match = 0;
 	int			 state_icmp = 0;
 	u_int16_t		 sport = 0, dport = 0;
 	u_int16_t		 bproto_sum = 0, bip_sum = 0;
 	u_int8_t		 icmptype = 0, icmpcode = 0;
 	struct pf_kanchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
 
 	PF_RULES_RASSERT();
 
 	/* With a socket at hand the credentials need no later lookup. */
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		pd->lookup.uid = inp->inp_cred->cr_uid;
 		pd->lookup.gid = inp->inp_cred->cr_groups[0];
 		pd->lookup.done = 1;
 	}
 
 	/*
 	 * Pick up the per-protocol "ports" (the ICMP id stands in for both
 	 * ports) and the transport header length.  state_icmp is raised for
 	 * the ICMP types listed below; no state is created for those.
 	 */
 	switch (pd->proto) {
 	case IPPROTO_TCP:
 		sport = th->th_sport;
 		dport = th->th_dport;
 		hdrlen = sizeof(*th);
 		break;
 	case IPPROTO_UDP:
 		sport = pd->hdr.udp.uh_sport;
 		dport = pd->hdr.udp.uh_dport;
 		hdrlen = sizeof(pd->hdr.udp);
 		break;
 #ifdef INET
 	case IPPROTO_ICMP:
 		if (pd->af != AF_INET)
 			break;
 		sport = dport = pd->hdr.icmp.icmp_id;
 		hdrlen = sizeof(pd->hdr.icmp);
 		icmptype = pd->hdr.icmp.icmp_type;
 		icmpcode = pd->hdr.icmp.icmp_code;
 
 		if (icmptype == ICMP_UNREACH ||
 		    icmptype == ICMP_SOURCEQUENCH ||
 		    icmptype == ICMP_REDIRECT ||
 		    icmptype == ICMP_TIMXCEED ||
 		    icmptype == ICMP_PARAMPROB)
 			state_icmp++;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 		if (af != AF_INET6)
 			break;
 		sport = dport = pd->hdr.icmp6.icmp6_id;
 		hdrlen = sizeof(pd->hdr.icmp6);
 		icmptype = pd->hdr.icmp6.icmp6_type;
 		icmpcode = pd->hdr.icmp6.icmp6_code;
 
 		if (icmptype == ICMP6_DST_UNREACH ||
 		    icmptype == ICMP6_PACKET_TOO_BIG ||
 		    icmptype == ICMP6_TIME_EXCEEDED ||
 		    icmptype == ICMP6_PARAM_PROB)
 			state_icmp++;
 		break;
 #endif /* INET6 */
 	default:
 		sport = dport = hdrlen = 0;
 		break;
 	}
 
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 
 	/* check packet for BINAT/NAT/RDR */
 	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
 	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
 		KASSERT(sk != NULL, ("%s: null sk", __func__));
 		KASSERT(nk != NULL, ("%s: null nk", __func__));
 
 		if (nr->log) {
 			PFLOG_PACKET(kif, m, af, direction, PFRES_MATCH, nr, a,
 			    ruleset, pd, 1);
 		}
 
 		/*
 		 * Remember the pre-translation checksums (bip_sum here,
 		 * bproto_sum below); they are passed on to pf_return() and
 		 * pf_create_state().
 		 */
 		if (pd->ip_sum)
 			bip_sum = *pd->ip_sum;
 
 		/* Rewrite addresses/ports in pd to the translated key 'nk'. */
 		switch (pd->proto) {
 		case IPPROTO_TCP:
 			bproto_sum = th->th_sum;
 			pd->proto_sum = &th->th_sum;
 
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
 			    nk->port[pd->sidx] != sport) {
 				pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum,
 				    &th->th_sum, &nk->addr[pd->sidx],
 				    nk->port[pd->sidx], 0, af);
 				pd->sport = &th->th_sport;
 				sport = th->th_sport;
 			}
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
 			    nk->port[pd->didx] != dport) {
 				pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum,
 				    &th->th_sum, &nk->addr[pd->didx],
 				    nk->port[pd->didx], 0, af);
 				dport = th->th_dport;
 				pd->dport = &th->th_dport;
 			}
 			rewrite++;
 			break;
 		case IPPROTO_UDP:
 			bproto_sum = pd->hdr.udp.uh_sum;
 			pd->proto_sum = &pd->hdr.udp.uh_sum;
 
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
 			    nk->port[pd->sidx] != sport) {
 				pf_change_ap(m, saddr, &pd->hdr.udp.uh_sport,
 				    pd->ip_sum, &pd->hdr.udp.uh_sum,
 				    &nk->addr[pd->sidx],
 				    nk->port[pd->sidx], 1, af);
 				sport = pd->hdr.udp.uh_sport;
 				pd->sport = &pd->hdr.udp.uh_sport;
 			}
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
 			    nk->port[pd->didx] != dport) {
 				pf_change_ap(m, daddr, &pd->hdr.udp.uh_dport,
 				    pd->ip_sum, &pd->hdr.udp.uh_sum,
 				    &nk->addr[pd->didx],
 				    nk->port[pd->didx], 1, af);
 				dport = pd->hdr.udp.uh_dport;
 				pd->dport = &pd->hdr.udp.uh_dport;
 			}
 			rewrite++;
 			break;
 #ifdef INET
 		case IPPROTO_ICMP:
 			nk->port[0] = nk->port[1];
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
 				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
 				    nk->addr[pd->sidx].v4.s_addr, 0);
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
 				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
 				    nk->addr[pd->didx].v4.s_addr, 0);
 
 			if (nk->port[1] != pd->hdr.icmp.icmp_id) {
 				pd->hdr.icmp.icmp_cksum = pf_cksum_fixup(
 				    pd->hdr.icmp.icmp_cksum, sport,
 				    nk->port[1], 0);
 				pd->hdr.icmp.icmp_id = nk->port[1];
 				pd->sport = &pd->hdr.icmp.icmp_id;
 			}
 			/* Written back right away rather than via 'rewrite'. */
 			m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case IPPROTO_ICMPV6:
 			nk->port[0] = nk->port[1];
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
 				pf_change_a6(saddr, &pd->hdr.icmp6.icmp6_cksum,
 				    &nk->addr[pd->sidx], 0);
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
 				pf_change_a6(daddr, &pd->hdr.icmp6.icmp6_cksum,
 				    &nk->addr[pd->didx], 0);
 			rewrite++;
 			break;
 #endif /* INET6 */
 		default:
 			switch (af) {
 #ifdef INET
 			case AF_INET:
 				if (PF_ANEQ(saddr,
 				    &nk->addr[pd->sidx], AF_INET))
 					pf_change_a(&saddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->sidx].v4.s_addr, 0);
 
 				if (PF_ANEQ(daddr,
 				    &nk->addr[pd->didx], AF_INET))
 					pf_change_a(&daddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->didx].v4.s_addr, 0);
 				break;
 #endif /* INET */
 #ifdef INET6
 			case AF_INET6:
 				if (PF_ANEQ(saddr,
 				    &nk->addr[pd->sidx], AF_INET6))
 					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
 
 				if (PF_ANEQ(daddr,
 				    &nk->addr[pd->didx], AF_INET6))
 					PF_ACPY(daddr, &nk->addr[pd->didx], af);
 				break;
 #endif /* INET6 */
 			}
 			break;
 		}
 		/* A translation rule with natpass skips filter evaluation. */
 		if (nr->natpass)
 			r = NULL;
 		pd->nat_rule = nr;
 	}
 
 	/*
 	 * Walk the filter rules.  The leading criteria jump via the rule's
 	 * skip pointers; the rest advance to the next rule.  The order of
 	 * the tests matters because some have side effects (socket lookup,
 	 * random number draw, tag caching).
 	 */
 	while (r != NULL) {
 		pf_counter_u64_add(&r->evaluations, 1);
 		if (pfi_kkif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != direction)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != af)
 			r = r->skip[PF_SKIP_AF].ptr;
 		else if (r->proto && r->proto != pd->proto)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		/* tcp/udp only. port_op always 0 in other cases */
 		else if (r->src.port_op && !pf_match_port(r->src.port_op,
 		    r->src.port[0], r->src.port[1], sport))
 			r = r->skip[PF_SKIP_SRC_PORT].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		/* tcp/udp only. port_op always 0 in other cases */
 		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
 		    r->dst.port[0], r->dst.port[1], dport))
 			r = r->skip[PF_SKIP_DST_PORT].ptr;
 		/*
 		 * icmp only. type always 0 in other cases.  The rule value
 		 * is compared against type + 1, so 0 means "not set".
 		 */
 		else if (r->type && r->type != icmptype + 1)
 			r = TAILQ_NEXT(r, entries);
 		/* icmp only. code always 0 in other cases (same + 1 bias) */
 		else if (r->code && r->code != icmpcode + 1)
 			r = TAILQ_NEXT(r, entries);
 		else if (r->tos && !(r->tos == pd->tos))
 			r = TAILQ_NEXT(r, entries);
 		/* Rules flagged for fragments never match here. */
 		else if (r->rule_flag & PFRULE_FRAGMENT)
 			r = TAILQ_NEXT(r, entries);
 		else if (pd->proto == IPPROTO_TCP &&
 		    (r->flagset & th->th_flags) != r->flags)
 			r = TAILQ_NEXT(r, entries);
 		/*
 		 * tcp/udp only. uid.op always 0 in other cases.  The socket
 		 * lookup runs only while lookup.done is still zero; the
 		 * comma expression always yields 1 so the match test follows.
 		 */
 		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
 		    pf_socket_lookup(direction, pd, m), 1)) &&
 		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
 		    pd->lookup.uid))
 			r = TAILQ_NEXT(r, entries);
 		/* tcp/udp only. gid.op always 0 in other cases */
 		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
 		    pf_socket_lookup(direction, pd, m), 1)) &&
 		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
 		    pd->lookup.gid))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->prio &&
 		    !pf_match_ieee8021q_pcp(r->prio, m))
 			r = TAILQ_NEXT(r, entries);
 		/* Probabilistic rule: skipped when prob <= a random draw. */
 		else if (r->prob &&
 		    r->prob <= arc4random())
 			r = TAILQ_NEXT(r, entries);
 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->os_fingerprint != PF_OSFP_ANY &&
 		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
 		    pf_osfp_fingerprint(pd, m, off, th),
 		    r->os_fingerprint)))
 			r = TAILQ_NEXT(r, entries);
 		else {
 			/* All criteria matched. */
 			if (r->tag)
 				tag = r->tag;
 			if (r->rtableid >= 0)
 				rtableid = r->rtableid;
 			if (r->anchor == NULL) {
 				if (r->action == PF_MATCH) {
 					/*
 					 * "match" rules only accumulate
 					 * counters and actions; they do not
 					 * become the deciding rule.
 					 */
 					pf_counter_u64_critical_enter();
 					pf_counter_u64_add_protected(&r->packets[direction == PF_OUT], 1);
 					pf_counter_u64_add_protected(&r->bytes[direction == PF_OUT], pd->tot_len);
 					pf_counter_u64_critical_exit();
 					pf_rule_to_actions(r, &pd->act);
 					if (r->log)
 						PFLOG_PACKET(kif, m, af,
 						    direction, PFRES_MATCH, r,
 						    a, ruleset, pd, 1);
 				} else {
 					match = 1;
 					*rm = r;
 					*am = a;
 					*rsm = ruleset;
 				}
 				/*
 				 * NOTE(review): this tests the recorded rule
 				 * (*rm), not 'r'; after a "match" rule that
 				 * is whatever pass/block rule was recorded
 				 * before - confirm this is intended.
 				 */
 				if ((*rm)->quick)
 					break;
 				r = TAILQ_NEXT(r, entries);
 			} else
 				pf_step_into_anchor(anchor_stack, &asd,
 				    &ruleset, PF_RULESET_FILTER, &r, &a,
 				    &match);
 		}
 		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
 		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
 			break;
 	}
 	/* Continue with the recorded (last matching or pre-set) rule. */
 	r = *rm;
 	a = *am;
 	ruleset = *rsm;
 
 	REASON_SET(&reason, PFRES_MATCH);
 
 	/* apply actions for last matching pass/block rule */
 	pf_rule_to_actions(r, &pd->act);
 
 	if (r->log) {
 		/* Log the packet as rewritten, so push the headers out first. */
 		if (rewrite)
 			m_copyback(m, off, hdrlen, pd->hdr.any);
 		PFLOG_PACKET(kif, m, af, direction, reason, r, a,
 		    ruleset, pd, 1);
 	}
 
 	if ((r->action == PF_DROP) &&
 	    ((r->rule_flag & PFRULE_RETURNRST) ||
 	    (r->rule_flag & PFRULE_RETURNICMP) ||
 	    (r->rule_flag & PFRULE_RETURN))) {
 		pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum,
 		    bip_sum, hdrlen, &reason);
 	}
 
 	if (r->action == PF_DROP)
 		goto cleanup;
 
 	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
 		REASON_SET(&reason, PFRES_MEMORY);
 		goto cleanup;
 	}
 	if (rtableid >= 0)
 		M_SETFIB(m, rtableid);
 
 	/*
 	 * Create state unless this is one of the ICMP types flagged above.
 	 * On this path sk/nk are handed to pf_create_state(); otherwise
 	 * they are released here.
 	 */
 	if (!state_icmp && (r->keep_state || nr != NULL ||
 	    (pd->flags & PFDESC_TCP_NORM))) {
 		int action;
 		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
 		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
 		    hdrlen);
 		if (action != PF_PASS) {
 			if (action == PF_DROP &&
 			    (r->rule_flag & PFRULE_RETURN))
 				pf_return(r, nr, pd, sk, off, m, th, kif,
 				    bproto_sum, bip_sum, hdrlen, &reason);
 			return (action);
 		}
 	} else {
 		if (sk != NULL)
 			uma_zfree(V_pf_state_key_z, sk);
 		if (nk != NULL)
 			uma_zfree(V_pf_state_key_z, nk);
 	}
 
 	/* copy back packet headers if we performed NAT operations */
 	if (rewrite)
 		m_copyback(m, off, hdrlen, pd->hdr.any);
 
 	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
 	    direction == PF_OUT &&
 	    V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m))
 		/*
 		 * We want the state created, but we dont
 		 * want to send this in case a partner
 		 * firewall has to know about it to allow
 		 * replies through it.
 		 */
 		return (PF_DEFER);
 
 	return (PF_PASS);
 
 cleanup:
 	/* Drop path: release any state keys set up by the translation. */
 	if (sk != NULL)
 		uma_zfree(V_pf_state_key_z, sk);
 	if (nk != NULL)
 		uma_zfree(V_pf_state_key_z, nk);
 	return (PF_DROP);
 }
 
 /*
  * Allocate, initialise and insert a state entry for the packet described
  * by 'pd' after rule 'r' (and optionally translation rule 'nr') matched.
  *
  *   r, nr, a   - matching filter rule, translation rule (may be NULL) and
  *                anchor; stored in the new state.
  *   nsn        - source node belonging to the translation rule, if any.
  *   nk, sk     - state keys prepared by the translation lookup; both must
  *                be NULL when nr is NULL (a key is then built here) and
  *                both non-NULL otherwise.
  *   m, off     - the packet and the offset of its transport header.
  *   sport,
  *   dport      - ports used when a state key has to be built here.
  *   rewrite    - out: set when the TCP header in pd was modified.
  *   sm         - out: receives the new state once it has been inserted.
  *   tag        - tag to record in the state when > 0.
  *   bproto_sum,
  *   bip_sum    - pre-translation checksums, restored for synproxy.
  *   hdrlen     - transport header length for m_copyback().
  *
  * Returns PF_PASS on success, PF_SYNPROXY_DROP when the state was created
  * and a synproxy SYN|ACK was sent instead of passing the packet, and
  * PF_DROP on failure.
  */
 static int
 pf_create_state(struct pf_krule *r, struct pf_krule *nr, struct pf_krule *a,
     struct pf_pdesc *pd, struct pf_ksrc_node *nsn, struct pf_state_key *nk,
     struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
     u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_kstate **sm,
     int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
 {
 	struct pf_kstate	*s = NULL;
 	struct pf_ksrc_node	*sn = NULL;
 	struct tcphdr		*th = &pd->hdr.tcp;
 	u_int16_t		 mss = V_tcp_mssdflt;
 	u_short			 reason;
 
 	/* check maximums */
 	if (r->max_states &&
 	    (counter_u64_fetch(r->states_cur) >= r->max_states)) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
 		REASON_SET(&reason, PFRES_MAXSTATES);
 		goto csfailed;
 	}
 	/* src node for filter rule */
 	if ((r->rule_flag & PFRULE_SRCTRACK ||
 	    r->rpool.opts & PF_POOL_STICKYADDR) &&
 	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
 		REASON_SET(&reason, PFRES_SRCLIMIT);
 		goto csfailed;
 	}
 	/* src node for translation rule */
 	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
 	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
 		REASON_SET(&reason, PFRES_SRCLIMIT);
 		goto csfailed;
 	}
 	s = pf_alloc_state(M_NOWAIT);
 	if (s == NULL) {
 		REASON_SET(&reason, PFRES_MEMORY);
 		goto csfailed;
 	}
 	/*
 	 * From here on a state exists and its counters are bumped, so any
 	 * failure has to undo that (remove src tree refs, decrement the
 	 * counters, free the state) in addition to what csfailed does.
 	 */
 	s->rule.ptr = r;
 	s->nat_rule.ptr = nr;
 	s->anchor.ptr = a;
 	STATE_INC_COUNTERS(s);
 	if (r->allow_opts)
 		s->state_flags |= PFSTATE_ALLOWOPTS;
 	if (r->rule_flag & PFRULE_STATESLOPPY)
 		s->state_flags |= PFSTATE_SLOPPY;
 	s->log = r->log & PF_LOG_ALL;
 	s->sync_state = PFSYNC_S_NONE;
 	s->qid = pd->act.qid;
 	s->pqid = pd->act.pqid;
 	s->dnpipe = pd->act.dnpipe;
 	s->dnrpipe = pd->act.dnrpipe;
 	s->state_flags |= pd->act.flags;
 	if (nr != NULL)
 		s->log |= nr->log & PF_LOG_ALL;
 	/* Protocol specific initial peer states and timeouts. */
 	switch (pd->proto) {
 	case IPPROTO_TCP:
 		s->src.seqlo = ntohl(th->th_seq);
 		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
 		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
 		    r->keep_state == PF_STATE_MODULATE) {
 			/* Generate sequence number modulator */
 			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
 			    0)
 				s->src.seqdiff = 1;
 			pf_change_proto_a(m, &th->th_seq, &th->th_sum,
 			    htonl(s->src.seqlo + s->src.seqdiff), 0);
 			*rewrite = 1;
 		} else
 			s->src.seqdiff = 0;
 		if (th->th_flags & TH_SYN) {
 			s->src.seqhi++;
 			s->src.wscale = pf_get_wscale(m, off,
 			    th->th_off, pd->af);
 		}
 		s->src.max_win = MAX(ntohs(th->th_win), 1);
 		if (s->src.wscale & PF_WSCALE_MASK) {
 			/* Remove scale factor from initial window */
 			int win = s->src.max_win;
 			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
 			s->src.max_win = (win - 1) >>
 			    (s->src.wscale & PF_WSCALE_MASK);
 		}
 		if (th->th_flags & TH_FIN)
 			s->src.seqhi++;
 		s->dst.seqhi = 1;
 		s->dst.max_win = 1;
 		pf_set_protostate(s, PF_PEER_SRC, TCPS_SYN_SENT);
 		pf_set_protostate(s, PF_PEER_DST, TCPS_CLOSED);
 		s->timeout = PFTM_TCP_FIRST_PACKET;
 		atomic_add_32(&V_pf_status.states_halfopen, 1);
 		break;
 	case IPPROTO_UDP:
 		pf_set_protostate(s, PF_PEER_SRC, PFUDPS_SINGLE);
 		pf_set_protostate(s, PF_PEER_DST, PFUDPS_NO_TRAFFIC);
 		s->timeout = PFTM_UDP_FIRST_PACKET;
 		break;
 	case IPPROTO_ICMP:
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 #endif
 		s->timeout = PFTM_ICMP_FIRST_PACKET;
 		break;
 	default:
 		pf_set_protostate(s, PF_PEER_SRC, PFOTHERS_SINGLE);
 		pf_set_protostate(s, PF_PEER_DST, PFOTHERS_NO_TRAFFIC);
 		s->timeout = PFTM_OTHER_FIRST_PACKET;
 	}
 
 	if (r->rt) {
 		if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) {
 			REASON_SET(&reason, PFRES_MAPFAILED);
 			pf_src_tree_remove_state(s);
 			s->timeout = PFTM_UNLINKED;
 			STATE_DEC_COUNTERS(s);
 			pf_free_state(s);
 			goto csfailed;
 		}
 		s->rt_kif = r->rpool.cur->kif;
 	}
 
 	s->creation = time_uptime;
 	s->expire = time_uptime;
 
 	if (sn != NULL)
 		s->src_node = sn;
 	if (nsn != NULL) {
 		/* XXX We only modify one side for now. */
 		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
 		s->nat_src_node = nsn;
 	}
 	if (pd->proto == IPPROTO_TCP) {
 		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
 		    off, pd, th, &s->src, &s->dst)) {
 			REASON_SET(&reason, PFRES_MEMORY);
 			pf_src_tree_remove_state(s);
 			s->timeout = PFTM_UNLINKED;
 			STATE_DEC_COUNTERS(s);
 			pf_free_state(s);
 			return (PF_DROP);
 		}
 		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
 		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
 		    &s->src, &s->dst, rewrite)) {
 			/* This really shouldn't happen!!! */
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("pf_normalize_tcp_stateful failed on first "
 			     "pkt\n"));
 			pf_src_tree_remove_state(s);
 			s->timeout = PFTM_UNLINKED;
 			STATE_DEC_COUNTERS(s);
 			pf_free_state(s);
 			return (PF_DROP);
 		}
 	}
 	s->direction = pd->dir;
 
 	/*
 	 * sk/nk could already been setup by pf_get_translation().
 	 */
 	if (nr == NULL) {
 		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
 		    __func__, nr, sk, nk));
 		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
 		if (sk == NULL) {
 			/*
 			 * The state is already allocated and counted, and
 			 * the source nodes are attached to it.  csfailed
 			 * never releases 's', so tear the state down here
 			 * exactly like the other late failure paths do.
 			 * The failure is presumably an allocation failure,
 			 * hence PFRES_MEMORY.
 			 */
 			REASON_SET(&reason, PFRES_MEMORY);
 			pf_src_tree_remove_state(s);
 			s->timeout = PFTM_UNLINKED;
 			STATE_DEC_COUNTERS(s);
 			pf_free_state(s);
 			return (PF_DROP);
 		}
 		nk = sk;
 	} else
 		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
 		    __func__, nr, sk, nk));
 
 	/* Swap sk/nk for PF_OUT. */
 	if (pf_state_insert(BOUND_IFACE(r, kif), kif,
 	    (pd->dir == PF_IN) ? sk : nk,
 	    (pd->dir == PF_IN) ? nk : sk, s)) {
 		REASON_SET(&reason, PFRES_STATEINS);
 		pf_src_tree_remove_state(s);
 		s->timeout = PFTM_UNLINKED;
 		STATE_DEC_COUNTERS(s);
 		pf_free_state(s);
 		return (PF_DROP);
 	} else
 		*sm = s;
 
 	if (tag > 0)
 		s->tag = tag;
 	/*
 	 * synproxy: answer the initial SYN ourselves and do not pass the
 	 * packet on.
 	 */
 	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
 	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
 		pf_set_protostate(s, PF_PEER_SRC, PF_TCPS_PROXY_SRC);
 		/* undo NAT changes, if they have taken place */
 		if (nr != NULL) {
 			struct pf_state_key *skt = s->key[PF_SK_WIRE];
 			if (pd->dir == PF_OUT)
 				skt = s->key[PF_SK_STACK];
 			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
 			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
 			if (pd->sport)
 				*pd->sport = skt->port[pd->sidx];
 			if (pd->dport)
 				*pd->dport = skt->port[pd->didx];
 			if (pd->proto_sum)
 				*pd->proto_sum = bproto_sum;
 			if (pd->ip_sum)
 				*pd->ip_sum = bip_sum;
 			m_copyback(m, off, hdrlen, pd->hdr.any);
 		}
 		s->src.seqhi = htonl(arc4random());
 		/* Find mss option */
 		int rtid = M_GETFIB(m);
 		mss = pf_get_mss(m, off, th->th_off, pd->af);
 		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
 		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
 		s->src.mss = mss;
 		pf_send_tcp(r, pd->af, pd->dst, pd->src, th->th_dport,
 		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
 		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0);
 		REASON_SET(&reason, PFRES_SYNPROXY);
 		return (PF_SYNPROXY_DROP);
 	}
 
 	return (PF_PASS);
 
 csfailed:
 	/*
 	 * Failure before the state took ownership of its resources:
 	 * release the state keys and drop the source node references
 	 * taken above, freeing nodes that are no longer in use.
 	 */
 	if (sk != NULL)
 		uma_zfree(V_pf_state_key_z, sk);
 	if (nk != NULL)
 		uma_zfree(V_pf_state_key_z, nk);
 
 	if (sn != NULL) {
 		struct pf_srchash *sh;
 
 		sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
 		PF_HASHROW_LOCK(sh);
 		if (--sn->states == 0 && sn->expire == 0) {
 			pf_unlink_src_node(sn);
 			uma_zfree(V_pf_sources_z, sn);
 			counter_u64_add(
 			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
 		}
 		PF_HASHROW_UNLOCK(sh);
 	}
 
 	if (nsn != sn && nsn != NULL) {
 		struct pf_srchash *sh;
 
 		sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)];
 		PF_HASHROW_LOCK(sh);
 		if (--nsn->states == 0 && nsn->expire == 0) {
 			pf_unlink_src_node(nsn);
 			uma_zfree(V_pf_sources_z, nsn);
 			counter_u64_add(
 			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
 		}
 		PF_HASHROW_UNLOCK(sh);
 	}
 
 	return (PF_DROP);
 }
 
 /*
  * Evaluate the filter ruleset for a fragment.
  *
  * Only criteria that do not depend on the transport header are tested;
  * rules that specify ports, TCP flags, ICMP type/code or an OS
  * fingerprint are skipped.  No state is created.
  *
  *   rm, am, rsm - in/out: matching rule, its anchor and its ruleset.  They
  *                 are only written when a non-"match" rule matches and are
  *                 read back unconditionally after the loop, so the caller
  *                 must pre-populate them (presumably with the default
  *                 rule - verify against the caller).
  *   direction   - packet direction, compared with each rule's direction.
  *   kif         - interface the packet is being processed on.
  *   m           - the packet.
  *   h           - not referenced in this function.
  *   pd          - packet descriptor.
  *
  * Returns PF_PASS if the deciding rule's action is PF_PASS and tagging
  * (if requested) succeeded, PF_DROP otherwise.  The rules read lock must
  * be held.
  */
 static int
 pf_test_fragment(struct pf_krule **rm, int direction, struct pfi_kkif *kif,
     struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_krule **am,
     struct pf_kruleset **rsm)
 {
 	struct pf_krule		*r, *a = NULL;
 	struct pf_kruleset	*ruleset = NULL;
 	sa_family_t		 af = pd->af;
 	u_short			 reason;
 	int			 tag = -1;
 	int			 asd = 0;
 	int			 match = 0;
 	struct pf_kanchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
 
 	PF_RULES_RASSERT();
 
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 	/*
 	 * The leading criteria jump via the rule's skip pointers; the rest
 	 * advance to the next rule.
 	 */
 	while (r != NULL) {
 		pf_counter_u64_add(&r->evaluations, 1);
 		if (pfi_kkif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != direction)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != af)
 			r = r->skip[PF_SKIP_AF].ptr;
 		else if (r->proto && r->proto != pd->proto)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		else if (r->tos && !(r->tos == pd->tos))
 			r = TAILQ_NEXT(r, entries);
 		/* Rules needing transport header data cannot match here. */
 		else if (r->os_fingerprint != PF_OSFP_ANY)
 			r = TAILQ_NEXT(r, entries);
 		else if (pd->proto == IPPROTO_UDP &&
 		    (r->src.port_op || r->dst.port_op))
 			r = TAILQ_NEXT(r, entries);
 		else if (pd->proto == IPPROTO_TCP &&
 		    (r->src.port_op || r->dst.port_op || r->flagset))
 			r = TAILQ_NEXT(r, entries);
 		else if ((pd->proto == IPPROTO_ICMP ||
 		    pd->proto == IPPROTO_ICMPV6) &&
 		    (r->type || r->code))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->prio &&
 		    !pf_match_ieee8021q_pcp(r->prio, m))
 			r = TAILQ_NEXT(r, entries);
 		/*
 		 * Probabilistic rule: skipped when prob <= a random draw
 		 * taken from the range [1, UINT_MAX - 1].
 		 */
 		else if (r->prob && r->prob <=
 		    (arc4random() % (UINT_MAX - 1) + 1))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
 			r = TAILQ_NEXT(r, entries);
 		else {
 			/* All criteria matched. */
 			if (r->anchor == NULL) {
 				if (r->action == PF_MATCH) {
 					/*
 					 * "match" rules only accumulate
 					 * counters and actions; they do not
 					 * become the deciding rule.
 					 */
 					pf_counter_u64_critical_enter();
 					pf_counter_u64_add_protected(&r->packets[direction == PF_OUT], 1);
 					pf_counter_u64_add_protected(&r->bytes[direction == PF_OUT], pd->tot_len);
 					pf_counter_u64_critical_exit();
 					pf_rule_to_actions(r, &pd->act);
 					if (r->log)
 						PFLOG_PACKET(kif, m, af,
 						    direction, PFRES_MATCH, r,
 						    a, ruleset, pd, 1);
 				} else {
 					match = 1;
 					*rm = r;
 					*am = a;
 					*rsm = ruleset;
 				}
 				/*
 				 * NOTE(review): this tests the recorded rule
 				 * (*rm), not 'r'; after a "match" rule that
 				 * is whatever pass/block rule was recorded
 				 * before - confirm this is intended.
 				 */
 				if ((*rm)->quick)
 					break;
 				r = TAILQ_NEXT(r, entries);
 			} else
 				pf_step_into_anchor(anchor_stack, &asd,
 				    &ruleset, PF_RULESET_FILTER, &r, &a,
 				    &match);
 		}
 		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
 		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
 			break;
 	}
 	/* Continue with the recorded (last matching or pre-set) rule. */
 	r = *rm;
 	a = *am;
 	ruleset = *rsm;
 
 	REASON_SET(&reason, PFRES_MATCH);
 
 	/* apply actions for last matching pass/block rule */
 	pf_rule_to_actions(r, &pd->act);
 
 	if (r->log)
 		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
 		    1);
 
 	/* Anything other than an explicit pass drops the fragment. */
 	if (r->action != PF_PASS)
 		return (PF_DROP);
 
 	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
 		REASON_SET(&reason, PFRES_MEMORY);
 		return (PF_DROP);
 	}
 
 	return (PF_PASS);
 }
 
 /*
  * Full TCP sequence-window tracking for one packet of an existing state.
  *
  * "src" is the peer that sent this packet and "dst" the other peer; which of
  * the state's two peers that is depends on whether pd->dir equals the
  * state's direction.  The packet's sequence and acknowledgement numbers are
  * checked against the windows recorded for both peers.  An accepted packet
  * updates the recorded windows, the per-peer TCP states and (in the strict
  * branch only) the state's expiry time and timeout class.
  *
  * state    - state being tracked; also handed to pf_src_connlimit().
  * kif      - not referenced in the body.
  * m, off   - mbuf and offset, passed to the normalization, window-scale,
  *            SACK and checksum helpers.
  * pd       - packet descriptor; pd->hdr.tcp is the header that is examined
  *            and possibly rewritten.
  * reason   - out: set here to PFRES_MEMORY, PFRES_SRCLIMIT or PFRES_BADSTATE
  *            on the corresponding drop paths; also passed to
  *            pf_normalize_tcp_stateful().
  * copyback - out: set to 1 when the seq/ack fields of the header were
  *            rewritten here or pf_modulate_sack() returned non-zero; also
  *            passed to pf_normalize_tcp_stateful().
  *
  * Returns PF_PASS or PF_DROP.
  */
 static int
 pf_tcp_track_full(struct pf_kstate **state, struct pfi_kkif *kif,
     struct mbuf *m, int off, struct pf_pdesc *pd, u_short *reason,
     int *copyback)
 {
 	struct tcphdr		*th = &pd->hdr.tcp;
 	struct pf_state_peer	*src, *dst;
 	u_int16_t		 win = ntohs(th->th_win);
 	u_int32_t		 ack, end, seq, orig_seq;
 	u_int8_t		 sws, dws, psrc, pdst;
 	int			 ackskew;
 
 	/* Pick the sending peer (src) and the receiving peer (dst). */
 	if (pd->dir == (*state)->direction) {
 		src = &(*state)->src;
 		dst = &(*state)->dst;
 		psrc = PF_PEER_SRC;
 		pdst = PF_PEER_DST;
 	} else {
 		src = &(*state)->dst;
 		dst = &(*state)->src;
 		psrc = PF_PEER_DST;
 		pdst = PF_PEER_SRC;
 	}
 
 	/*
 	 * Window scale shifts are only used when both peers have a wscale
 	 * recorded and the packet is not a SYN.
 	 */
 	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
 		sws = src->wscale & PF_WSCALE_MASK;
 		dws = dst->wscale & PF_WSCALE_MASK;
 	} else
 		sws = dws = 0;
 
 	/*
 	 * Sequence tracking algorithm from Guido van Rooij's paper:
 	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
 	 *	tcp_filtering.ps
 	 */
 
 	orig_seq = seq = ntohl(th->th_seq);
 	if (src->seqlo == 0) {
 		/* First packet from this end. Set its state */
 
 		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
 		    src->scrub == NULL) {
 			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
 				REASON_SET(reason, PFRES_MEMORY);
 				return (PF_DROP);
 			}
 		}
 
 		/* Deferred generation of sequence number modulator */
 		if (dst->seqdiff && !src->seqdiff) {
 			/* use random iss for the TCP server */
 			while ((src->seqdiff = arc4random() - seq) == 0)
 				;
 			ack = ntohl(th->th_ack) - dst->seqdiff;
 			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
 			    src->seqdiff), 0);
 			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
 			*copyback = 1;
 		} else {
 			ack = ntohl(th->th_ack);
 		}
 
 		/* end = sequence number following this segment (SYN/FIN count). */
 		end = seq + pd->p_len;
 		if (th->th_flags & TH_SYN) {
 			end++;
 			if (dst->wscale & PF_WSCALE_FLAG) {
 				src->wscale = pf_get_wscale(m, off, th->th_off,
 				    pd->af);
 				if (src->wscale & PF_WSCALE_FLAG) {
 					/* Remove scale factor from initial
 					 * window */
 					sws = src->wscale & PF_WSCALE_MASK;
 					win = ((u_int32_t)win + (1 << sws) - 1)
 					    >> sws;
 					dws = dst->wscale & PF_WSCALE_MASK;
 				} else {
 					/* fixup other window */
 					dst->max_win <<= dst->wscale &
 					    PF_WSCALE_MASK;
 					/* in case of a retrans SYN|ACK */
 					dst->wscale = 0;
 				}
 			}
 		}
 		if (th->th_flags & TH_FIN)
 			end++;
 
 		src->seqlo = seq;
 		if (src->state < TCPS_SYN_SENT)
 			pf_set_protostate(*state, psrc, TCPS_SYN_SENT);
 
 		/*
 		 * May need to slide the window (seqhi may have been set by
 		 * the crappy stack check or if we picked up the connection
 		 * after establishment)
 		 */
 		if (src->seqhi == 1 ||
 		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
 			src->seqhi = end + MAX(1, dst->max_win << dws);
 		if (win > src->max_win)
 			src->max_win = win;
 
 	} else {
 		/* Not the first packet from this end. */
 		ack = ntohl(th->th_ack) - dst->seqdiff;
 		if (src->seqdiff) {
 			/* Modulate sequence numbers */
 			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
 			    src->seqdiff), 0);
 			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
 			*copyback = 1;
 		}
 		end = seq + pd->p_len;
 		if (th->th_flags & TH_SYN)
 			end++;
 		if (th->th_flags & TH_FIN)
 			end++;
 	}
 
 	if ((th->th_flags & TH_ACK) == 0) {
 		/* Let it pass through the ack skew check */
 		ack = dst->seqlo;
 	} else if ((ack == 0 &&
 	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
 	    /* broken tcp stacks do not set ack */
 	    (dst->state < TCPS_SYN_SENT)) {
 		/*
 		 * Many stacks (ours included) will set the ACK number in an
 		 * FIN|ACK if the SYN times out -- no sequence to ACK.
 		 */
 		ack = dst->seqlo;
 	}
 
 	if (seq == end) {
 		/* Ease sequencing restrictions on no data packets */
 		seq = src->seqlo;
 		end = seq;
 	}
 
 	/* How far the acknowledgement lags behind what dst has sent. */
 	ackskew = dst->seqlo - ack;
 
 	/*
 	 * Need to demodulate the sequence numbers in any TCP SACK options
 	 * (Selective ACK). We could optionally validate the SACK values
 	 * against the current ACK window, either forwards or backwards, but
 	 * I'm not confident that SACK has been implemented properly
 	 * everywhere. It wouldn't surprise me if several stacks accidentally
 	 * SACK too far backwards of previously ACKed data. There really aren't
 	 * any security implications of bad SACKing unless the target stack
 	 * doesn't validate the option length correctly. Someone trying to
 	 * spoof into a TCP connection won't bother blindly sending SACK
 	 * options anyway.
 	 */
 	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
 		if (pf_modulate_sack(m, off, pd, th, dst))
 			*copyback = 1;
 	}
 
 	/*
 	 * Strict window check.  In the condition below each explanatory
 	 * comment follows the sub-expression it describes.
 	 */
 #define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
 	if (SEQ_GEQ(src->seqhi, end) &&
 	    /* Last octet inside other's window space */
 	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
 	    /* Retrans: not more than one window back */
 	    (ackskew >= -MAXACKWINDOW) &&
 	    /* Acking not more than one reassembled fragment backwards */
 	    (ackskew <= (MAXACKWINDOW << sws)) &&
 	    /* Acking not more than one window forward */
 	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
 	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
 	    (pd->flags & PFDESC_IP_REAS) == 0)) {
 	    /* Require an exact/+1 sequence match on resets when possible */
 
 		if (dst->scrub || src->scrub) {
 			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
 			    *state, src, dst, copyback))
 				return (PF_DROP);
 		}
 
 		/* update max window */
 		if (src->max_win < win)
 			src->max_win = win;
 		/* synchronize sequencing */
 		if (SEQ_GT(end, src->seqlo))
 			src->seqlo = end;
 		/* slide the window of what the other end can send */
 		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
 			dst->seqhi = ack + MAX((win << sws), 1);
 
 		/* update states */
 		if (th->th_flags & TH_SYN)
 			if (src->state < TCPS_SYN_SENT)
 				pf_set_protostate(*state, psrc, TCPS_SYN_SENT);
 		if (th->th_flags & TH_FIN)
 			if (src->state < TCPS_CLOSING)
 				pf_set_protostate(*state, psrc, TCPS_CLOSING);
 		if (th->th_flags & TH_ACK) {
 			if (dst->state == TCPS_SYN_SENT) {
 				pf_set_protostate(*state, pdst,
 				    TCPS_ESTABLISHED);
 				/*
 				 * Both peers established now: apply the
 				 * source-node connection limit if the state
 				 * has a source node.
 				 */
 				if (src->state == TCPS_ESTABLISHED &&
 				    (*state)->src_node != NULL &&
 				    pf_src_connlimit(state)) {
 					REASON_SET(reason, PFRES_SRCLIMIT);
 					return (PF_DROP);
 				}
 			} else if (dst->state == TCPS_CLOSING)
 				pf_set_protostate(*state, pdst,
 				    TCPS_FIN_WAIT_2);
 		}
 		if (th->th_flags & TH_RST)
 			pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT);
 
 		/* update expire time */
 		(*state)->expire = time_uptime;
 		/* Timeout class is chosen from the two peers' TCP states. */
 		if (src->state >= TCPS_FIN_WAIT_2 &&
 		    dst->state >= TCPS_FIN_WAIT_2)
 			(*state)->timeout = PFTM_TCP_CLOSED;
 		else if (src->state >= TCPS_CLOSING &&
 		    dst->state >= TCPS_CLOSING)
 			(*state)->timeout = PFTM_TCP_FIN_WAIT;
 		else if (src->state < TCPS_ESTABLISHED ||
 		    dst->state < TCPS_ESTABLISHED)
 			(*state)->timeout = PFTM_TCP_OPENING;
 		else if (src->state >= TCPS_CLOSING ||
 		    dst->state >= TCPS_CLOSING)
 			(*state)->timeout = PFTM_TCP_CLOSING;
 		else
 			(*state)->timeout = PFTM_TCP_ESTABLISHED;
 
 		/* Fall through to PASS packet */
 
 	} else if ((dst->state < TCPS_SYN_SENT ||
 		dst->state >= TCPS_FIN_WAIT_2 ||
 		src->state >= TCPS_FIN_WAIT_2) &&
 	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
 	    /* Within a window forward of the originating packet */
 	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
 	    /* Within a window backward of the originating packet */
 
 		/*
 		 * This currently handles three situations:
 		 *  1) Stupid stacks will shotgun SYNs before their peer
 		 *     replies.
 		 *  2) When PF catches an already established stream (the
 		 *     firewall rebooted, the state table was flushed, routes
 		 *     changed...)
 		 *  3) Packets get funky immediately after the connection
 		 *     closes (this should catch Solaris spurious ACK|FINs
 		 *     that web servers like to spew after a close)
 		 *
 		 * This must be a little more careful than the above code
 		 * since packet floods will also be caught here. We don't
 		 * update the TTL here to mitigate the damage of a packet
 		 * flood and so the same code can handle awkward establishment
 		 * and a loosened connection close.
 		 * In the establishment case, a correct peer response will
 		 * validate the connection, go through the normal state code
 		 * and keep updating the state TTL.
 		 */
 
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: loose state match: ");
 			pf_print_state(*state);
 			pf_print_flags(th->th_flags);
 			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
 			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
 			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
 			    (unsigned long long)(*state)->packets[1],
 			    pd->dir == PF_IN ? "in" : "out",
 			    pd->dir == (*state)->direction ? "fwd" : "rev");
 		}
 
 		if (dst->scrub || src->scrub) {
 			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
 			    *state, src, dst, copyback))
 				return (PF_DROP);
 		}
 
 		/* update max window */
 		if (src->max_win < win)
 			src->max_win = win;
 		/* synchronize sequencing */
 		if (SEQ_GT(end, src->seqlo))
 			src->seqlo = end;
 		/* slide the window of what the other end can send */
 		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
 			dst->seqhi = ack + MAX((win << sws), 1);
 
 		/*
 		 * Cannot set dst->seqhi here since this could be a shotgunned
 		 * SYN and not an already established connection.
 		 */
 		/*
 		 * NOTE(review): the statement just above does update
 		 * dst->seqhi, which appears to contradict the preceding
 		 * comment -- confirm which of the two is intended.
 		 */
 
 		if (th->th_flags & TH_FIN)
 			if (src->state < TCPS_CLOSING)
 				pf_set_protostate(*state, psrc, TCPS_CLOSING);
 		if (th->th_flags & TH_RST)
 			pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT);
 
 		/* Fall through to PASS packet */
 
 	} else {
 		/* Neither the strict nor the loose window check accepted it. */
 		if ((*state)->dst.state == TCPS_SYN_SENT &&
 		    (*state)->src.state == TCPS_SYN_SENT) {
 			/* Send RST for state mismatches during handshake */
 			if (!(th->th_flags & TH_RST))
 				pf_send_tcp((*state)->rule.ptr, pd->af,
 				    pd->dst, pd->src, th->th_dport,
 				    th->th_sport, ntohl(th->th_ack), 0,
 				    TH_RST, 0, 0,
 				    (*state)->rule.ptr->return_ttl, 1, 0);
 			/*
 			 * Reset the sender's tracking; with seqlo == 0 its
 			 * next packet takes the "first packet from this end"
 			 * path at the top of this function.
 			 */
 			src->seqlo = 0;
 			src->seqhi = 1;
 			src->max_win = 1;
 		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: BAD state: ");
 			pf_print_state(*state);
 			pf_print_flags(th->th_flags);
 			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
 			    "pkts=%llu:%llu dir=%s,%s\n",
 			    seq, orig_seq, ack, pd->p_len, ackskew,
 			    (unsigned long long)(*state)->packets[0],
 			    (unsigned long long)(*state)->packets[1],
 			    pd->dir == PF_IN ? "in" : "out",
 			    pd->dir == (*state)->direction ? "fwd" : "rev");
 			/*
 			 * Print a digit for every window condition that
 			 * failed: 1-4 are the strict checks, 5-6 the loose
 			 * ones.
 			 */
 			printf("pf: State failure on: %c %c %c %c | %c %c\n",
 			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
 			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
 			    ' ': '2',
 			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
 			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
 			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
 			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
 		}
 		REASON_SET(reason, PFRES_BADSTATE);
 		return (PF_DROP);
 	}
 
 	return (PF_PASS);
 }
 
 /*
  * Sloppy TCP state tracking: advance the per-peer TCP states from the
  * packet's flags alone, without any sequence-window validation, then refresh
  * the state's expiry time and timeout class.
  *
  * state  - state being tracked; also handed to pf_src_connlimit().
  * pd     - packet descriptor; pd->hdr.tcp supplies the TCP flags and pd->dir
  *          selects which peer is the sender.
  * reason - out: set to PFRES_SRCLIMIT when pf_src_connlimit() reports a
  *          violation.
  *
  * Returns PF_DROP on a connection-limit violation, PF_PASS otherwise.
  */
 static int
 pf_tcp_track_sloppy(struct pf_kstate **state, struct pf_pdesc *pd, u_short *reason)
 {
 	struct tcphdr		*th = &pd->hdr.tcp;
 	struct pf_state_peer	*src, *dst;
 	u_int8_t		 psrc, pdst;
 	u_int8_t		 lo, hi;
 	const int		 tcpflags = th->th_flags;
 	const int		 fwd = (pd->dir == (*state)->direction);
 
 	/* "src" is the peer that sent this packet, "dst" the other one. */
 	src = fwd ? &(*state)->src : &(*state)->dst;
 	dst = fwd ? &(*state)->dst : &(*state)->src;
 	psrc = fwd ? PF_PEER_SRC : PF_PEER_DST;
 	pdst = fwd ? PF_PEER_DST : PF_PEER_SRC;
 
 	/* SYN and FIN only ever move the sender's state forward. */
 	if ((tcpflags & TH_SYN) && src->state < TCPS_SYN_SENT)
 		pf_set_protostate(*state, psrc, TCPS_SYN_SENT);
 	if ((tcpflags & TH_FIN) && src->state < TCPS_CLOSING)
 		pf_set_protostate(*state, psrc, TCPS_CLOSING);
 
 	if (tcpflags & TH_ACK) {
 		if (dst->state == TCPS_SYN_SENT) {
 			/* The other peer's SYN is being acknowledged. */
 			pf_set_protostate(*state, pdst, TCPS_ESTABLISHED);
 			if (src->state == TCPS_ESTABLISHED &&
 			    (*state)->src_node != NULL &&
 			    pf_src_connlimit(state))
 				goto srclimit;
 		} else if (dst->state == TCPS_CLOSING) {
 			/* The other peer's FIN is being acknowledged. */
 			pf_set_protostate(*state, pdst, TCPS_FIN_WAIT_2);
 		} else if (src->state == TCPS_SYN_SENT &&
 		    dst->state < TCPS_SYN_SENT) {
 			/*
 			 * One-sided view of the connection: an ACK follows
 			 * the initial SYN although no packet from the
 			 * destination was ever seen.  Treat the connection
 			 * as established.
 			 */
 			pf_set_protostate(*state, PF_PEER_BOTH,
 			    TCPS_ESTABLISHED);
 			dst->state = src->state = TCPS_ESTABLISHED;
 			if ((*state)->src_node != NULL &&
 			    pf_src_connlimit(state))
 				goto srclimit;
 		} else if (src->state == TCPS_CLOSING &&
 		    dst->state == TCPS_ESTABLISHED &&
 		    dst->seqlo == 0) {
 			/*
 			 * Closing of a half-seen connection, where the full
 			 * bidirectional FIN/ACK+ACK exchange is not visible.
 			 */
 			pf_set_protostate(*state, pdst, TCPS_CLOSING);
 		}
 	}
 
 	/* A reset moves both peers to TIME_WAIT. */
 	if (tcpflags & TH_RST)
 		pf_set_protostate(*state, PF_PEER_BOTH, TCPS_TIME_WAIT);
 
 	/* Refresh the expiry time. */
 	(*state)->expire = time_uptime;
 
 	/*
 	 * Pick the timeout class from the less and the more advanced of the
 	 * two peer states: "both >= X" is lo >= X, "either < X" is lo < X and
 	 * "either >= X" is hi >= X.
 	 */
 	lo = (src->state < dst->state) ? src->state : dst->state;
 	hi = (src->state < dst->state) ? dst->state : src->state;
 	if (lo >= TCPS_FIN_WAIT_2)
 		(*state)->timeout = PFTM_TCP_CLOSED;
 	else if (lo >= TCPS_CLOSING)
 		(*state)->timeout = PFTM_TCP_FIN_WAIT;
 	else if (lo < TCPS_ESTABLISHED)
 		(*state)->timeout = PFTM_TCP_OPENING;
 	else if (hi >= TCPS_CLOSING)
 		(*state)->timeout = PFTM_TCP_CLOSING;
 	else
 		(*state)->timeout = PFTM_TCP_ESTABLISHED;
 
 	return (PF_PASS);
 
 srclimit:
 	/* Source-node connection limit exceeded. */
 	REASON_SET(reason, PFRES_SRCLIMIT);
 	return (PF_DROP);
 }
 
 /*
  * Drive the SYN proxy handshake for a state whose src.state is
  * PF_TCPS_PROXY_SRC or PF_TCPS_PROXY_DST.
  *
  * pd     - packet descriptor; pd->hdr.tcp is the TCP header examined.
  * state  - state being proxied; also handed to pf_src_connlimit().
  * reason - out: set to PFRES_SYNPROXY or PFRES_SRCLIMIT on every non-PASS
  *          return.
  *
  * Returns PF_PASS when the state is in neither proxy phase (nothing is done
  * here), PF_SYNPROXY_DROP when the packet was handled by the proxy logic, and
  * PF_DROP when the packet does not fit the expected handshake step or the
  * source-node connection limit was hit.
  */
 static int
 pf_synproxy(struct pf_pdesc *pd, struct pf_kstate **state, u_short *reason)
 {
 	struct pf_state_key	*sk = (*state)->key[pd->didx];
 	struct tcphdr		*th = &pd->hdr.tcp;
 
 	/* Phase 1: handshake with the peer that opened the state. */
 	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
 		/* Only packets travelling in the state's direction are expected. */
 		if (pd->dir != (*state)->direction) {
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		}
 		if (th->th_flags & TH_SYN) {
 			/* A SYN must carry the sequence number recorded in src.seqlo. */
 			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
 				REASON_SET(reason, PFRES_SYNPROXY);
 				return (PF_DROP);
 			}
 			/*
 			 * Answer with a SYN|ACK; addresses and ports are
 			 * passed swapped relative to the packet
 			 * (pd->dst/th_dport first).
 			 */
 			pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
 			    pd->src, th->th_dport, th->th_sport,
 			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
 			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0);
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		} else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK ||
 		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
 		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
 			/* Not the plain ACK that completes our SYN|ACK. */
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_DROP);
 		} else if ((*state)->src_node != NULL &&
 		    pf_src_connlimit(state)) {
 			REASON_SET(reason, PFRES_SRCLIMIT);
 			return (PF_DROP);
 		} else
 			/*
 			 * Handshake with the initiator is complete; the same
 			 * packet is now processed by the PF_TCPS_PROXY_DST
 			 * block below.
 			 */
 			pf_set_protostate(*state, PF_PEER_SRC,
 			    PF_TCPS_PROXY_DST);
 	}
 	/* Phase 2: handshake with the other peer. */
 	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
 		if (pd->dir == (*state)->direction) {
 			/* Expect the initiator's ACK (no SYN) with matching numbers. */
 			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
 			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
 			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
 				REASON_SET(reason, PFRES_SYNPROXY);
 				return (PF_DROP);
 			}
 			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
 			/* dst.seqhi == 1 means no initial sequence number chosen yet. */
 			if ((*state)->dst.seqhi == 1)
 				(*state)->dst.seqhi = htonl(arc4random());
 			/* Send a SYN built from the state key's addresses and ports. */
 			pf_send_tcp((*state)->rule.ptr, pd->af,
 			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
 			    sk->port[pd->sidx], sk->port[pd->didx],
 			    (*state)->dst.seqhi, 0, TH_SYN, 0,
 			    (*state)->src.mss, 0, 0, (*state)->tag);
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
 		    (TH_SYN|TH_ACK)) ||
 		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
 			/* Reverse direction: only a SYN|ACK acking our SYN is valid. */
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_DROP);
 		} else {
 			/* Valid SYN|ACK: record the peer's window and sequence. */
 			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
 			(*state)->dst.seqlo = ntohl(th->th_seq);
 			/* ACK back to the sender of this SYN|ACK ... */
 			pf_send_tcp((*state)->rule.ptr, pd->af, pd->dst,
 			    pd->src, th->th_dport, th->th_sport,
 			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
 			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
 			    (*state)->tag);
 			/* ... and an ACK built from the state key's addresses. */
 			pf_send_tcp((*state)->rule.ptr, pd->af,
 			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
 			    sk->port[pd->sidx], sk->port[pd->didx],
 			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
 			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0);
 			/*
 			 * Record the offsets between the sequence numbers
 			 * used on the two sides, then set up the tracking
 			 * windows for the established connection.
 			 */
 			(*state)->src.seqdiff = (*state)->dst.seqhi -
 			    (*state)->src.seqlo;
 			(*state)->dst.seqdiff = (*state)->src.seqhi -
 			    (*state)->dst.seqlo;
 			(*state)->src.seqhi = (*state)->src.seqlo +
 			    (*state)->dst.max_win;
 			(*state)->dst.seqhi = (*state)->dst.seqlo +
 			    (*state)->src.max_win;
 			(*state)->src.wscale = (*state)->dst.wscale = 0;
 			pf_set_protostate(*state, PF_PEER_BOTH,
 			    TCPS_ESTABLISHED);
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		}
 	}
 
 	return (PF_PASS);
 }
 
 /*
  * Look up the state for a TCP packet and run it through SYN proxy handling,
  * TCP state tracking and address/port translation.
  *
  * state     - out: the state found by STATE_LOOKUP(); reset to NULL when
  *             the state is unlinked for reuse.
  * direction - PF_IN or PF_OUT; decides how the lookup key is laid out.
  * kif       - interface used for the state lookup.
  * m, off    - mbuf and offset of the TCP header inside it.
  * h         - not referenced in the body.
  * pd        - packet descriptor; pd->hdr.tcp is the header that is examined
  *             and possibly rewritten.
  * reason    - out: drop reason, filled in by the helpers called from here.
  *
  * Returns PF_PASS, PF_DROP, or whatever non-PASS value pf_synproxy()
  * returned.
  */
 static int
 pf_test_state_tcp(struct pf_kstate **state, int direction, struct pfi_kkif *kif,
     struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
     u_short *reason)
 {
 	struct tcphdr		*th = &pd->hdr.tcp;
 	struct pf_state_key_cmp	 key;
 	struct pf_state_key	*nk;
 	struct pf_state_peer	*src, *dst;
 	/* Wire side (PF_IN): source in slot 0.  Stack side: reversed. */
 	const int		 ks = (direction == PF_IN) ? 0 : 1;
 	const int		 kd = 1 - ks;
 	int			 copyback = 0;
 	int			 reuse = 0;
 	int			 action;
 
 	/* Build the lookup key. */
 	bzero(&key, sizeof(key));
 	key.af = pd->af;
 	key.proto = IPPROTO_TCP;
 	PF_ACPY(&key.addr[ks], pd->src, key.af);
 	PF_ACPY(&key.addr[kd], pd->dst, key.af);
 	key.port[ks] = th->th_sport;
 	key.port[kd] = th->th_dport;
 
 	STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 	/* "src" is the peer sending in this direction, "dst" the other. */
 	src = (direction == (*state)->direction) ?
 	    &(*state)->src : &(*state)->dst;
 	dst = (direction == (*state)->direction) ?
 	    &(*state)->dst : &(*state)->src;
 
 	action = pf_synproxy(pd, state, reason);
 	if (action != PF_PASS)
 		return (action);
 
 	/*
 	 * Both peers are at or past FIN_WAIT_2: a fresh SYN, or an inbound
 	 * plain ACK accepted by pf_syncookie_check(), retires the old state.
 	 */
 	if (dst->state >= TCPS_FIN_WAIT_2 && src->state >= TCPS_FIN_WAIT_2) {
 		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN)
 			reuse = 1;
 		else
 			reuse = ((th->th_flags & (TH_SYN|TH_ACK|TH_RST)) ==
 			    TH_ACK && pf_syncookie_check(pd) &&
 			    pd->dir == PF_IN);
 	}
 	if (reuse) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: state reuse ");
 			pf_print_state(*state);
 			pf_print_flags(th->th_flags);
 			printf("\n");
 		}
 		/* XXX make sure it's the same direction ?? */
 		pf_set_protostate(*state, PF_PEER_BOTH, TCPS_CLOSED);
 		pf_unlink_state(*state);
 		*state = NULL;
 		return (PF_DROP);
 	}
 
 	/* Sloppy states skip the sequence-window checks. */
 	if ((*state)->state_flags & PFSTATE_SLOPPY)
 		action = pf_tcp_track_sloppy(state, pd, reason);
 	else
 		action = pf_tcp_track_full(state, kif, m, off, pd, reason,
 		    &copyback);
 	if (action == PF_DROP)
 		return (PF_DROP);
 
 	/* Translate source/destination when wire and stack keys differ. */
 	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 		nk = (*state)->key[pd->didx];
 
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
 		    nk->port[pd->sidx] != th->th_sport)
 			pf_change_ap(m, pd->src, &th->th_sport,
 			    pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx],
 			    nk->port[pd->sidx], 0, pd->af);
 
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
 		    nk->port[pd->didx] != th->th_dport)
 			pf_change_ap(m, pd->dst, &th->th_dport,
 			    pd->ip_sum, &th->th_sum, &nk->addr[pd->didx],
 			    nk->port[pd->didx], 0, pd->af);
 		copyback = 1;
 	}
 
 	/* Write the (possibly modified) TCP header back into the mbuf. */
 	if (copyback)
 		m_copyback(m, off, sizeof(*th), (caddr_t)th);
 
 	return (PF_PASS);
 }
 
 static int
 pf_test_state_udp(struct pf_kstate **state, int direction, struct pfi_kkif *kif,
     struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
 {
 	struct pf_state_peer	*src, *dst;
 	struct pf_state_key_cmp	 key;
 	struct udphdr		*uh = &pd->hdr.udp;
 	uint8_t			 psrc, pdst;
 
 	bzero(&key, sizeof(key));
 	key.af = pd->af;
 	key.proto = IPPROTO_UDP;
 	if (direction == PF_IN)	{	/* wire side, straight */
 		PF_ACPY(&key.addr[0], pd->src, key.af);
 		PF_ACPY(&key.addr[1], pd->dst, key.af);
 		key.port[0] = uh->uh_sport;
 		key.port[1] = uh->uh_dport;
 	} else {			/* stack side, reverse */
 		PF_ACPY(&key.addr[1], pd->src, key.af);
 		PF_ACPY(&key.addr[0], pd->dst, key.af);
 		key.port[1] = uh->uh_sport;
 		key.port[0] = uh->uh_dport;
 	}
 
 	STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 	if (direction == (*state)->direction) {
 		src = &(*state)->src;
 		dst = &(*state)->dst;
 		psrc = PF_PEER_SRC;
 		pdst = PF_PEER_DST;
 	} else {
 		src = &(*state)->dst;
 		dst = &(*state)->src;
 		psrc = PF_PEER_DST;
 		pdst = PF_PEER_SRC;
 	}
 
 	/* update states */
 	if (src->state < PFUDPS_SINGLE)
 		pf_set_protostate(*state, psrc, PFUDPS_SINGLE);
 	if (dst->state == PFUDPS_SINGLE)
 		pf_set_protostate(*state, pdst, PFUDPS_MULTIPLE);
 
 	/* update expire time */
 	(*state)->expire = time_uptime;
 	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
 		(*state)->timeout = PFTM_UDP_MULTIPLE;
 	else
 		(*state)->timeout = PFTM_UDP_SINGLE;
 
 	/* translate source/destination address, if necessary */
 	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 		struct pf_state_key *nk = (*state)->key[pd->didx];
 
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
 		    nk->port[pd->sidx] != uh->uh_sport)
 			pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
 			    &uh->uh_sum, &nk->addr[pd->sidx],
 			    nk->port[pd->sidx], 1, pd->af);
 
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
 		    nk->port[pd->didx] != uh->uh_dport)
 			pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
 			    &uh->uh_sum, &nk->addr[pd->didx],
 			    nk->port[pd->didx], 1, pd->af);
 		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
 	}
 
 	return (PF_PASS);
 }
 
 static int
 pf_test_state_icmp(struct pf_kstate **state, int direction, struct pfi_kkif *kif,
     struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
 {
 	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
 	u_int16_t	 icmpid = 0, *icmpsum;
 	u_int8_t	 icmptype, icmpcode;
 	int		 state_icmp = 0;
 	struct pf_state_key_cmp key;
 
 	bzero(&key, sizeof(key));
 	switch (pd->proto) {
 #ifdef INET
 	case IPPROTO_ICMP:
 		icmptype = pd->hdr.icmp.icmp_type;
 		icmpcode = pd->hdr.icmp.icmp_code;
 		icmpid = pd->hdr.icmp.icmp_id;
 		icmpsum = &pd->hdr.icmp.icmp_cksum;
 
 		if (icmptype == ICMP_UNREACH ||
 		    icmptype == ICMP_SOURCEQUENCH ||
 		    icmptype == ICMP_REDIRECT ||
 		    icmptype == ICMP_TIMXCEED ||
 		    icmptype == ICMP_PARAMPROB)
 			state_icmp++;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 		icmptype = pd->hdr.icmp6.icmp6_type;
 		icmpcode = pd->hdr.icmp6.icmp6_code;
 		icmpid = pd->hdr.icmp6.icmp6_id;
 		icmpsum = &pd->hdr.icmp6.icmp6_cksum;
 
 		if (icmptype == ICMP6_DST_UNREACH ||
 		    icmptype == ICMP6_PACKET_TOO_BIG ||
 		    icmptype == ICMP6_TIME_EXCEEDED ||
 		    icmptype == ICMP6_PARAM_PROB)
 			state_icmp++;
 		break;
 #endif /* INET6 */
 	}
 
 	if (!state_icmp) {
 		/*
 		 * ICMP query/reply message not related to a TCP/UDP packet.
 		 * Search for an ICMP state.
 		 */
 		key.af = pd->af;
 		key.proto = pd->proto;
 		key.port[0] = key.port[1] = icmpid;
 		if (direction == PF_IN)	{	/* wire side, straight */
 			PF_ACPY(&key.addr[0], pd->src, key.af);
 			PF_ACPY(&key.addr[1], pd->dst, key.af);
 		} else {			/* stack side, reverse */
 			PF_ACPY(&key.addr[1], pd->src, key.af);
 			PF_ACPY(&key.addr[0], pd->dst, key.af);
 		}
 
 		STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 		(*state)->expire = time_uptime;
 		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
 
 		/* translate source/destination address, if necessary */
 		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 			struct pf_state_key *nk = (*state)->key[pd->didx];
 
 			switch (pd->af) {
 #ifdef INET
 			case AF_INET:
 				if (PF_ANEQ(pd->src,
 				    &nk->addr[pd->sidx], AF_INET))
 					pf_change_a(&saddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->sidx].v4.s_addr, 0);
 
 				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
 				    AF_INET))
 					pf_change_a(&daddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->didx].v4.s_addr, 0);
 
 				if (nk->port[0] !=
 				    pd->hdr.icmp.icmp_id) {
 					pd->hdr.icmp.icmp_cksum =
 					    pf_cksum_fixup(
 					    pd->hdr.icmp.icmp_cksum, icmpid,
 					    nk->port[pd->sidx], 0);
 					pd->hdr.icmp.icmp_id =
 					    nk->port[pd->sidx];
 				}
 
 				m_copyback(m, off, ICMP_MINLEN,
 				    (caddr_t )&pd->hdr.icmp);
 				break;
 #endif /* INET */
 #ifdef INET6
 			case AF_INET6:
 				if (PF_ANEQ(pd->src,
 				    &nk->addr[pd->sidx], AF_INET6))
 					pf_change_a6(saddr,
 					    &pd->hdr.icmp6.icmp6_cksum,
 					    &nk->addr[pd->sidx], 0);
 
 				if (PF_ANEQ(pd->dst,
 				    &nk->addr[pd->didx], AF_INET6))
 					pf_change_a6(daddr,
 					    &pd->hdr.icmp6.icmp6_cksum,
 					    &nk->addr[pd->didx], 0);
 
 				m_copyback(m, off, sizeof(struct icmp6_hdr),
 				    (caddr_t )&pd->hdr.icmp6);
 				break;
 #endif /* INET6 */
 			}
 		}
 		return (PF_PASS);
 
 	} else {
 		/*
 		 * ICMP error message in response to a TCP/UDP packet.
 		 * Extract the inner TCP/UDP header and search for that state.
 		 */
 
 		struct pf_pdesc	pd2;
 		bzero(&pd2, sizeof pd2);
 #ifdef INET
 		struct ip	h2;
 #endif /* INET */
 #ifdef INET6
 		struct ip6_hdr	h2_6;
 		int		terminal = 0;
 #endif /* INET6 */
 		int		ipoff2 = 0;
 		int		off2 = 0;
 
 		pd2.af = pd->af;
 		/* Payload packet is from the opposite direction. */
 		pd2.sidx = (direction == PF_IN) ? 1 : 0;
 		pd2.didx = (direction == PF_IN) ? 0 : 1;
 		switch (pd->af) {
 #ifdef INET
 		case AF_INET:
 			/* offset of h2 in mbuf chain */
 			ipoff2 = off + ICMP_MINLEN;
 
 			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(ip)\n"));
 				return (PF_DROP);
 			}
 			/*
 			 * ICMP error messages don't refer to non-first
 			 * fragments
 			 */
 			if (h2.ip_off & htons(IP_OFFMASK)) {
 				REASON_SET(reason, PFRES_FRAG);
 				return (PF_DROP);
 			}
 
 			/* offset of protocol header that follows h2 */
 			off2 = ipoff2 + (h2.ip_hl << 2);
 
 			pd2.proto = h2.ip_p;
 			pd2.src = (struct pf_addr *)&h2.ip_src;
 			pd2.dst = (struct pf_addr *)&h2.ip_dst;
 			pd2.ip_sum = &h2.ip_sum;
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			ipoff2 = off + sizeof(struct icmp6_hdr);
 
 			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(ip6)\n"));
 				return (PF_DROP);
 			}
 			pd2.proto = h2_6.ip6_nxt;
 			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
 			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
 			pd2.ip_sum = NULL;
 			off2 = ipoff2 + sizeof(h2_6);
 			do {
 				switch (pd2.proto) {
 				case IPPROTO_FRAGMENT:
 					/*
 					 * ICMPv6 error messages for
 					 * non-first fragments
 					 */
 					REASON_SET(reason, PFRES_FRAG);
 					return (PF_DROP);
 				case IPPROTO_AH:
 				case IPPROTO_HOPOPTS:
 				case IPPROTO_ROUTING:
 				case IPPROTO_DSTOPTS: {
 					/* get next header and header length */
 					struct ip6_ext opt6;
 
 					if (!pf_pull_hdr(m, off2, &opt6,
 					    sizeof(opt6), NULL, reason,
 					    pd2.af)) {
 						DPFPRINTF(PF_DEBUG_MISC,
 						    ("pf: ICMPv6 short opt\n"));
 						return (PF_DROP);
 					}
 					if (pd2.proto == IPPROTO_AH)
 						off2 += (opt6.ip6e_len + 2) * 4;
 					else
 						off2 += (opt6.ip6e_len + 1) * 8;
 					pd2.proto = opt6.ip6e_nxt;
 					/* goto the next header */
 					break;
 				}
 				default:
 					terminal++;
 					break;
 				}
 			} while (!terminal);
 			break;
 #endif /* INET6 */
 		}
 
 		if (PF_ANEQ(pd->dst, pd2.src, pd->af)) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pf: BAD ICMP %d:%d outer dst: ",
 				    icmptype, icmpcode);
 				pf_print_host(pd->src, 0, pd->af);
 				printf(" -> ");
 				pf_print_host(pd->dst, 0, pd->af);
 				printf(" inner src: ");
 				pf_print_host(pd2.src, 0, pd2.af);
 				printf(" -> ");
 				pf_print_host(pd2.dst, 0, pd2.af);
 				printf("\n");
 			}
 			REASON_SET(reason, PFRES_BADSTATE);
 			return (PF_DROP);
 		}
 
 		switch (pd2.proto) {
 		case IPPROTO_TCP: {
 			struct tcphdr		 th;
 			u_int32_t		 seq;
 			struct pf_state_peer	*src, *dst;
 			u_int8_t		 dws;
 			int			 copyback = 0;
 
 			/*
 			 * Only the first 8 bytes of the TCP header can be
 			 * expected. Don't access any TCP header fields after
 			 * th_seq, an ackskew test is not possible.
 			 */
 			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
 			    pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(tcp)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_TCP;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[pd2.sidx] = th.th_sport;
 			key.port[pd2.didx] = th.th_dport;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			if (direction == (*state)->direction) {
 				src = &(*state)->dst;
 				dst = &(*state)->src;
 			} else {
 				src = &(*state)->src;
 				dst = &(*state)->dst;
 			}
 
 			if (src->wscale && dst->wscale)
 				dws = dst->wscale & PF_WSCALE_MASK;
 			else
 				dws = 0;
 
 			/* Demodulate sequence number */
 			seq = ntohl(th.th_seq) - src->seqdiff;
 			if (src->seqdiff) {
 				pf_change_a(&th.th_seq, icmpsum,
 				    htonl(seq), 0);
 				copyback = 1;
 			}
 
 			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
 			    (!SEQ_GEQ(src->seqhi, seq) ||
 			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
 				if (V_pf_status.debug >= PF_DEBUG_MISC) {
 					printf("pf: BAD ICMP %d:%d ",
 					    icmptype, icmpcode);
 					pf_print_host(pd->src, 0, pd->af);
 					printf(" -> ");
 					pf_print_host(pd->dst, 0, pd->af);
 					printf(" state: ");
 					pf_print_state(*state);
 					printf(" seq=%u\n", seq);
 				}
 				REASON_SET(reason, PFRES_BADSTATE);
 				return (PF_DROP);
 			} else {
 				if (V_pf_status.debug >= PF_DEBUG_MISC) {
 					printf("pf: OK ICMP %d:%d ",
 					    icmptype, icmpcode);
 					pf_print_host(pd->src, 0, pd->af);
 					printf(" -> ");
 					pf_print_host(pd->dst, 0, pd->af);
 					printf(" state: ");
 					pf_print_state(*state);
 					printf(" seq=%u\n", seq);
 				}
 			}
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != th.th_sport)
 					pf_change_icmp(pd2.src, &th.th_sport,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != th.th_dport)
 					pf_change_icmp(pd2.dst, &th.th_dport,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 				copyback = 1;
 			}
 
 			if (copyback) {
 				switch (pd2.af) {
 #ifdef INET
 				case AF_INET:
 					m_copyback(m, off, ICMP_MINLEN,
 					    (caddr_t )&pd->hdr.icmp);
 					m_copyback(m, ipoff2, sizeof(h2),
 					    (caddr_t )&h2);
 					break;
 #endif /* INET */
 #ifdef INET6
 				case AF_INET6:
 					m_copyback(m, off,
 					    sizeof(struct icmp6_hdr),
 					    (caddr_t )&pd->hdr.icmp6);
 					m_copyback(m, ipoff2, sizeof(h2_6),
 					    (caddr_t )&h2_6);
 					break;
 #endif /* INET6 */
 				}
 				m_copyback(m, off2, 8, (caddr_t)&th);
 			}
 
 			return (PF_PASS);
 			break;
 		}
 		case IPPROTO_UDP: {
 			struct udphdr		uh;
 
 			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(udp)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_UDP;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[pd2.sidx] = uh.uh_sport;
 			key.port[pd2.didx] = uh.uh_dport;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != uh.uh_sport)
 					pf_change_icmp(pd2.src, &uh.uh_sport,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], &uh.uh_sum,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 1, pd2.af);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != uh.uh_dport)
 					pf_change_icmp(pd2.dst, &uh.uh_dport,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], &uh.uh_sum,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 1, pd2.af);
 
 				switch (pd2.af) {
 #ifdef INET
 				case AF_INET:
 					m_copyback(m, off, ICMP_MINLEN,
 					    (caddr_t )&pd->hdr.icmp);
 					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
 					break;
 #endif /* INET */
 #ifdef INET6
 				case AF_INET6:
 					m_copyback(m, off,
 					    sizeof(struct icmp6_hdr),
 					    (caddr_t )&pd->hdr.icmp6);
 					m_copyback(m, ipoff2, sizeof(h2_6),
 					    (caddr_t )&h2_6);
 					break;
 #endif /* INET6 */
 				}
 				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
 			}
 			return (PF_PASS);
 			break;
 		}
 #ifdef INET
 		case IPPROTO_ICMP: {
 			struct icmp		iih;
 
 			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short i"
 				    "(icmp)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_ICMP;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[0] = key.port[1] = iih.icmp_id;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != iih.icmp_id)
 					pf_change_icmp(pd2.src, &iih.icmp_id,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != iih.icmp_id)
 					pf_change_icmp(pd2.dst, &iih.icmp_id,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET);
 
 				m_copyback(m, off, ICMP_MINLEN, (caddr_t)&pd->hdr.icmp);
 				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
 				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
 			}
 			return (PF_PASS);
 			break;
 		}
 #endif /* INET */
 #ifdef INET6
 		case IPPROTO_ICMPV6: {
 			struct icmp6_hdr	iih;
 
 			if (!pf_pull_hdr(m, off2, &iih,
 			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(icmp6)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_ICMPV6;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[0] = key.port[1] = iih.icmp6_id;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != iih.icmp6_id)
 					pf_change_icmp(pd2.src, &iih.icmp6_id,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET6);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != iih.icmp6_id)
 					pf_change_icmp(pd2.dst, &iih.icmp6_id,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET6);
 
 				m_copyback(m, off, sizeof(struct icmp6_hdr),
 				    (caddr_t)&pd->hdr.icmp6);
 				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
 				m_copyback(m, off2, sizeof(struct icmp6_hdr),
 				    (caddr_t)&iih);
 			}
 			return (PF_PASS);
 			break;
 		}
 #endif /* INET6 */
 		default: {
 			key.af = pd2.af;
 			key.proto = pd2.proto;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[0] = key.port[1] = 0;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af))
 					pf_change_icmp(pd2.src, NULL, daddr,
 					    &nk->addr[pd2.sidx], 0, NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af))
 					pf_change_icmp(pd2.dst, NULL, saddr,
 					    &nk->addr[pd2.didx], 0, NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 
 				switch (pd2.af) {
 #ifdef INET
 				case AF_INET:
 					m_copyback(m, off, ICMP_MINLEN,
 					    (caddr_t)&pd->hdr.icmp);
 					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
 					break;
 #endif /* INET */
 #ifdef INET6
 				case AF_INET6:
 					m_copyback(m, off,
 					    sizeof(struct icmp6_hdr),
 					    (caddr_t )&pd->hdr.icmp6);
 					m_copyback(m, ipoff2, sizeof(h2_6),
 					    (caddr_t )&h2_6);
 					break;
 #endif /* INET6 */
 				}
 			}
 			return (PF_PASS);
 			break;
 		}
 		}
 	}
 }
 
 static int
 pf_test_state_other(struct pf_kstate **state, int direction, struct pfi_kkif *kif,
     struct mbuf *m, struct pf_pdesc *pd)
 {
 	struct pf_state_peer	*src, *dst;
 	struct pf_state_key_cmp	 key;
 	uint8_t			 psrc, pdst;
 
 	bzero(&key, sizeof(key));
 	key.af = pd->af;
 	key.proto = pd->proto;
 	if (direction == PF_IN)	{
 		PF_ACPY(&key.addr[0], pd->src, key.af);
 		PF_ACPY(&key.addr[1], pd->dst, key.af);
 		key.port[0] = key.port[1] = 0;
 	} else {
 		PF_ACPY(&key.addr[1], pd->src, key.af);
 		PF_ACPY(&key.addr[0], pd->dst, key.af);
 		key.port[1] = key.port[0] = 0;
 	}
 
 	STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 	if (direction == (*state)->direction) {
 		src = &(*state)->src;
 		dst = &(*state)->dst;
 		psrc = PF_PEER_SRC;
 		pdst = PF_PEER_DST;
 	} else {
 		src = &(*state)->dst;
 		dst = &(*state)->src;
 		psrc = PF_PEER_DST;
 		pdst = PF_PEER_SRC;
 	}
 
 	/* update states */
 	if (src->state < PFOTHERS_SINGLE)
 		pf_set_protostate(*state, psrc, PFOTHERS_SINGLE);
 	if (dst->state == PFOTHERS_SINGLE)
 		pf_set_protostate(*state, pdst, PFOTHERS_MULTIPLE);
 
 	/* update expire time */
 	(*state)->expire = time_uptime;
 	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
 		(*state)->timeout = PFTM_OTHER_MULTIPLE;
 	else
 		(*state)->timeout = PFTM_OTHER_SINGLE;
 
 	/* translate source/destination address, if necessary */
 	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 		struct pf_state_key *nk = (*state)->key[pd->didx];
 
 		KASSERT(nk, ("%s: nk is null", __func__));
 		KASSERT(pd, ("%s: pd is null", __func__));
 		KASSERT(pd->src, ("%s: pd->src is null", __func__));
 		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
 		switch (pd->af) {
 #ifdef INET
 		case AF_INET:
 			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
 				pf_change_a(&pd->src->v4.s_addr,
 				    pd->ip_sum,
 				    nk->addr[pd->sidx].v4.s_addr,
 				    0);
 
 			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
 				pf_change_a(&pd->dst->v4.s_addr,
 				    pd->ip_sum,
 				    nk->addr[pd->didx].v4.s_addr,
 				    0);
 
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
 				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
 
 			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
 				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
 #endif /* INET6 */
 		}
 	}
 	return (PF_PASS);
 }
 
 /*
  * Copy "len" bytes located at offset "off" of the mbuf chain "m" into the
  * caller-supplied buffer "p".  "off" is measured from the start of the
  * chain, and the IPv4/IPv6 header selected by "af" is read from the very
  * front of the chain.
  *
  * Before copying, the request is validated against that header:
  *  - IPv4 with a non-zero fragment offset: nothing is copied; the action
  *    is PF_PASS when the fragment offset is at or beyond "len", otherwise
  *    PF_DROP with reason PFRES_FRAG.
  *  - the range [off, off + len) reaches past the packet header length or
  *    past the length recorded in the IP header: PF_DROP, PFRES_SHORT.
  * The verdict is reported through ACTION_SET()/REASON_SET() on "actionp"
  * and "reasonp".
  *
  * Returns "p" when the bytes were copied, NULL otherwise.
  */
 void *
 pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
     u_short *actionp, u_short *reasonp, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		struct ip	*ip4 = mtod(m, struct ip *);
 		u_int16_t	 fragoff;
 		int		 end = off + len;
 
 		fragoff = (ntohs(ip4->ip_off) & IP_OFFMASK) << 3;
 		if (fragoff != 0) {
 			/* Not the first fragment: the header is not here. */
 			if (fragoff < len) {
 				ACTION_SET(actionp, PF_DROP);
 				REASON_SET(reasonp, PFRES_FRAG);
 			} else {
 				ACTION_SET(actionp, PF_PASS);
 			}
 			return (NULL);
 		}
 		if (m->m_pkthdr.len < end || ntohs(ip4->ip_len) < end) {
 			ACTION_SET(actionp, PF_DROP);
 			REASON_SET(reasonp, PFRES_SHORT);
 			return (NULL);
 		}
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		struct ip6_hdr	*ip6 = mtod(m, struct ip6_hdr *);
 		int		 end = off + len;
 
 		/* ip6_plen excludes the fixed header, so add it back. */
 		if (m->m_pkthdr.len < end ||
 		    (ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr)) <
 		    (unsigned)end) {
 			ACTION_SET(actionp, PF_DROP);
 			REASON_SET(reasonp, PFRES_SHORT);
 			return (NULL);
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 
 	m_copydata(m, off, len, p);
 	return (p);
 }
 
 /*
  * Ask the FIB whether "addr" passes the unicast reverse-path check
  * (fib4_check_urpf()/fib6_check_urpf()) in routing table "rtableid",
  * optionally restricted to the interface behind "kif".
  *
  * Returns 1 without consulting the FIB for IPv6 addresses with an embedded
  * scope and for packets on IFT_ENC interfaces, 0 for address families other
  * than AF_INET/AF_INET6 (or ones compiled out), and the result of the FIB
  * check otherwise.
  */
 int
 pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kkif *kif,
     int rtableid)
 {
 	struct ifnet		*ifp = NULL;
 
 	if (af != AF_INET && af != AF_INET6)
 		return (0);
 
 	/*
 	 * Skip check for addresses with embedded interface scope,
 	 * as they would always match anyway.
 	 */
 	if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
 		return (1);
 
 	if (kif != NULL) {
 		ifp = kif->pfik_ifp;
 
 		/* Skip checks for ipsec interfaces */
 		if (ifp->if_type == IFT_ENC)
 			return (1);
 	}
 
 #ifdef INET6
 	if (af == AF_INET6)
 		return (fib6_check_urpf(rtableid, &addr->v6, 0, NHR_NONE,
 		    ifp));
 #endif
 #ifdef INET
 	if (af == AF_INET)
 		return (fib4_check_urpf(rtableid, addr->v4, 0, NHR_NONE,
 		    ifp));
 #endif
 
 	return (0);
 }
 
 #ifdef INET
 /*
  * Send an IPv4 packet according to the routing option of rule "r" (r->rt),
  * handing it directly to the chosen interface's if_output routine.
  *
  * m:    in/out packet; cleared (*m = NULL) on the paths that consume it.
  * r:    rule supplying r->rt, r->direction and the address pool (r->rpool).
  * dir:  PF_IN or PF_OUT (asserted).
  * oifp: interface the packet is currently being processed on.
  * s:    matching state or NULL; when non-NULL it is released with
  *       PF_STATE_UNLOCK() on every path through this function.
  * pd:   packet descriptor (mtag, NAT rule, dummynet settings).
  * inp:  passed through to pf_test().
  *
  * For PF_DUPTO the original packet stays in *m and a copy made with
  * m_dup() is sent instead.  For the other routing options the original is
  * sent, or freed on error, and *m is cleared at "done".
  */
 static void
 pf_route(struct mbuf **m, struct pf_krule *r, int dir, struct ifnet *oifp,
     struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
 {
 	struct mbuf		*m0, *m1, *md;
 	struct sockaddr_in	dst;
 	struct ip		*ip;
 	struct ifnet		*ifp = NULL;
 	struct pf_addr		 naddr;
 	struct pf_ksrc_node	*sn = NULL;
 	int			 error = 0;
 	uint16_t		 ip_len, ip_off;
 
 	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
 	    __func__));
 
 	/*
 	 * An mtag is required to count how often this packet has been
 	 * routed.  Drop the packet if no tag can be obtained or if the
 	 * counter already exceeds 3 (presumably a routing-loop guard).
 	 */
 	if ((pd->pf_mtag == NULL &&
 	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
 	    pd->pf_mtag->routed++ > 3) {
 		m0 = *m;
 		*m = NULL;
 		goto bad_locked;
 	}
 
 	if (r->rt == PF_DUPTO) {
 		if ((pd->pf_mtag->flags & PF_DUPLICATED)) {
 			/*
 			 * This packet was already duplicated once.  Work out
 			 * the target interface and either leave the packet
 			 * alone (it is on that interface) or drop it.
 			 */
 			if (s == NULL) {
 				ifp = r->rpool.cur->kif ?
 				    r->rpool.cur->kif->pfik_ifp : NULL;
 			} else {
 				ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
 				/* If pfsync'd */
 				if (ifp == NULL)
 					ifp = r->rpool.cur->kif ?
 					    r->rpool.cur->kif->pfik_ifp : NULL;
 				PF_STATE_UNLOCK(s);
 			}
 			if (ifp == oifp) {
 				/* When the 2nd interface is not skipped */
 				return;
 			} else {
 				m0 = *m;
 				*m = NULL;
 				goto bad;
 			}
 		} else {
 			/* First pass: mark the original and route a copy. */
 			pd->pf_mtag->flags |= PF_DUPLICATED;
 			if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) {
 				if (s)
 					PF_STATE_UNLOCK(s);
 				return;
 			}
 		}
 	} else {
 		/*
 		 * The routing option does not apply to this direction; only
 		 * run the packet through dummynet and leave it in *m.
 		 */
 		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
 			pf_dummynet(pd, dir, s, r, m);
 			if (s)
 				PF_STATE_UNLOCK(s);
 			return;
 		}
 		m0 = *m;
 	}
 
 	ip = mtod(m0, struct ip *);
 
 	/* Default next hop is the packet's own destination address. */
 	bzero(&dst, sizeof(dst));
 	dst.sin_family = AF_INET;
 	dst.sin_len = sizeof(dst);
 	dst.sin_addr = ip->ip_dst;
 
 	bzero(&naddr, sizeof(naddr));
 
 	if (s == NULL) {
 		/* No state: pick next hop and interface from the rule's pool. */
 		if (TAILQ_EMPTY(&r->rpool.list)) {
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
 			goto bad_locked;
 		}
 		pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
 		    &naddr, NULL, &sn);
 		if (!PF_AZERO(&naddr, AF_INET))
 			dst.sin_addr.s_addr = naddr.v4.s_addr;
 		ifp = r->rpool.cur->kif ?
 		    r->rpool.cur->kif->pfik_ifp : NULL;
 	} else {
 		/* Use the next hop and interface recorded in the state. */
 		if (!PF_AZERO(&s->rt_addr, AF_INET))
 			dst.sin_addr.s_addr =
 			    s->rt_addr.v4.s_addr;
 		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
 		PF_STATE_UNLOCK(s);
 	}
 	/* If pfsync'd */
 	if (ifp == NULL)
 		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
 	if (ifp == NULL)
 		goto bad;
 
 	if (dir == PF_IN) {
 		/*
 		 * Filter the packet as outbound on the new interface.
 		 * pf_test() may replace or consume m0, so re-read the header.
 		 */
 		if (pf_test(PF_OUT, 0, ifp, &m0, inp) != PF_PASS)
 			goto bad;
 		else if (m0 == NULL)
 			goto done;
 		if (m0->m_len < sizeof(struct ip)) {
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
 			goto bad;
 		}
 		ip = mtod(m0, struct ip *);
 	}
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		m0->m_flags |= M_SKIP_FIREWALL;
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
 	/* Finish in software any checksum the interface cannot offload. */
 	m0->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= ifp->if_mtu ||
 	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
 
 		/* md is NULL afterwards if dummynet kept or freed the packet. */
 		md = m0;
 		error = pf_dummynet_route(pd, dir, s, r, ifp, sintosa(&dst), &md);
 		if (md != NULL)
 			error = (*ifp->if_output)(ifp, md, sintosa(&dst), NULL);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		KMOD_IPSTAT_INC(ips_cantfrag);
 		if (r->rt != PF_DUPTO) {
 			/* Undo NAT before the packet is quoted in the ICMP error. */
 			if (s && pd->nat_rule != NULL)
 				PACKET_UNDO_NAT(m0, pd,
 				    (ip->ip_hl << 2) + (ip_off & IP_OFFMASK),
 				    s, dir);
 
 			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
 			    ifp->if_mtu);
 			goto done;
 		} else
 			goto bad;
 	}
 
 	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 
 	/*
 	 * Walk the m_nextpkt list of fragments.  After the first send
 	 * error the remaining fragments are freed rather than sent.
 	 */
 	for (; m0; m0 = m1) {
 		m1 = m0->m_nextpkt;
 		m0->m_nextpkt = NULL;
 		if (error == 0) {
 			m_clrprotoflags(m0);
 			md = m0;
 			error = pf_dummynet_route(pd, dir, s, r, ifp,
 			    sintosa(&dst), &md);
 			if (md != NULL)
 				error = (*ifp->if_output)(ifp, md,
 				    sintosa(&dst), NULL);
 		} else
 			m_freem(m0);
 	}
 
 	if (error == 0)
 		KMOD_IPSTAT_INC(ips_fragmented);
 
 done:
 	/* For dup-to the caller keeps the original packet in *m. */
 	if (r->rt != PF_DUPTO)
 		*m = NULL;
 	return;
 
 bad_locked:
 	/* Error path taken while the state (if any) has not been released. */
 	if (s)
 		PF_STATE_UNLOCK(s);
 bad:
 	m_freem(m0);
 	goto done;
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * IPv6 counterpart of pf_route(): send a packet according to the routing
  * option of rule "r" (r->rt) via nd6_output_ifp() on the chosen interface.
  *
  * m:    in/out packet; cleared (*m = NULL) on the paths that consume it.
  * r:    rule supplying r->rt, r->direction and the address pool (r->rpool).
  * dir:  PF_IN or PF_OUT (asserted).
  * oifp: interface the packet is currently being processed on.
  * s:    matching state or NULL; when non-NULL it is released with
  *       PF_STATE_UNLOCK() on every path through this function.
  * pd:   packet descriptor (mtag, NAT rule, dummynet settings).
  * inp:  passed through to pf_test6().
  *
  * No fragmentation is attempted here: a packet larger than the interface
  * MTU is answered with ICMP6_PACKET_TOO_BIG, or freed in the PF_DUPTO case.
  */
 static void
 pf_route6(struct mbuf **m, struct pf_krule *r, int dir, struct ifnet *oifp,
     struct pf_kstate *s, struct pf_pdesc *pd, struct inpcb *inp)
 {
 	struct mbuf		*m0, *md;
 	struct sockaddr_in6	dst;
 	struct ip6_hdr		*ip6;
 	struct ifnet		*ifp = NULL;
 	struct pf_addr		 naddr;
 	struct pf_ksrc_node	*sn = NULL;
 
 	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
 	    __func__));
 
 	/*
 	 * An mtag is required to count how often this packet has been
 	 * routed.  Drop the packet if no tag can be obtained or if the
 	 * counter already exceeds 3 (presumably a routing-loop guard).
 	 */
 	if ((pd->pf_mtag == NULL &&
 	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
 	    pd->pf_mtag->routed++ > 3) {
 		m0 = *m;
 		*m = NULL;
 		goto bad_locked;
 	}
 
 	if (r->rt == PF_DUPTO) {
 		if ((pd->pf_mtag->flags & PF_DUPLICATED)) {
 			/*
 			 * This packet was already duplicated once.  Work out
 			 * the target interface and either leave the packet
 			 * alone (it is on that interface) or drop it.
 			 */
 			if (s == NULL) {
 				ifp = r->rpool.cur->kif ?
 				    r->rpool.cur->kif->pfik_ifp : NULL;
 			} else {
 				ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
 				/* If pfsync'd */
 				if (ifp == NULL)
 					ifp = r->rpool.cur->kif ?
 					    r->rpool.cur->kif->pfik_ifp : NULL;
 				PF_STATE_UNLOCK(s);
 			}
 			if (ifp == oifp) {
 				/* When the 2nd interface is not skipped */
 				return;
 			} else {
 				m0 = *m;
 				*m = NULL;
 				goto bad;
 			}
 		} else {
 			/* First pass: mark the original and route a copy. */
 			pd->pf_mtag->flags |= PF_DUPLICATED;
 			if (((m0 = m_dup(*m, M_NOWAIT)) == NULL)) {
 				if (s)
 					PF_STATE_UNLOCK(s);
 				return;
 			}
 		}
 	} else {
 		/*
 		 * The routing option does not apply to this direction; only
 		 * run the packet through dummynet and leave it in *m.
 		 */
 		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
 			pf_dummynet(pd, dir, s, r, m);
 			if (s)
 				PF_STATE_UNLOCK(s);
 			return;
 		}
 		m0 = *m;
 	}
 
 	ip6 = mtod(m0, struct ip6_hdr *);
 
 	/* Default next hop is the packet's own destination address. */
 	bzero(&dst, sizeof(dst));
 	dst.sin6_family = AF_INET6;
 	dst.sin6_len = sizeof(dst);
 	dst.sin6_addr = ip6->ip6_dst;
 
 	bzero(&naddr, sizeof(naddr));
 
 	if (s == NULL) {
 		/* No state: pick next hop and interface from the rule's pool. */
 		if (TAILQ_EMPTY(&r->rpool.list)) {
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
 			goto bad_locked;
 		}
 		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
 		    &naddr, NULL, &sn);
 		if (!PF_AZERO(&naddr, AF_INET6))
 			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
 			    &naddr, AF_INET6);
 		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
 	} else {
 		/* Use the next hop and interface recorded in the state. */
 		if (!PF_AZERO(&s->rt_addr, AF_INET6))
 			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
 			    &s->rt_addr, AF_INET6);
 		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
 	}
 
 	/* From here on the error paths use "bad", not "bad_locked". */
 	if (s)
 		PF_STATE_UNLOCK(s);
 
 	/* If pfsync'd */
 	if (ifp == NULL)
 		ifp = r->rpool.cur->kif ? r->rpool.cur->kif->pfik_ifp : NULL;
 	if (ifp == NULL)
 		goto bad;
 
 	if (dir == PF_IN) {
 		/*
 		 * Filter the packet as outbound on the new interface.
 		 * pf_test6() may replace or consume m0, so re-read the header.
 		 */
 		if (pf_test6(PF_OUT, 0, ifp, &m0, inp) != PF_PASS)
 			goto bad;
 		else if (m0 == NULL)
 			goto done;
 		if (m0->m_len < sizeof(struct ip6_hdr)) {
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
 			    __func__));
 			goto bad;
 		}
 		ip6 = mtod(m0, struct ip6_hdr *);
 	}
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		m0->m_flags |= M_SKIP_FIREWALL;
 
 	/* Finish in software any checksum the interface cannot offload. */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
 	    ~ifp->if_hwassist) {
 		uint32_t plen = m0->m_pkthdr.len - sizeof(*ip6);
 		in6_delayed_cksum(m0, plen, sizeof(struct ip6_hdr));
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 
 	/*
 	 * If the packet is too large for the outgoing interface,
 	 * send back an icmp6 error.
 	 */
 	/* Scope-embedded next hops get the output interface index embedded. */
 	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
 		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
 	if ((u_long)m0->m_pkthdr.len <= ifp->if_mtu) {
 		/* md is NULL afterwards if dummynet kept or freed the packet. */
 		md = m0;
 		pf_dummynet_route(pd, dir, s, r, ifp, sintosa(&dst), &md);
 		if (md != NULL)
 			nd6_output_ifp(ifp, ifp, md, &dst, NULL);
 	}
 	else {
 		in6_ifstat_inc(ifp, ifs6_in_toobig);
 		if (r->rt != PF_DUPTO) {
 			/* Undo NAT before the packet is quoted in the ICMPv6 error. */
 			if (s && pd->nat_rule != NULL)
 				PACKET_UNDO_NAT(m0, pd,
 				    ((caddr_t)ip6 - m0->m_data) +
 				    sizeof(struct ip6_hdr), s, dir);
 
 			icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
 		} else
 			goto bad;
 	}
 
 done:
 	/* For dup-to the caller keeps the original packet in *m. */
 	if (r->rt != PF_DUPTO)
 		*m = NULL;
 	return;
 
 bad_locked:
 	/* Error path taken while the state (if any) has not been released. */
 	if (s)
 		PF_STATE_UNLOCK(s);
 bad:
 	m_freem(m0);
 	goto done;
 }
 #endif /* INET6 */
 
 /*
  * Verify the transport checksum of the "len" bytes at offset "off" of the
  * packet in "m" for protocol "p" and address family "af".
  *
  * Returns 0 when the checksum is good and 1 when it is bad or cannot be
  * checked (offset/length out of range, unsupported protocol or family).
  * A bad checksum also bumps the matching per-protocol error counter.
  *
  * FreeBSD supports cksum offloads for the following drivers.
  *  em(4), fxp(4), lge(4), nge(4), re(4), ti(4), txp(4), xl(4)
  *
  * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
  *  the network driver computed the cksum including the pseudo header;
  *  csum_data only needs to be verified.
  * CSUM_DATA_VALID :
  *  the network driver computed a partial cksum in csum_data; the pseudo
  *  header cksum still has to be added (i.e. lack of H/W support for the
  *  pseudo header, for instance sk(4) and possibly gem(4)).
  *
  * After validating the cksum of a packet, both CSUM_DATA_VALID and
  * CSUM_PSEUDO_HDR are set in order to avoid recomputation of the cksum in
  * the upper TCP/UDP layer.
  * Also, csum_data is set to 0xffff to force cksum validation.
  */
 static int
 pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
 {
 	struct ip	*ip;
 	u_int16_t	 sum = 0;
 	int		 hw_assist = 0;
 
 	if (off < sizeof(struct ip) || len < sizeof(struct udphdr) ||
 	    m->m_pkthdr.len < off + len)
 		return (1);
 
 	switch (p) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
 		/* TCP and UDP treat a hardware-provided checksum alike. */
 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) == 0)
 			break;
 		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
 			sum = m->m_pkthdr.csum_data;
 		} else {
 			/* Fold the pseudo header into the partial sum. */
 			ip = mtod(m, struct ip *);
 			sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htonl((u_short)len + m->m_pkthdr.csum_data + p));
 		}
 		sum ^= 0xffff;
 		hw_assist = 1;
 		break;
 	case IPPROTO_ICMP:
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 #endif /* INET6 */
 		break;
 	default:
 		return (1);
 	}
 
 	/* No usable hardware result: compute the checksum in software. */
 	if (hw_assist == 0) {
 		switch (af) {
 		case AF_INET:
 			if (p != IPPROTO_ICMP) {
 				if (m->m_len < sizeof(struct ip))
 					return (1);
 				sum = in4_cksum(m, p, off, len);
 				break;
 			}
 			if (m->m_len < off)
 				return (1);
 			/*
 			 * ICMP has no pseudo header: step the first mbuf
 			 * past the IP header, sum, then restore it.
 			 */
 			m->m_data += off;
 			m->m_len -= off;
 			sum = in_cksum(m, len);
 			m->m_data -= off;
 			m->m_len += off;
 			break;
 #ifdef INET6
 		case AF_INET6:
 			if (m->m_len < sizeof(struct ip6_hdr))
 				return (1);
 			sum = in6_cksum(m, p, off, len);
 			break;
 #endif /* INET6 */
 		default:
 			return (1);
 		}
 	}
 
 	if (sum == 0) {
 		/* Good checksum: record that so upper layers skip the work. */
 		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
 			m->m_pkthdr.csum_flags |=
 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 		return (0);
 	}
 
 	/* Bad checksum: account for it per protocol. */
 	switch (p) {
 	case IPPROTO_TCP: {
 		KMOD_TCPSTAT_INC(tcps_rcvbadsum);
 		break;
 	}
 	case IPPROTO_UDP: {
 		KMOD_UDPSTAT_INC(udps_badsum);
 		break;
 	}
 #ifdef INET
 	case IPPROTO_ICMP: {
 		KMOD_ICMPSTAT_INC(icps_checksum);
 		break;
 	}
 #endif
 #ifdef INET6
 	case IPPROTO_ICMPV6: {
 		KMOD_ICMP6STAT_INC(icp6s_checksum);
 		break;
 	}
 #endif /* INET6 */
 	}
 	return (1);
 }
 
 /*
  * Fill "dnflow" with the dummynet flow description for the packet
  * described by "pd".
  *
  * The pipe/queue number comes from pd->act: dnpipe when "dir" equals the
  * direction of the rule (or of the state when the rule is PF_INOUT), and
  * dnrpipe for the opposite direction.
  *
  * Returns false, leaving only ports and direction flags set in "dnflow",
  * when the selected number is zero; true otherwise.  Panics on an address
  * family other than AF_INET/AF_INET6.
  */
 static bool
 pf_pdesc_to_dnflow(int dir, const struct pf_pdesc *pd,
     const struct pf_krule *r, const struct pf_kstate *s,
     struct ip_fw_args *dnflow)
 {
 	int dndir;
 
 	dndir = r->direction;
 	if (dndir == PF_INOUT) {
 		/*
 		 * Prefer the state's direction.  Without a state assume the
 		 * primary direction; this happens when we've set dnpipe in
 		 * the ethernet level code.
 		 */
 		dndir = (s != NULL) ? s->direction : dir;
 	}
 
 	memset(dnflow, 0, sizeof(*dnflow));
 
 	if (pd->sport != NULL)
 		dnflow->f_id.src_port = ntohs(*pd->sport);
 	if (pd->dport != NULL)
 		dnflow->f_id.dst_port = ntohs(*pd->dport);
 
 	dnflow->flags |= (dir == PF_IN) ? IPFW_ARGS_IN : IPFW_ARGS_OUT;
 
 	/* Forward direction uses dnpipe, the reverse one dnrpipe. */
 	if (dir == dndir) {
 		if (!pd->act.dnpipe)
 			return (false);
 		dnflow->rule.info = pd->act.dnpipe;
 	} else {
 		if (!pd->act.dnrpipe)
 			return (false);
 		dnflow->rule.info = pd->act.dnrpipe;
 	}
 
 	dnflow->rule.info |= IPFW_IS_DUMMYNET;
 	if ((r->free_flags & PFRULE_DN_IS_PIPE) ||
 	    (pd->act.flags & PFRULE_DN_IS_PIPE))
 		dnflow->rule.info |= IPFW_IS_PIPE;
 
 	dnflow->f_id.proto = pd->proto;
 	dnflow->f_id.extra = dnflow->rule.info;
 
 	switch (pd->af) {
 	case AF_INET:
 		dnflow->f_id.addr_type = 4;
 		dnflow->f_id.src_ip = ntohl(pd->src->v4.s_addr);
 		dnflow->f_id.dst_ip = ntohl(pd->dst->v4.s_addr);
 		break;
 	case AF_INET6:
 		dnflow->flags |= IPFW_ARGS_IP6;
 		dnflow->f_id.addr_type = 6;
 		dnflow->f_id.src_ip6 = pd->src->v6;
 		dnflow->f_id.dst_ip6 = pd->dst->v6;
 		break;
 	default:
 		panic("Invalid AF");
 		break;
 	}
 
 	return (true);
 }
 
 /*
  * Entry point for filtering at the Ethernet layer.
  *
  * dir:    direction, handed on to pf_test_eth_rule().
  * pflags: not used by this function.
  * ifp:    interface the frame is seen on; must belong to the current vnet.
  * m0:     in/out packet, must carry a packet header.
  * inp:    not used by this function.
  *
  * Returns PF_PASS when pf is not running, when the interface is flagged
  * PFI_IFLAG_SKIP or when the mbuf is marked M_SKIP_FIREWALL; PF_DROP when
  * the interface has no pf kif attached; otherwise the verdict of
  * pf_test_eth_rule().  No state is kept at this layer.
  */
 int
 pf_test_eth(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0,
     struct inpcb *inp)
 {
 	struct pfi_kkif		*kif;
 	struct mbuf		*m = *m0;
 
 	M_ASSERTPKTHDR(m);
 	MPASS(ifp->if_vnet == curvnet);
 	NET_EPOCH_ASSERT();
 
 	if (!V_pf_status.running)
 		return (PF_PASS);
 
 	kif = (struct pfi_kkif *)ifp->if_pf_kif;
 
 	if (kif == NULL) {
 		/* Report under this function's own name, not pf_test's. */
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("%s: kif == NULL, if_xname %s\n", __func__,
 		    ifp->if_xname));
 		return (PF_DROP);
 	}
 	if (kif->pfik_flags & PFI_IFLAG_SKIP)
 		return (PF_PASS);
 
 	if (m->m_flags & M_SKIP_FIREWALL)
 		return (PF_PASS);
 
 	/* Stateless! */
 	return (pf_test_eth_rule(dir, kif, m0));
 }
 
 static int
 pf_dummynet(struct pf_pdesc *pd, int dir, struct pf_kstate *s,
     struct pf_krule *r, struct mbuf **m0)
 {
 	return (pf_dummynet_route(pd, dir, s, r, NULL, NULL, m0));
 }
 
 static int
 pf_dummynet_route(struct pf_pdesc *pd, int dir, struct pf_kstate *s,
     struct pf_krule *r, struct ifnet *ifp, struct sockaddr *sa,
     struct mbuf **m0)
 {
 	NET_EPOCH_ASSERT();
 
 	if (s && (s->dnpipe || s->dnrpipe)) {
 		pd->act.dnpipe = s->dnpipe;
 		pd->act.dnrpipe = s->dnrpipe;
 		pd->act.flags = s->state_flags;
 	} else if (r->dnpipe || r->dnrpipe) {
 		pd->act.dnpipe = r->dnpipe;
 		pd->act.dnrpipe = r->dnrpipe;
 		pd->act.flags = r->free_flags;
 	}
 	if (pd->act.dnpipe || pd->act.dnrpipe) {
 		struct ip_fw_args dnflow;
 		if (ip_dn_io_ptr == NULL) {
 			m_freem(*m0);
 			*m0 = NULL;
 			return (ENOMEM);
 		}
 
 		if (pd->pf_mtag == NULL &&
 		    ((pd->pf_mtag = pf_get_mtag(*m0)) == NULL)) {
 			m_freem(*m0);
 			*m0 = NULL;
 			return (ENOMEM);
 		}
 
 		if (ifp != NULL) {
 			pd->pf_mtag->flags |= PF_TAG_ROUTE_TO;
 
 			pd->pf_mtag->if_index = ifp->if_index;
 			pd->pf_mtag->if_idxgen = ifp->if_idxgen;
 
 			MPASS(sa != NULL);
 
 			if (pd->af == AF_INET)
 				memcpy(&pd->pf_mtag->dst, sa,
 				    sizeof(struct sockaddr_in));
 			else
 				memcpy(&pd->pf_mtag->dst, sa,
 				    sizeof(struct sockaddr_in6));
 		}
 
 		if (pf_pdesc_to_dnflow(dir, pd, r, s, &dnflow)) {
 			pd->pf_mtag->flags |= PF_TAG_DUMMYNET;
 			ip_dn_io_ptr(m0, &dnflow);
 			if (*m0 != NULL)
 				pd->pf_mtag->flags &= ~PF_TAG_DUMMYNET;
 		}
 	}
 
 	return (0);
 }
 
 #ifdef INET
 int
 pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
 {
 	struct pfi_kkif		*kif;
 	u_short			 action, reason = 0, log = 0;
 	struct mbuf		*m = *m0;
 	struct ip		*h = NULL;
 	struct m_tag		*ipfwtag;
 	struct pf_krule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
 	struct pf_kstate	*s = NULL;
 	struct pf_kruleset	*ruleset = NULL;
 	struct pf_pdesc		 pd;
 	int			 off, dirndx, pqid = 0;
 
 	PF_RULES_RLOCK_TRACKER;
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
 	M_ASSERTPKTHDR(m);
 
 	if (!V_pf_status.running)
 		return (PF_PASS);
 
 	PF_RULES_RLOCK();
 
 	kif = (struct pfi_kkif *)ifp->if_pf_kif;
 
 	if (__predict_false(kif == NULL)) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
 		PF_RULES_RUNLOCK();
 		return (PF_DROP);
 	}
 	if (kif->pfik_flags & PFI_IFLAG_SKIP) {
 		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	if (m->m_flags & M_SKIP_FIREWALL) {
 		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	memset(&pd, 0, sizeof(pd));
 	pd.pf_mtag = pf_find_mtag(m);
 
 	if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_TAG_ROUTE_TO)) {
 		pd.pf_mtag->flags &= ~PF_TAG_ROUTE_TO;
 
 		ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
 		    pd.pf_mtag->if_idxgen);
 		if (ifp == NULL || ifp->if_flags & IFF_DYING) {
 			PF_RULES_RUNLOCK();
 			m_freem(*m0);
 			*m0 = NULL;
 			return (PF_PASS);
 		}
 		PF_RULES_RUNLOCK();
 		(ifp->if_output)(ifp, m, sintosa(&pd.pf_mtag->dst), NULL);
 		*m0 = NULL;
 		return (PF_PASS);
 	}
 
 	if (pd.pf_mtag && pd.pf_mtag->dnpipe) {
 		pd.act.dnpipe = pd.pf_mtag->dnpipe;
 		pd.act.flags = pd.pf_mtag->dnflags;
 	}
 
 	if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL &&
 	    pd.pf_mtag->flags & PF_TAG_DUMMYNET) {
 		/* Dummynet re-injects packets after they've
 		 * completed their delay. We've already
 		 * processed them, so pass unconditionally. */
 
 		/* But only once. We may see the packet multiple times (e.g.
 		 * PFIL_IN/PFIL_OUT). */
 		pd.pf_mtag->flags &= ~PF_TAG_DUMMYNET;
 		PF_RULES_RUNLOCK();
 
 		return (PF_PASS);
 	}
 
 	if (__predict_false(ip_divert_ptr != NULL) &&
 	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
 		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
 		if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
 			if (pd.pf_mtag == NULL &&
 			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 				action = PF_DROP;
 				goto done;
 			}
 			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
 			m_tag_delete(m, ipfwtag);
 		}
 		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
 			m->m_flags |= M_FASTFWD_OURS;
 			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
 		}
 	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
 		/* We do IP header normalization and packet reassembly here */
 		action = PF_DROP;
 		goto done;
 	}
 	m = *m0;	/* pf_normalize messes with m0 */
 	h = mtod(m, struct ip *);
 
 	off = h->ip_hl << 2;
 	if (off < (int)sizeof(struct ip)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_SHORT);
 		log = 1;
 		goto done;
 	}
 
 	pd.src = (struct pf_addr *)&h->ip_src;
 	pd.dst = (struct pf_addr *)&h->ip_dst;
 	pd.sport = pd.dport = NULL;
 	pd.ip_sum = &h->ip_sum;
 	pd.proto_sum = NULL;
 	pd.proto = h->ip_p;
 	pd.dir = dir;
 	pd.sidx = (dir == PF_IN) ? 0 : 1;
 	pd.didx = (dir == PF_IN) ? 1 : 0;
 	pd.af = AF_INET;
 	pd.tos = h->ip_tos & ~IPTOS_ECN_MASK;
 	pd.tot_len = ntohs(h->ip_len);
 
 	/* handle fragments that didn't get reassembled by normalization */
 	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		action = pf_test_fragment(&r, dir, kif, m, h,
 		    &pd, &a, &ruleset);
 		goto done;
 	}
 
 	switch (h->ip_p) {
 	case IPPROTO_TCP: {
 		if (!pf_pull_hdr(m, off, &pd.hdr.tcp, sizeof(pd.hdr.tcp),
 		    &action, &reason, AF_INET)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		pd.p_len = pd.tot_len - off - (pd.hdr.tcp.th_off << 2);
 
 		pd.sport = &pd.hdr.tcp.th_sport;
 		pd.dport = &pd.hdr.tcp.th_dport;
 
 		/* Respond to SYN with a syncookie. */
 		if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) == TH_SYN &&
 		    pd.dir == PF_IN && pf_synflood_check(&pd)) {
 			pf_syncookie_send(m, off, &pd);
 			action = PF_DROP;
 			break;
 		}
 
 		if ((pd.hdr.tcp.th_flags & TH_ACK) && pd.p_len == 0)
 			pqid = 1;
 		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
 		if (action == PF_DROP)
 			goto done;
 		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
 		    &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL) {
 			/* Validate remote SYN|ACK, re-create original SYN if
 			 * valid. */
 			if ((pd.hdr.tcp.th_flags & (TH_SYN|TH_ACK|TH_RST)) ==
 			    TH_ACK && pf_syncookie_validate(&pd) &&
 			    pd.dir == PF_IN) {
 				struct mbuf *msyn;
 
 				msyn = pf_syncookie_recreate_syn(h->ip_ttl,
 				    off,&pd);
 				if (msyn == NULL) {
 					action = PF_DROP;
 					break;
 				}
 
 				action = pf_test(dir, pflags, ifp, &msyn, inp);
 				m_freem(msyn);
 
 				if (action == PF_PASS) {
 					action = pf_test_state_tcp(&s, dir,
 					    kif, m, off, h, &pd, &reason);
 					if (action != PF_PASS || s == NULL) {
 						action = PF_DROP;
 						break;
 					}
 
 					s->src.seqhi = ntohl(pd.hdr.tcp.th_ack)
 					    - 1;
 					s->src.seqlo = ntohl(pd.hdr.tcp.th_seq)
 					    - 1;
 					pf_set_protostate(s, PF_PEER_SRC,
 					    PF_TCPS_PROXY_DST);
 
 					action = pf_synproxy(&pd, &s, &reason);
 					if (action != PF_PASS)
 						break;
 				}
 				break;
 			}
 			else {
 				action = pf_test_rule(&r, &s, dir, kif, m, off,
 				    &pd, &a, &ruleset, inp);
 			}
 		}
 		break;
 	}
 
 	case IPPROTO_UDP: {
 		if (!pf_pull_hdr(m, off, &pd.hdr.udp, sizeof(pd.hdr.udp),
 		    &action, &reason, AF_INET)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		pd.sport = &pd.hdr.udp.uh_sport;
 		pd.dport = &pd.hdr.udp.uh_dport;
 		if (pd.hdr.udp.uh_dport == 0 ||
 		    ntohs(pd.hdr.udp.uh_ulen) > m->m_pkthdr.len - off ||
 		    ntohs(pd.hdr.udp.uh_ulen) < sizeof(struct udphdr)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_SHORT);
 			goto done;
 		}
 		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_ICMP: {
 		if (!pf_pull_hdr(m, off, &pd.hdr.icmp, ICMP_MINLEN,
 		    &action, &reason, AF_INET)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
 		    &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 #ifdef INET6
 	case IPPROTO_ICMPV6: {
 		action = PF_DROP;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
 		goto done;
 	}
 #endif
 
 	default:
 		action = pf_test_state_other(&s, dir, kif, m, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 done:
 	PF_RULES_RUNLOCK();
 	if (action == PF_PASS && h->ip_hl > 5 &&
 	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_IPOPTIONS);
 		log = r->log;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping packet with ip options\n"));
 	}
 
 	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_MEMORY);
 	}
 	if (r->rtableid >= 0)
 		M_SETFIB(m, r->rtableid);
 
 	if (r->scrub_flags & PFSTATE_SETPRIO) {
 		if (pd.tos & IPTOS_LOWDELAY)
 			pqid = 1;
 		if (vlan_set_pcp(m, r->set_prio[pqid])) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 			log = 1;
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: failed to allocate 802.1q mtag\n"));
 		}
 	}
 
 #ifdef ALTQ
 	if (s && s->qid) {
 		pd.act.pqid = s->pqid;
 		pd.act.qid = s->qid;
 	} else if (r->qid) {
 		pd.act.pqid = r->pqid;
 		pd.act.qid = r->qid;
 	}
 	if (action == PF_PASS && pd.act.qid) {
 		if (pd.pf_mtag == NULL &&
 		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 		} else {
 			if (s != NULL)
 				pd.pf_mtag->qid_hash = pf_state_hash(s);
 			if (pqid || (pd.tos & IPTOS_LOWDELAY))
 				pd.pf_mtag->qid = pd.act.pqid;
 			else
 				pd.pf_mtag->qid = pd.act.qid;
 			/* Add hints for ecn. */
 			pd.pf_mtag->hdr = h;
 		}
 	}
 #endif /* ALTQ */
 
 	/*
 	 * connections redirected to loopback should not match sockets
 	 * bound specifically to loopback due to security implications,
 	 * see tcp_input() and in_pcblookup_listen().
 	 */
 	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
 	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
 	    (s->nat_rule.ptr->action == PF_RDR ||
 	    s->nat_rule.ptr->action == PF_BINAT) &&
 	    IN_LOOPBACK(ntohl(pd.dst->v4.s_addr)))
 		m->m_flags |= M_SKIP_FIREWALL;
 
 	if (__predict_false(ip_divert_ptr != NULL) && action == PF_PASS &&
 	    r->divert.port && !PACKET_LOOPED(&pd)) {
 		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
 		if (ipfwtag != NULL) {
 			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
 			    ntohs(r->divert.port);
 			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
 
 			if (s)
 				PF_STATE_UNLOCK(s);
 
 			m_tag_prepend(m, ipfwtag);
 			if (m->m_flags & M_FASTFWD_OURS) {
 				if (pd.pf_mtag == NULL &&
 				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 					action = PF_DROP;
 					REASON_SET(&reason, PFRES_MEMORY);
 					log = 1;
 					DPFPRINTF(PF_DEBUG_MISC,
 					    ("pf: failed to allocate tag\n"));
 				} else {
 					pd.pf_mtag->flags |=
 					    PF_FASTFWD_OURS_PRESENT;
 					m->m_flags &= ~M_FASTFWD_OURS;
 				}
 			}
 			ip_divert_ptr(*m0, dir == PF_IN);
 			*m0 = NULL;
 
 			return (action);
 		} else {
 			/* XXX: ipfw has the same behaviour! */
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 			log = 1;
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: failed to allocate divert tag\n"));
 		}
 	}
 
 	if (log) {
 		struct pf_krule *lr;
 
 		if (s != NULL && s->nat_rule.ptr != NULL &&
 		    s->nat_rule.ptr->log & PF_LOG_ALL)
 			lr = s->nat_rule.ptr;
 		else
 			lr = r;
 		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
 		    (s == NULL));
 	}
 
 	pf_counter_u64_critical_enter();
 	pf_counter_u64_add_protected(&kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS],
 	    pd.tot_len);
 	pf_counter_u64_add_protected(&kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS],
 	    1);
 
 	if (action == PF_PASS || r->action == PF_DROP) {
 		dirndx = (dir == PF_OUT);
 		pf_counter_u64_add_protected(&r->packets[dirndx], 1);
 		pf_counter_u64_add_protected(&r->bytes[dirndx], pd.tot_len);
 		pf_update_timestamp(r);
 
 		if (a != NULL) {
 			pf_counter_u64_add_protected(&a->packets[dirndx], 1);
 			pf_counter_u64_add_protected(&a->bytes[dirndx], pd.tot_len);
 		}
 		if (s != NULL) {
 			if (s->nat_rule.ptr != NULL) {
 				pf_counter_u64_add_protected(&s->nat_rule.ptr->packets[dirndx],
 				    1);
 				pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx],
 				    pd.tot_len);
 			}
 			if (s->src_node != NULL) {
 				counter_u64_add(s->src_node->packets[dirndx],
 				    1);
 				counter_u64_add(s->src_node->bytes[dirndx],
 				    pd.tot_len);
 			}
 			if (s->nat_src_node != NULL) {
 				counter_u64_add(s->nat_src_node->packets[dirndx],
 				    1);
 				counter_u64_add(s->nat_src_node->bytes[dirndx],
 				    pd.tot_len);
 			}
 			dirndx = (dir == s->direction) ? 0 : 1;
 			s->packets[dirndx]++;
 			s->bytes[dirndx] += pd.tot_len;
 		}
 		tr = r;
 		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
 		if (nr != NULL && r == &V_pf_default_rule)
 			tr = nr;
 		if (tr->src.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->src.addr.p.tbl,
 			    (s == NULL) ? pd.src :
 			    &s->key[(s->direction == PF_IN)]->
 				addr[(s->direction == PF_OUT)],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->src.neg);
 		if (tr->dst.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->dst.addr.p.tbl,
 			    (s == NULL) ? pd.dst :
 			    &s->key[(s->direction == PF_IN)]->
 				addr[(s->direction == PF_IN)],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->dst.neg);
 	}
 	pf_counter_u64_critical_exit();
 
 	switch (action) {
 	case PF_SYNPROXY_DROP:
 		m_freem(*m0);
 	case PF_DEFER:
 		*m0 = NULL;
 		action = PF_PASS;
 		break;
 	case PF_DROP:
 		m_freem(*m0);
 		*m0 = NULL;
 		break;
 	default:
 		/* pf_route() returns unlocked. */
 		if (r->rt) {
 			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd, inp);
 			return (action);
 		}
 		if (pf_dummynet(&pd, dir, s, r, m0) != 0) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 		}
 		break;
 	}
 
 	SDT_PROBE4(pf, ip, test, done, action, reason, r, s);
 
 	if (s)
 		PF_STATE_UNLOCK(s);
 
 	return (action);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * pf_test6: run one IPv6 packet through pf on interface ifp.
  *
  * dir is PF_IN or PF_OUT (asserted).  *m0 is the packet; it may be
  * replaced by normalization, and is set to NULL whenever this function
  * frees or hands off the mbuf.  pflags is not referenced in this body.
  *
  * Returns PF_PASS without inspecting the packet when pf is not running,
  * when the interface's kif carries PFI_IFLAG_SKIP, or when the mbuf is
  * flagged M_SKIP_FIREWALL; returns PF_DROP when the interface has no kif.
  * Otherwise returns the verdict computed below; PF_SYNPROXY_DROP and
  * PF_DEFER are reported to the caller as PF_PASS with *m0 == NULL.
  */
 int
 pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
 {
 	struct pfi_kkif		*kif;
 	u_short			 action, reason = 0, log = 0;
 	struct mbuf		*m = *m0, *n = NULL;
 	struct m_tag		*mtag;
 	struct ip6_hdr		*h = NULL;
 	struct pf_krule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
 	struct pf_kstate	*s = NULL;
 	struct pf_kruleset	*ruleset = NULL;
 	struct pf_pdesc		 pd;
 	int			 off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0;
 
 	PF_RULES_RLOCK_TRACKER;
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: bad direction %d\n", __func__, dir));
 	M_ASSERTPKTHDR(m);
 
 	/* Checked before taking the rules lock. */
 	if (!V_pf_status.running)
 		return (PF_PASS);
 
 	PF_RULES_RLOCK();
 
 	kif = (struct pfi_kkif *)ifp->if_pf_kif;
 	if (__predict_false(kif == NULL)) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
 		PF_RULES_RUNLOCK();
 		return (PF_DROP);
 	}
 	if (kif->pfik_flags & PFI_IFLAG_SKIP) {
 		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	if (m->m_flags & M_SKIP_FIREWALL) {
 		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	memset(&pd, 0, sizeof(pd));
 	pd.pf_mtag = pf_find_mtag(m);
 
 	/*
 	 * Packet tagged PF_TAG_ROUTE_TO: clear the flag, look the target
 	 * interface up by index/generation and transmit the packet there
 	 * directly.  If the interface is gone or dying the packet is freed.
 	 * Either way the caller gets PF_PASS with *m0 == NULL.
 	 */
 	if (pd.pf_mtag != NULL && (pd.pf_mtag->flags & PF_TAG_ROUTE_TO)) {
 		pd.pf_mtag->flags &= ~PF_TAG_ROUTE_TO;
 
 		ifp = ifnet_byindexgen(pd.pf_mtag->if_index,
 		    pd.pf_mtag->if_idxgen);
 		if (ifp == NULL || ifp->if_flags & IFF_DYING) {
 			PF_RULES_RUNLOCK();
 			m_freem(*m0);
 			*m0 = NULL;
 			return (PF_PASS);
 		}
 		PF_RULES_RUNLOCK();
 		nd6_output_ifp(ifp, ifp, m,
                     (struct sockaddr_in6 *)&pd.pf_mtag->dst, NULL);
 		*m0 = NULL;
 		return (PF_PASS);
 	}
 
 	/* Carry dummynet pipe settings recorded in the pf mtag into pd.act. */
 	if (pd.pf_mtag && pd.pf_mtag->dnpipe) {
 		pd.act.dnpipe = pd.pf_mtag->dnpipe;
 		pd.act.flags = pd.pf_mtag->dnflags;
 	}
 
 	if (ip_dn_io_ptr != NULL && pd.pf_mtag != NULL &&
 	    pd.pf_mtag->flags & PF_TAG_DUMMYNET) {
 		pd.pf_mtag->flags &= ~PF_TAG_DUMMYNET;
 		/* Dummynet re-injects packets after they've
 		 * completed their delay. We've already
 		 * processed them, so pass unconditionally. */
 		PF_RULES_RUNLOCK();
 		return (PF_PASS);
 	}
 
 	/* We do IP header normalization and packet reassembly here */
 	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
 		action = PF_DROP;
 		goto done;
 	}
 	m = *m0;	/* pf_normalize messes with m0 */
 	h = mtod(m, struct ip6_hdr *);
 
 	/*
 	 * we do not support jumbogram.  if we keep going, zero ip6_plen
 	 * will do something bad, so drop the packet for now.
 	 *
 	 * NOTE(review): htons() is used where ntohs() would be the
 	 * conventional direction; a comparison against zero gives the same
 	 * result either way.
 	 */
 	if (htons(h->ip6_plen) == 0) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
 		goto done;
 	}
 
 	/* Fill in the packet descriptor from the fixed IPv6 header. */
 	pd.src = (struct pf_addr *)&h->ip6_src;
 	pd.dst = (struct pf_addr *)&h->ip6_dst;
 	pd.sport = pd.dport = NULL;
 	pd.ip_sum = NULL;
 	pd.proto_sum = NULL;
 	pd.dir = dir;
 	pd.sidx = (dir == PF_IN) ? 0 : 1;
 	pd.didx = (dir == PF_IN) ? 1 : 0;
 	pd.af = AF_INET6;
 	pd.tos = IPV6_DSCP(h);
 	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
 
 	/*
 	 * Walk the extension header chain.  Each pass either skips one
 	 * extension header (advancing off and pd.proto) or, for any other
 	 * protocol number, marks the header as terminal and leaves the loop
 	 * with off pointing at that header.
 	 */
 	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
 	pd.proto = h->ip6_nxt;
 	do {
 		switch (pd.proto) {
 		case IPPROTO_FRAGMENT:
 			action = pf_test_fragment(&r, dir, kif, m, h,
 			    &pd, &a, &ruleset);
 			if (action == PF_DROP)
 				REASON_SET(&reason, PFRES_FRAG);
 			goto done;
 		case IPPROTO_ROUTING: {
 			struct ip6_rthdr rthdr;
 
 			/* Only a single routing header is tolerated. */
 			if (rh_cnt++) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 more than one rthdr\n"));
 				action = PF_DROP;
 				REASON_SET(&reason, PFRES_IPOPTIONS);
 				log = 1;
 				goto done;
 			}
 			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
 			    &reason, pd.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 short rthdr\n"));
 				action = PF_DROP;
 				REASON_SET(&reason, PFRES_SHORT);
 				log = 1;
 				goto done;
 			}
 			/* Type 0 routing headers are always dropped. */
 			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 rthdr0\n"));
 				action = PF_DROP;
 				REASON_SET(&reason, PFRES_IPOPTIONS);
 				log = 1;
 				goto done;
 			}
 			/* FALLTHROUGH */
 		}
 		case IPPROTO_AH:
 		case IPPROTO_HOPOPTS:
 		case IPPROTO_DSTOPTS: {
 			/* get next header and header length */
 			struct ip6_ext	opt6;
 
 			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
 			    NULL, &reason, pd.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 short opt\n"));
 				action = PF_DROP;
 				log = 1;
 				goto done;
 			}
 			/*
 			 * AH is skipped as (len + 2) 4-byte units; the other
 			 * extension headers as (len + 1) 8-byte units.
 			 */
 			if (pd.proto == IPPROTO_AH)
 				off += (opt6.ip6e_len + 2) * 4;
 			else
 				off += (opt6.ip6e_len + 1) * 8;
 			pd.proto = opt6.ip6e_nxt;
 			/* goto the next header */
 			break;
 		}
 		default:
 			terminal++;
 			break;
 		}
 	} while (!terminal);
 
 	/* if there's no routing header, use unmodified mbuf for checksumming */
 	if (!n)
 		n = m;
 
 	/*
 	 * Per-protocol handling.  Each case first tries to match an existing
 	 * state; on PF_PASS the rule, anchor and log setting come from that
 	 * state.  If no state was found (s == NULL) the ruleset is evaluated
 	 * via pf_test_rule().
 	 */
 	switch (pd.proto) {
 	case IPPROTO_TCP: {
 		if (!pf_pull_hdr(m, off, &pd.hdr.tcp, sizeof(pd.hdr.tcp),
 		    &action, &reason, AF_INET6)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		pd.p_len = pd.tot_len - off - (pd.hdr.tcp.th_off << 2);
 		pd.sport = &pd.hdr.tcp.th_sport;
 		pd.dport = &pd.hdr.tcp.th_dport;
 		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
 		if (action == PF_DROP)
 			goto done;
 		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
 		    &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_UDP: {
 		if (!pf_pull_hdr(m, off, &pd.hdr.udp, sizeof(pd.hdr.udp),
 		    &action, &reason, AF_INET6)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		pd.sport = &pd.hdr.udp.uh_sport;
 		pd.dport = &pd.hdr.udp.uh_dport;
 		/*
 		 * Reject destination port 0 and a UDP length that is larger
 		 * than the remaining packet or smaller than a UDP header.
 		 */
 		if (pd.hdr.udp.uh_dport == 0 ||
 		    ntohs(pd.hdr.udp.uh_ulen) > m->m_pkthdr.len - off ||
 		    ntohs(pd.hdr.udp.uh_ulen) < sizeof(struct udphdr)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_SHORT);
 			goto done;
 		}
 		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_ICMP: {
 		/* ICMPv4 carried inside IPv6 is never accepted. */
 		action = PF_DROP;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
 		goto done;
 	}
 
 	case IPPROTO_ICMPV6: {
 		if (!pf_pull_hdr(m, off, &pd.hdr.icmp6, sizeof(pd.hdr.icmp6),
 		    &action, &reason, AF_INET6)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		action = pf_test_state_icmp(&s, dir, kif,
 		    m, off, h, &pd, &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	default:
 		action = pf_test_state_other(&s, dir, kif, m, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 done:
 	/* The rules read lock is released here; the rest runs without it. */
 	PF_RULES_RUNLOCK();
 	/*
 	 * NOTE(review): within this function n is only ever NULL (early
 	 * "goto done") or equal to m, so this can at most call
 	 * m_freem(NULL) - presumably a no-op; confirm.
 	 */
 	if (n != m) {
 		m_freem(n);
 		n = NULL;
 	}
 
 	/* handle dangerous IPv6 extension headers. */
 	if (action == PF_PASS && rh_cnt &&
 	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_IPOPTIONS);
 		log = r->log;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping packet with dangerous v6 headers\n"));
 	}
 
 	/* Apply the state's tag; failure to tag turns into a drop. */
 	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_MEMORY);
 	}
 	if (r->rtableid >= 0)
 		M_SETFIB(m, r->rtableid);
 
 	/*
 	 * set-prio: index 1 of set_prio[] is used when the packet has
 	 * IPTOS_LOWDELAY set, index 0 otherwise (pqid starts at 0).
 	 */
 	if (r->scrub_flags & PFSTATE_SETPRIO) {
 		if (pd.tos & IPTOS_LOWDELAY)
 			pqid = 1;
 		if (vlan_set_pcp(m, r->set_prio[pqid])) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 			log = 1;
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: failed to allocate 802.1q mtag\n"));
 		}
 	}
 
 #ifdef ALTQ
 	/* Queue ids come from the state if it has one, else from the rule. */
 	if (s && s->qid) {
 		pd.act.pqid = s->pqid;
 		pd.act.qid = s->qid;
 	} else if (r->qid) {
 		pd.act.pqid = r->pqid;
 		pd.act.qid = r->qid;
 	}
 	if (action == PF_PASS && pd.act.qid) {
 		if (pd.pf_mtag == NULL &&
 		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 		} else {
 			if (s != NULL)
 				pd.pf_mtag->qid_hash = pf_state_hash(s);
 			if (pd.tos & IPTOS_LOWDELAY)
 				pd.pf_mtag->qid = pd.act.pqid;
 			else
 				pd.pf_mtag->qid = pd.act.qid;
 			/* Add hints for ecn. */
 			pd.pf_mtag->hdr = h;
 		}
 	}
 #endif /* ALTQ */
 
 	/*
 	 * Inbound TCP/UDP that passed via an rdr/binat state and is now
 	 * addressed to the IPv6 loopback address gets M_SKIP_FIREWALL set
 	 * on the mbuf.
 	 */
 	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
 	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
 	    (s->nat_rule.ptr->action == PF_RDR ||
 	    s->nat_rule.ptr->action == PF_BINAT) &&
 	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
 		m->m_flags |= M_SKIP_FIREWALL;
 
 	/* XXX: Anybody working on it?! */
 	if (r->divert.port)
 		printf("pf: divert(9) is not supported for IPv6\n");
 
 	if (log) {
 		struct pf_krule *lr;
 
 		/* Log against the NAT rule when it asks for PF_LOG_ALL. */
 		if (s != NULL && s->nat_rule.ptr != NULL &&
 		    s->nat_rule.ptr->log & PF_LOG_ALL)
 			lr = s->nat_rule.ptr;
 		else
 			lr = r;
 		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
 		    &pd, (s == NULL));
 	}
 
 	/*
 	 * Accounting.  The first index [1] selects the IPv6 slot of the
 	 * per-interface counters; the others are direction (out) and
 	 * verdict (not passed).
 	 */
 	pf_counter_u64_critical_enter();
 	pf_counter_u64_add_protected(&kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS],
 	    pd.tot_len);
 	pf_counter_u64_add_protected(&kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS],
 	    1);
 
 	if (action == PF_PASS || r->action == PF_DROP) {
 		dirndx = (dir == PF_OUT);
 		pf_counter_u64_add_protected(&r->packets[dirndx], 1);
 		pf_counter_u64_add_protected(&r->bytes[dirndx], pd.tot_len);
 		/*
 		 * NOTE(review): the IPv4 counterpart (pf_test) calls
 		 * pf_update_timestamp(r) at this point; this function does
 		 * not - confirm whether that is intentional.
 		 */
 		if (a != NULL) {
 			pf_counter_u64_add_protected(&a->packets[dirndx], 1);
 			pf_counter_u64_add_protected(&a->bytes[dirndx], pd.tot_len);
 		}
 		if (s != NULL) {
 			if (s->nat_rule.ptr != NULL) {
 				pf_counter_u64_add_protected(&s->nat_rule.ptr->packets[dirndx],
 				    1);
 				pf_counter_u64_add_protected(&s->nat_rule.ptr->bytes[dirndx],
 				    pd.tot_len);
 			}
 			if (s->src_node != NULL) {
 				counter_u64_add(s->src_node->packets[dirndx],
 				    1);
 				counter_u64_add(s->src_node->bytes[dirndx],
 				    pd.tot_len);
 			}
 			if (s->nat_src_node != NULL) {
 				counter_u64_add(s->nat_src_node->packets[dirndx],
 				    1);
 				counter_u64_add(s->nat_src_node->bytes[dirndx],
 				    pd.tot_len);
 			}
 			/* State counters are indexed relative to the state's own direction. */
 			dirndx = (dir == s->direction) ? 0 : 1;
 			s->packets[dirndx]++;
 			s->bytes[dirndx] += pd.tot_len;
 		}
 		/*
 		 * Table statistics use the matching rule, unless that is the
 		 * default rule and a NAT rule is available.
 		 */
 		tr = r;
 		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
 		if (nr != NULL && r == &V_pf_default_rule)
 			tr = nr;
 		if (tr->src.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->src.addr.p.tbl,
 			    (s == NULL) ? pd.src :
 			    &s->key[(s->direction == PF_IN)]->addr[0],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->src.neg);
 		if (tr->dst.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->dst.addr.p.tbl,
 			    (s == NULL) ? pd.dst :
 			    &s->key[(s->direction == PF_IN)]->addr[1],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->dst.neg);
 	}
 	pf_counter_u64_critical_exit();
 
 	/*
 	 * Translate the internal verdict into what the caller sees and
 	 * settle mbuf ownership.
 	 */
 	switch (action) {
 	case PF_SYNPROXY_DROP:
 		m_freem(*m0);
 		/* FALLTHROUGH: reported as PF_PASS with no mbuf, like PF_DEFER */
 	case PF_DEFER:
 		*m0 = NULL;
 		action = PF_PASS;
 		break;
 	case PF_DROP:
 		m_freem(*m0);
 		*m0 = NULL;
 		break;
 	default:
 		/* pf_route6() returns unlocked. */
 		if (r->rt) {
 			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd, inp);
 			return (action);
 		}
 		if (pf_dummynet(&pd, dir, s, r, m0) != 0) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 		}
 		break;
 	}
 
 	if (s)
 		PF_STATE_UNLOCK(s);
 
 	/* If reassembled packet passed, create new fragments. */
 	if (action == PF_PASS && *m0 && dir == PF_OUT &&
 	    (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL)
 		action = pf_refragment6(ifp, m0, mtag);
 
 	SDT_PROBE4(pf, ip, test6, done, action, reason, r, s);
 
 	return (action);
 }
 #endif /* INET6 */
diff --git a/sys/netpfil/pf/pf_if.c b/sys/netpfil/pf/pf_if.c
index 71bd215d3d24..37cf9ffaff64 100644
--- a/sys/netpfil/pf/pf_if.c
+++ b/sys/netpfil/pf/pf_if.c
@@ -1,1141 +1,1142 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2001 Daniel Hartmeier
  * Copyright (c) 2003 Cedric Berger
  * Copyright (c) 2005 Henning Brauer <henning@openbsd.org>
  * Copyright (c) 2005 Ryan McBride <mcbride@openbsd.org>
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  *    - Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *    - Redistributions in binary form must reproduce the above
  *      copyright notice, this list of conditions and the following
  *      disclaimer in the documentation and/or other materials provided
  *      with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  *	$OpenBSD: pf_if.c,v 1.54 2008/06/14 16:55:28 mk Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 #include <net/pfvar.h>
 #include <net/route.h>
 
 /* Per-vnet kif set up by pfi_initialize_vnet() under the name IFG_ALL. */
 VNET_DEFINE(struct pfi_kkif *,	 pfi_all);
 VNET_DEFINE_STATIC(long, pfi_update);
 #define	V_pfi_update	VNET(pfi_update)
 #define PFI_BUFFER_MAX	0x10000
 
 VNET_DECLARE(int, pf_vnet_active);
 #define V_pf_vnet_active	VNET(pf_vnet_active)
 
 /*
  * Per-vnet array of struct pfr_addr; allocated in pfi_initialize_vnet()
  * with V_pfi_buffer_max entries and freed in pfi_cleanup_vnet().
  */
 VNET_DEFINE_STATIC(struct pfr_addr *, pfi_buffer);
 VNET_DEFINE_STATIC(int, pfi_buffer_cnt);
 VNET_DEFINE_STATIC(int,	pfi_buffer_max);
 #define	V_pfi_buffer		 VNET(pfi_buffer)
 #define	V_pfi_buffer_cnt	 VNET(pfi_buffer_cnt)
 #define	V_pfi_buffer_max	 VNET(pfi_buffer_max)
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 /* List and count of every kif created by pf_kkif_create(). */
 VNET_DEFINE(struct allkiflist_head, pf_allkiflist);
 VNET_DEFINE(size_t, pf_allkifcount);
 VNET_DEFINE(struct pfi_kkif *, pf_kifmarker);
 #endif
 
 /* Event handler registrations made by pfi_initialize(), undone by pfi_cleanup(). */
 eventhandler_tag	 pfi_attach_cookie;
 eventhandler_tag	 pfi_detach_cookie;
 eventhandler_tag	 pfi_attach_group_cookie;
 eventhandler_tag	 pfi_change_group_cookie;
 eventhandler_tag	 pfi_detach_group_cookie;
 eventhandler_tag	 pfi_ifaddr_event_cookie;
 
 static void	 pfi_attach_ifnet(struct ifnet *, struct pfi_kkif *);
 static void	 pfi_attach_ifgroup(struct ifg_group *, struct pfi_kkif *);
 
 static void	 pfi_kkif_update(struct pfi_kkif *);
 static void	 pfi_dynaddr_update(struct pfi_dynaddr *dyn);
 static void	 pfi_table_update(struct pfr_ktable *, struct pfi_kkif *, int,
 		    int);
 static void	 pfi_instance_add(struct ifnet *, int, int);
 static void	 pfi_address_add(struct sockaddr *, int, int);
 static int	 pfi_kkif_compare(struct pfi_kkif *, struct pfi_kkif *);
 static int	 pfi_skip_if(const char *, struct pfi_kkif *);
 static int	 pfi_unmask(void *);
 static void	 pfi_attach_ifnet_event(void * __unused, struct ifnet *);
 static void	 pfi_detach_ifnet_event(void * __unused, struct ifnet *);
 static void	 pfi_attach_group_event(void * __unused, struct ifg_group *);
 static void	 pfi_change_group_event(void * __unused, char *);
 static void	 pfi_detach_group_event(void * __unused, struct ifg_group *);
 static void	 pfi_ifaddr_event(void * __unused, struct ifnet *);
 
 /* Per-vnet red-black tree of attached kifs, ordered by pfi_kkif_compare(). */
 RB_HEAD(pfi_ifhead, pfi_kkif);
 static RB_PROTOTYPE(pfi_ifhead, pfi_kkif, pfik_tree, pfi_kkif_compare);
 static RB_GENERATE(pfi_ifhead, pfi_kkif, pfik_tree, pfi_kkif_compare);
 VNET_DEFINE_STATIC(struct pfi_ifhead, pfi_ifs);
 #define	V_pfi_ifs	VNET(pfi_ifs)
 
 /* NOTE(review): PFI_BUFFER_MAX is already defined identically above. */
 #define	PFI_BUFFER_MAX		0x10000
 MALLOC_DEFINE(PFI_MTYPE, "pf_ifnet", "pf(4) interface database");
 
 /* Per-vnet list of unlinked kifs; pfi_cleanup_vnet() walks it under pfi_unlnkdkifs_mtx. */
 LIST_HEAD(pfi_list, pfi_kkif);
 VNET_DEFINE_STATIC(struct pfi_list, pfi_unlinked_kifs);
 #define	V_pfi_unlinked_kifs	VNET(pfi_unlinked_kifs)
 static struct mtx pfi_unlnkdkifs_mtx;
 MTX_SYSINIT(pfi_unlnkdkifs_mtx, &pfi_unlnkdkifs_mtx, "pf unlinked interfaces",
     MTX_DEF);
 
 void
 pfi_initialize_vnet(void)
 {
 	struct pfi_list kifs = LIST_HEAD_INITIALIZER();
 	struct epoch_tracker et;
 	struct pfi_kkif *kif;
 	struct ifg_group *ifg;
 	struct ifnet *ifp;
 	int nkifs;
 
 	V_pfi_buffer_max = 64;
 	V_pfi_buffer = malloc(V_pfi_buffer_max * sizeof(*V_pfi_buffer),
 	    PFI_MTYPE, M_WAITOK);
 
 	nkifs = 1;	/* one for V_pfi_all */
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		nkifs++;
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		nkifs++;
 
 	for (int n = 0; n < nkifs; n++) {
 		kif = pf_kkif_create(M_WAITOK);
 		LIST_INSERT_HEAD(&kifs, kif, pfik_list);
 	}
 
 	NET_EPOCH_ENTER(et);
 	PF_RULES_WLOCK();
 	kif = LIST_FIRST(&kifs);
 	LIST_REMOVE(kif, pfik_list);
 	V_pfi_all = pfi_kkif_attach(kif, IFG_ALL);
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next) {
 		kif = LIST_FIRST(&kifs);
 		LIST_REMOVE(kif, pfik_list);
 		pfi_attach_ifgroup(ifg, kif);
 	}
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		kif = LIST_FIRST(&kifs);
 		LIST_REMOVE(kif, pfik_list);
 		pfi_attach_ifnet(ifp, kif);
 	}
 	PF_RULES_WUNLOCK();
 	NET_EPOCH_EXIT(et);
 	IFNET_RUNLOCK();
 
 	MPASS(LIST_EMPTY(&kifs));
 }
 
 /*
  * Register pf's handlers for interface arrival/departure, interface
  * group attach/change/detach and interface address events.  The
  * returned tags are kept in the pfi_*_cookie globals so that
  * pfi_cleanup() can deregister them.
  */
 void
 pfi_initialize(void)
 {
 
 	pfi_attach_cookie = EVENTHANDLER_REGISTER(ifnet_arrival_event,
 	    pfi_attach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
 	pfi_detach_cookie = EVENTHANDLER_REGISTER(ifnet_departure_event,
 	    pfi_detach_ifnet_event, NULL, EVENTHANDLER_PRI_ANY);
 	pfi_attach_group_cookie = EVENTHANDLER_REGISTER(group_attach_event,
 	    pfi_attach_group_event, NULL, EVENTHANDLER_PRI_ANY);
 	pfi_change_group_cookie = EVENTHANDLER_REGISTER(group_change_event,
 	    pfi_change_group_event, NULL, EVENTHANDLER_PRI_ANY);
 	pfi_detach_group_cookie = EVENTHANDLER_REGISTER(group_detach_event,
 	    pfi_detach_group_event, NULL, EVENTHANDLER_PRI_ANY);
 	pfi_ifaddr_event_cookie = EVENTHANDLER_REGISTER(ifaddr_event,
 	    pfi_ifaddr_event, NULL, EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Per-vnet teardown of the pf interface database.
  *
  * Must be called with the rules write lock held (asserted).  Empties the
  * kif tree, detaching each kif from its interface group and/or interface,
  * then frees every kif on the unlinked list and finally the address
  * buffer allocated by pfi_initialize_vnet().
  */
 void
 pfi_cleanup_vnet(void)
 {
 	struct pfi_kkif *kif;
 
 	PF_RULES_WASSERT();
 
 	V_pfi_all = NULL;
 	while ((kif = RB_MIN(pfi_ifhead, &V_pfi_ifs))) {
 		RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif);
 		if (kif->pfik_group)
 			kif->pfik_group->ifg_pf_kif = NULL;
 		if (kif->pfik_ifp) {
 			/*
 			 * Clear the interface's back-pointer while we still
 			 * hold our reference; the ifnet must not be touched
 			 * after if_rele() has dropped it.
 			 */
 			kif->pfik_ifp->if_pf_kif = NULL;
 			if_rele(kif->pfik_ifp);
 		}
 		pf_kkif_free(kif);
 	}
 
 	/* Unlinked kifs are no longer in the tree; just free them. */
 	mtx_lock(&pfi_unlnkdkifs_mtx);
 	while ((kif = LIST_FIRST(&V_pfi_unlinked_kifs))) {
 		LIST_REMOVE(kif, pfik_list);
 		pf_kkif_free(kif);
 	}
 	mtx_unlock(&pfi_unlnkdkifs_mtx);
 
 	free(V_pfi_buffer, PFI_MTYPE);
 }
 
 /*
  * Deregister the six event handlers installed by pfi_initialize(),
  * using the tags saved in the pfi_*_cookie globals.
  */
 void
 pfi_cleanup(void)
 {
 
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, pfi_attach_cookie);
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event, pfi_detach_cookie);
 	EVENTHANDLER_DEREGISTER(group_attach_event, pfi_attach_group_cookie);
 	EVENTHANDLER_DEREGISTER(group_change_event, pfi_change_group_cookie);
 	EVENTHANDLER_DEREGISTER(group_detach_event, pfi_detach_group_cookie);
 	EVENTHANDLER_DEREGISTER(ifaddr_event, pfi_ifaddr_event_cookie);
 }
 
 /*
  * Allocate a zeroed kif and initialise its 2x2x2 packet and byte
  * counters.  flags is passed to malloc() and to the counter
  * initialisers.
  *
  * Returns the new kif, or NULL if the allocation or any counter
  * initialisation fails (the partially built kif is released through
  * pf_kkif_free() in that case).  With PF_WANT_32_TO_64_COUNTER the new
  * kif is also linked onto V_pf_allkiflist, taking the rules write lock
  * if the caller does not already own it.
  */
 struct pfi_kkif*
 pf_kkif_create(int flags)
 {
 	struct pfi_kkif *newkif;
 #ifdef PF_WANT_32_TO_64_COUNTER
 	bool have_wlock;
 #endif
 	int x, y, z;
 
 	newkif = malloc(sizeof(*newkif), PFI_MTYPE, flags | M_ZERO);
 	if (newkif == NULL)
 		return (NULL);
 
 	for (x = 0; x < 2; x++) {
 		for (y = 0; y < 2; y++) {
 			for (z = 0; z < 2; z++) {
 				/* Packets first, then bytes; stop at the first failure. */
 				if (pf_counter_u64_init(
 				    &newkif->pfik_packets[x][y][z], flags) != 0 ||
 				    pf_counter_u64_init(
 				    &newkif->pfik_bytes[x][y][z], flags) != 0) {
 					pf_kkif_free(newkif);
 					return (NULL);
 				}
 			}
 		}
 	}
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 	have_wlock = PF_RULES_WOWNED();
 	if (!have_wlock)
 		PF_RULES_WLOCK();
 	LIST_INSERT_HEAD(&V_pf_allkiflist, newkif, pfik_allkiflist);
 	V_pf_allkifcount++;
 	if (!have_wlock)
 		PF_RULES_WUNLOCK();
 #endif
 
 	return (newkif);
 }
 
 /*
  * Release a kif: deinitialise all of its packet and byte counters and
  * free the structure.  A NULL kif is ignored.
  *
  * With PF_WANT_32_TO_64_COUNTER the kif is first unlinked from the
  * pfik_allkiflist list and V_pf_allkifcount is decremented, under the
  * rules write lock (taken here unless the caller already owns it).  The
  * unlink is unconditional, so the kif must be on that list.
  */
 void
 pf_kkif_free(struct pfi_kkif *kif)
 {
 #ifdef PF_WANT_32_TO_64_COUNTER
 	bool wowned;
 #endif
 
 	if (! kif)
 		return;
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 	/* Only take (and later drop) the write lock if we do not hold it. */
 	wowned = PF_RULES_WOWNED();
 	if (!wowned)
 		PF_RULES_WLOCK();
 	LIST_REMOVE(kif, pfik_allkiflist);
 	V_pf_allkifcount--;
 	if (!wowned)
 		PF_RULES_WUNLOCK();
 #endif
 
 	for (int i = 0; i < 2; i++) {
 		for (int j = 0; j < 2; j++) {
 			for (int k = 0; k < 2; k++) {
 				pf_counter_u64_deinit(&kif->pfik_packets[i][j][k]);
 				pf_counter_u64_deinit(&kif->pfik_bytes[i][j][k]);
 			}
 		}
 	}
 
 	free(kif, PFI_MTYPE);
 }
 
 /*
  * Reset every packet and byte counter of kif and stamp pfik_tzero with
  * the current time_second.
  */
 void
 pf_kkif_zero(struct pfi_kkif *kif)
 {
 	int a, b, c;
 
 	for (a = 0; a < 2; a++)
 		for (b = 0; b < 2; b++)
 			for (c = 0; c < 2; c++) {
 				pf_counter_u64_zero(&kif->pfik_packets[a][b][c]);
 				pf_counter_u64_zero(&kif->pfik_bytes[a][b][c]);
 			}
 
 	/* Statistics now count from this moment. */
 	kif->pfik_tzero = time_second;
 }
 
 struct pfi_kkif *
 pfi_kkif_find(const char *kif_name)
 {
 	struct pfi_kif_cmp s;
 
 	PF_RULES_ASSERT();
 
 	memset(&s, 0, sizeof(s));
 	strlcpy(s.pfik_name, kif_name, sizeof(s.pfik_name));
 
 	return (RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kkif *)&s));
 }
 
 /*
  * Return the kif registered under kif_name, registering the
  * caller-supplied kif under that name if none exists yet.
  *
  * Ownership of kif always passes to this function: if an entry with
  * that name is already in V_pfi_ifs the passed-in kif is freed and the
  * existing entry is returned, so callers must continue with the return
  * value only.  Requires the rules write lock; kif must not be NULL.
  */
 struct pfi_kkif *
 pfi_kkif_attach(struct pfi_kkif *kif, const char *kif_name)
 {
 	struct pfi_kkif *kif1;
 
 	PF_RULES_WASSERT();
 	KASSERT(kif != NULL, ("%s: null kif", __func__));
 
 	kif1 = pfi_kkif_find(kif_name);
 	if (kif1 != NULL) {
 		/* Already known: discard the spare and hand back the original. */
 		pf_kkif_free(kif);
 		return (kif1);
 	}
 
 	pf_kkif_zero(kif);
 	strlcpy(kif->pfik_name, kif_name, sizeof(kif->pfik_name));
 	/*
 	 * It seems that the value of time_second is in uninitialized state
 	 * when pf sets interface statistics clear time in boot phase if pf
 	 * was statically linked to kernel. Instead of setting the bogus
 	 * time value have pfi_get_ifaces handle this case. In
 	 * pfi_get_ifaces it uses time_second if it sees the time is 0.
 	 */
 	kif->pfik_tzero = time_second > 1 ? time_second : 0;
 	TAILQ_INIT(&kif->pfik_dynaddrs);
 
 	RB_INSERT(pfi_ifhead, &V_pfi_ifs, kif);
 
 	return (kif);
 }
 
 /*
  * Take one rule reference on kif (pfik_rulerefs).  Requires the rules
  * write lock.  Paired with pfi_kkif_unref().
  */
 void
 pfi_kkif_ref(struct pfi_kkif *kif)
 {
 
 	PF_RULES_WASSERT();
 	kif->pfik_rulerefs++;
 }
 
 /*
  * Retire kif from the V_pfi_ifs tree onto V_pfi_unlinked_kifs, but only
  * when nothing pins it: no rule references, no backing ifnet or group,
  * no flags, and it is not V_pfi_all.  Requires the rules write lock.
  */
 static void
 pfi_kkif_remove_if_unref(struct pfi_kkif *kif)
 {
 	PF_RULES_WASSERT();
 
 	/* Still referenced by rules. */
 	if (kif->pfik_rulerefs > 0)
 		return;
 
 	/* Backed by an existing ifnet or interface group. */
 	if (kif->pfik_ifp != NULL)
 		return;
 	if (kif->pfik_group != NULL)
 		return;
 
 	/* The "all" kif and any kif still holding flags must stay. */
 	if (kif == V_pfi_all)
 		return;
 	if (kif->pfik_flags != 0)
 		return;
 
 	RB_REMOVE(pfi_ifhead, &V_pfi_ifs, kif);
 
 	/*
 	 * Park it with the REFS mark set: pfi_kkif_purge() clears the mark
 	 * on one pass and frees entries it finds unmarked.
 	 */
 	kif->pfik_flags |= PFI_IFLAG_REFS;
 
 	mtx_lock(&pfi_unlnkdkifs_mtx);
 	LIST_INSERT_HEAD(&V_pfi_unlinked_kifs, kif, pfik_list);
 	mtx_unlock(&pfi_unlnkdkifs_mtx);
 }
 
 /*
  * Drop one rule reference on kif and, if that leaves it unpinned, let
  * pfi_kkif_remove_if_unref() retire it.  Requires the rules write lock;
  * the reference count must be positive on entry.
  */
 void
 pfi_kkif_unref(struct pfi_kkif *kif)
 {
 
 	PF_RULES_WASSERT();
 	KASSERT(kif->pfik_rulerefs > 0, ("%s: %p has zero refs", __func__, kif));
 
 	kif->pfik_rulerefs--;
 
 	pfi_kkif_remove_if_unref(kif);
 }
 
 /*
  * Sweep the list of unlinked kifs under pfi_unlnkdkifs_mtx.
  */
 void
 pfi_kkif_purge(void)
 {
 	struct pfi_kkif *cur, *nxt;
 
 	/*
 	 * Do naive mark-and-sweep garbage collecting of old kifs.
 	 * Reference flag is raised by pf_purge_expired_states().
 	 */
 	mtx_lock(&pfi_unlnkdkifs_mtx);
 	LIST_FOREACH_SAFE(cur, &V_pfi_unlinked_kifs, pfik_list, nxt) {
 		if (cur->pfik_flags & PFI_IFLAG_REFS) {
 			/* Marked since the last pass: unmark and keep. */
 			cur->pfik_flags &= ~PFI_IFLAG_REFS;
 			continue;
 		}
 		/* Not marked: nobody touched it, so reclaim it. */
 		LIST_REMOVE(cur, pfik_list);
 		pf_kkif_free(cur);
 	}
 	mtx_unlock(&pfi_unlnkdkifs_mtx);
 }
 
 int
 pfi_kkif_match(struct pfi_kkif *rule_kif, struct pfi_kkif *packet_kif)
 {
 	struct ifg_list	*p;
 
 	NET_EPOCH_ASSERT();
 
 	if (rule_kif == NULL || rule_kif == packet_kif)
 		return (1);
 
 	if (rule_kif->pfik_group != NULL) {
 		CK_STAILQ_FOREACH(p, &packet_kif->pfik_ifp->if_groups, ifgl_next)
 			if (p->ifgl_group == rule_kif->pfik_group)
 				return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Bind ifp to the kif registered under its if_xname, consuming the
  * caller-supplied kif (see pfi_kkif_attach()).  Takes a reference on
  * ifp, cross-links the two objects and refreshes the kif's dynamic
  * addresses.  Requires the rules write lock.
  */
 static void
 pfi_attach_ifnet(struct ifnet *ifp, struct pfi_kkif *kif)
 {
 
 	PF_RULES_WASSERT();
 
 	/* New generation: tables stamped with an older value get rebuilt. */
 	V_pfi_update++;
 	kif = pfi_kkif_attach(kif, ifp->if_xname);
 	if_ref(ifp);
 	kif->pfik_ifp = ifp;
 	ifp->if_pf_kif = kif;
 	pfi_kkif_update(kif);
 }
 
 /*
  * Bind interface group ifg to the kif registered under its group name,
  * consuming the caller-supplied kif (see pfi_kkif_attach()), and
  * cross-link the two.  Requires the rules write lock.
  */
 static void
 pfi_attach_ifgroup(struct ifg_group *ifg, struct pfi_kkif *kif)
 {
 
 	PF_RULES_WASSERT();
 
 	V_pfi_update++;
 	kif = pfi_kkif_attach(kif, ifg->ifg_group);
 	kif->pfik_group = ifg;
 	ifg->ifg_pf_kif = kif;
 }
 
 /*
  * Test address a of family af against dynamic address dyn.
  *
  * Per family, the address count decides the strategy: zero addresses
  * never match, exactly one is compared directly against the stored
  * address/mask with PF_MATCHA, and more than one is looked up in the
  * backing table via pfr_match_addr().  Families that are not compiled
  * in, or any other af, yield 0.
  */
 int
 pfi_match_addr(struct pfi_dynaddr *dyn, struct pf_addr *a, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		switch (dyn->pfid_acnt4) {
 		case 0:
 			return (0);
 		case 1:
 			return (PF_MATCHA(0, &dyn->pfid_addr4,
 			    &dyn->pfid_mask4, a, AF_INET));
 		default:
 			return (pfr_match_addr(dyn->pfid_kt, a, AF_INET));
 		}
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		switch (dyn->pfid_acnt6) {
 		case 0:
 			return (0);
 		case 1:
 			return (PF_MATCHA(0, &dyn->pfid_addr6,
 			    &dyn->pfid_mask6, a, AF_INET6));
 		default:
 			return (pfr_match_addr(dyn->pfid_kt, a, AF_INET6));
 		}
 		break;
 #endif /* INET6 */
 	default:
 		return (0);
 	}
 }
 
 /*
  * Create the dynamic-address state for an interface-based address wrap.
  *
  * aw must be of type PF_ADDR_DYNIFTL with no dyn attached yet.  On
  * success a pfi_dynaddr is allocated, tied to the kif named by
  * aw->v.ifname ("self" maps to IFG_ALL), attached to a table in the
  * reserved anchor whose name encodes the interface, the modifier flags
  * and the prefix length, stored in aw->p.dyn, and 0 is returned.
  * Every failure path here returns ENOMEM with everything undone.
  * Requires the rules write lock.
  */
 int
 pfi_dynaddr_setup(struct pf_addr_wrap *aw, sa_family_t af)
 {
 	struct epoch_tracker	 et;
 	struct pfi_dynaddr	*dyn;
 	char			 tblname[PF_TABLE_NAME_SIZE];
 	struct pf_kruleset	*ruleset = NULL;
 	struct pfi_kkif		*kif;
 	int			 rv = 0;
 
 	PF_RULES_WASSERT();
 	KASSERT(aw->type == PF_ADDR_DYNIFTL, ("%s: type %u",
 	    __func__, aw->type));
 	KASSERT(aw->p.dyn == NULL, ("%s: dyn is %p", __func__, aw->p.dyn));
 
 	if ((dyn = malloc(sizeof(*dyn), PFI_MTYPE, M_NOWAIT | M_ZERO)) == NULL)
 		return (ENOMEM);
 
 	if ((kif = pf_kkif_create(M_NOWAIT)) == NULL) {
 		free(dyn, PFI_MTYPE);
 		return (ENOMEM);
 	}
 
 	if (!strcmp(aw->v.ifname, "self"))
 		dyn->pfid_kif = pfi_kkif_attach(kif, IFG_ALL);
 	else
 		dyn->pfid_kif = pfi_kkif_attach(kif, aw->v.ifname);
 	/* pfi_kkif_attach() consumed kif; only pfid_kif is valid now. */
 	kif = NULL;
 	pfi_kkif_ref(dyn->pfid_kif);
 
 	dyn->pfid_net = pfi_unmask(&aw->v.a.mask);
 	/* A full IPv4 mask is stored as 128, which below means "no /N". */
 	if (af == AF_INET && dyn->pfid_net == 32)
 		dyn->pfid_net = 128;
 	/* Table name: ifname[:network][:broadcast][:peer][:0][/N]. */
 	strlcpy(tblname, aw->v.ifname, sizeof(tblname));
 	if (aw->iflags & PFI_AFLAG_NETWORK)
 		strlcat(tblname, ":network", sizeof(tblname));
 	if (aw->iflags & PFI_AFLAG_BROADCAST)
 		strlcat(tblname, ":broadcast", sizeof(tblname));
 	if (aw->iflags & PFI_AFLAG_PEER)
 		strlcat(tblname, ":peer", sizeof(tblname));
 	if (aw->iflags & PFI_AFLAG_NOALIAS)
 		strlcat(tblname, ":0", sizeof(tblname));
 	if (dyn->pfid_net != 128)
 		snprintf(tblname + strlen(tblname),
 		    sizeof(tblname) - strlen(tblname), "/%d", dyn->pfid_net);
 	if ((ruleset = pf_find_or_create_kruleset(PF_RESERVED_ANCHOR)) == NULL) {
 		rv = ENOMEM;
 		goto _bad;
 	}
 
 	if ((dyn->pfid_kt = pfr_attach_table(ruleset, tblname)) == NULL) {
 		rv = ENOMEM;
 		goto _bad;
 	}
 
 	dyn->pfid_kt->pfrkt_flags |= PFR_TFLAG_ACTIVE;
 	dyn->pfid_iflags = aw->iflags;
 	dyn->pfid_af = af;
 
 	TAILQ_INSERT_TAIL(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
 	aw->p.dyn = dyn;
 	/* pfi_kkif_update() asserts the net epoch, so enter it here. */
 	NET_EPOCH_ENTER(et);
 	pfi_kkif_update(dyn->pfid_kif);
 	NET_EPOCH_EXIT(et);
 
 	return (0);
 
 _bad:
 	/* Undo in reverse: table, ruleset, kif reference, then dyn itself. */
 	if (dyn->pfid_kt != NULL)
 		pfr_detach_table(dyn->pfid_kt);
 	if (ruleset != NULL)
 		pf_remove_if_empty_kruleset(ruleset);
 	pfi_kkif_unref(dyn->pfid_kif);
 	free(dyn, PFI_MTYPE);
 
 	return (rv);
 }
 
 /*
  * Refresh everything that hangs off kif: update each of its dynamic
  * addresses, push a group kif's flags onto the kifs of its member
  * interfaces, and, for an interface kif, repeat the update for the kif
  * of every group the interface belongs to.  Must run inside the net
  * epoch with the rules write lock held.
  */
 static void
 pfi_kkif_update(struct pfi_kkif *kif)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_member	*ifgm;
 	struct pfi_dynaddr	*p;
 	struct pfi_kkif		*tmpkif;
 
 	NET_EPOCH_ASSERT();
 	PF_RULES_WASSERT();
 
 	/* update all dynaddr */
 	TAILQ_FOREACH(p, &kif->pfik_dynaddrs, entry)
 		pfi_dynaddr_update(p);
 
 	/* Apply group flags to new members. */
 	if (kif->pfik_group != NULL) {
 		CK_STAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members,
 		    ifgm_next) {
 			tmpkif = (struct pfi_kkif *)ifgm->ifgm_ifp->if_pf_kif;
 			if (tmpkif == NULL)
 				continue;
 
 			tmpkif->pfik_flags |= kif->pfik_flags;
 		}
 	}
 
 	/* again for all groups kif is member of */
 	if (kif->pfik_ifp != NULL) {
 		/*
 		 * NOTE(review): ifg_pf_kif is passed on without a NULL
 		 * check; confirm every group reachable here has a kif.
 		 */
 		CK_STAILQ_FOREACH(ifgl, &kif->pfik_ifp->if_groups, ifgl_next)
 			pfi_kkif_update((struct pfi_kkif *)
 			    ifgl->ifgl_group->ifg_pf_kif);
 	}
 }
 
 /*
  * Bring one dynamic address up to date.  The backing table is rebuilt
  * only if its pfrkt_larg stamp differs from the current V_pfi_update
  * generation; pfr_dynaddr_update() is then called in either case.
  * Requires the rules write lock.
  */
 static void
 pfi_dynaddr_update(struct pfi_dynaddr *dyn)
 {
 	struct pfi_kkif		*kif;
 	struct pfr_ktable	*kt;
 
 	PF_RULES_WASSERT();
 	KASSERT(dyn && dyn->pfid_kif && dyn->pfid_kt,
 	    ("%s: bad argument", __func__));
 
 	kif = dyn->pfid_kif;
 	kt = dyn->pfid_kt;
 
 	if (kt->pfrkt_larg != V_pfi_update) {
 		/* this table needs to be brought up-to-date */
 		pfi_table_update(kt, kif, dyn->pfid_net, dyn->pfid_iflags);
 		kt->pfrkt_larg = V_pfi_update;
 	}
 	pfr_dynaddr_update(kt, dyn);
 }
 
 /*
  * Rebuild table kt from the current addresses of kif.  The addresses of
  * the kif's interface, or of every member interface if the kif is a
  * group, are collected into V_pfi_buffer and then installed with
  * pfr_set_addrs().  A failure there is only logged.  Must run inside
  * the net epoch.
  */
 static void
 pfi_table_update(struct pfr_ktable *kt, struct pfi_kkif *kif, int net, int flags)
 {
 	int			 e, size2 = 0;
 	struct ifg_member	*ifgm;
 
 	NET_EPOCH_ASSERT();
 
 	/* Start collecting from an empty buffer. */
 	V_pfi_buffer_cnt = 0;
 
 	if (kif->pfik_ifp != NULL)
 		pfi_instance_add(kif->pfik_ifp, net, flags);
 	else if (kif->pfik_group != NULL) {
 		CK_STAILQ_FOREACH(ifgm, &kif->pfik_group->ifg_members, ifgm_next)
 			pfi_instance_add(ifgm->ifgm_ifp, net, flags);
 	}
 
 	if ((e = pfr_set_addrs(&kt->pfrkt_t, V_pfi_buffer, V_pfi_buffer_cnt, &size2,
 	    NULL, NULL, NULL, 0, PFR_TFLAG_ALLMASK)))
 		printf("%s: cannot set %d new addresses into table %s: %d\n",
 		    __func__, V_pfi_buffer_cnt, kt->pfrkt_name, e);
 }
 
 /*
  * Append the addresses of ifp that satisfy the modifier flags to the
  * address buffer via pfi_address_add().
  *
  * Only AF_INET and AF_INET6 addresses are considered.  net is the
  * requested prefix length; the value 128 together with
  * PFI_AFLAG_NETWORK means "use the address's own netmask".  Depending
  * on flags, the broadcast address, the peer (destination) address or
  * the interface address itself is added.  Must run inside the net
  * epoch.
  */
 static void
 pfi_instance_add(struct ifnet *ifp, int net, int flags)
 {
 	struct ifaddr	*ia;
 	int		 got4 = 0, got6 = 0;
 	int		 net2, af;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) {
 		if (ia->ifa_addr == NULL)
 			continue;
 		af = ia->ifa_addr->sa_family;
 		if (af != AF_INET && af != AF_INET6)
 			continue;
 		/*
 		 * XXX: For point-to-point interfaces, (ifname:0) and IPv4,
 		 *      jump over addresses without a proper route to work
 		 *      around a problem with ppp not fully removing the
 		 *      address used during IPCP.
 		 */
 		if ((ifp->if_flags & IFF_POINTOPOINT) &&
 		    !(ia->ifa_flags & IFA_ROUTE) &&
 		    (flags & PFI_AFLAG_NOALIAS) && (af == AF_INET))
 			continue;
 		/* :broadcast never yields IPv6 addresses. */
 		if ((flags & PFI_AFLAG_BROADCAST) && af == AF_INET6)
 			continue;
 		/* :broadcast / :peer need an interface of the matching kind. */
 		if ((flags & PFI_AFLAG_BROADCAST) &&
 		    !(ifp->if_flags & IFF_BROADCAST))
 			continue;
 		if ((flags & PFI_AFLAG_PEER) &&
 		    !(ifp->if_flags & IFF_POINTOPOINT))
 			continue;
 		/* :network and :0 skip IPv6 link-local addresses. */
 		if ((flags & (PFI_AFLAG_NETWORK | PFI_AFLAG_NOALIAS)) &&
 		    af == AF_INET6 &&
 		    IN6_IS_ADDR_LINKLOCAL(
 		    &((struct sockaddr_in6 *)ia->ifa_addr)->sin6_addr))
 			continue;
 		/* :0 keeps only the first address seen per family. */
 		if (flags & PFI_AFLAG_NOALIAS) {
 			if (af == AF_INET && got4)
 				continue;
 			if (af == AF_INET6 && got6)
 				continue;
 		}
 		if (af == AF_INET)
 			got4 = 1;
 		else if (af == AF_INET6)
 			got6 = 1;
 		net2 = net;
 		if (net2 == 128 && (flags & PFI_AFLAG_NETWORK)) {
 			/* NOTE(review): ifa_netmask is used unchecked here. */
 			if (af == AF_INET)
 				net2 = pfi_unmask(&((struct sockaddr_in *)
 				    ia->ifa_netmask)->sin_addr);
 			else if (af == AF_INET6)
 				net2 = pfi_unmask(&((struct sockaddr_in6 *)
 				    ia->ifa_netmask)->sin6_addr);
 		}
 		if (af == AF_INET && net2 > 32)
 			net2 = 32;
 		if (flags & PFI_AFLAG_BROADCAST)
 			pfi_address_add(ia->ifa_broadaddr, af, net2);
 		else if (flags & PFI_AFLAG_PEER)
 			pfi_address_add(ia->ifa_dstaddr, af, net2);
 		else
 			pfi_address_add(ia->ifa_addr, af, net2);
 	}
 }
 
 /*
  * Append one address with prefix length net to V_pfi_buffer.
  *
  * When the buffer is full it is doubled with an M_NOWAIT allocation,
  * up to PFI_BUFFER_MAX entries; if that limit would be exceeded or the
  * allocation fails, a message is printed and the address is dropped.
  * The stored address has its host bits cleared.
  */
 static void
 pfi_address_add(struct sockaddr *sa, int af, int net)
 {
 	struct pfr_addr	*p;
 	int		 i;
 
 	if (V_pfi_buffer_cnt >= V_pfi_buffer_max) {
 		int		 new_max = V_pfi_buffer_max * 2;
 
 		if (new_max > PFI_BUFFER_MAX) {
 			printf("%s: address buffer full (%d/%d)\n", __func__,
 			    V_pfi_buffer_cnt, PFI_BUFFER_MAX);
 			return;
 		}
 		p = malloc(new_max * sizeof(*V_pfi_buffer), PFI_MTYPE,
 		    M_NOWAIT);
 		if (p == NULL) {
 			printf("%s: no memory to grow buffer (%d/%d)\n",
 			    __func__, V_pfi_buffer_cnt, PFI_BUFFER_MAX);
 			return;
 		}
 		memcpy(p, V_pfi_buffer, V_pfi_buffer_max * sizeof(*V_pfi_buffer));
 		/* no need to zero buffer */
 		free(V_pfi_buffer, PFI_MTYPE);
 		V_pfi_buffer = p;
 		V_pfi_buffer_max = new_max;
 	}
 	if (af == AF_INET && net > 32)
 		net = 128;
 	p = V_pfi_buffer + V_pfi_buffer_cnt++;
 	memset(p, 0, sizeof(*p));
 	p->pfra_af = af;
 	p->pfra_net = net;
 	if (af == AF_INET)
 		p->pfra_ip4addr = ((struct sockaddr_in *)sa)->sin_addr;
 	else if (af == AF_INET6) {
 		p->pfra_ip6addr = ((struct sockaddr_in6 *)sa)->sin6_addr;
 		/* Strip the scope id embedded in the second 16-bit word. */
 		if (IN6_IS_SCOPE_EMBED(&p->pfra_ip6addr))
 			p->pfra_ip6addr.s6_addr16[1] = 0;
 	}
 	/* mask network address bits */
 	/*
 	 * The byte indexing below starts at the beginning of *p, so it
 	 * presumably relies on the address union (pfra_u) being the first
 	 * member of struct pfr_addr -- TODO confirm.
 	 */
 	if (net < 128)
 		((caddr_t)p)[p->pfra_net/8] &= ~(0xFF >> (p->pfra_net%8));
 	for (i = (p->pfra_net+7)/8; i < sizeof(p->pfra_u); i++)
 		((caddr_t)p)[i] = 0;
 }
 
 /*
  * Tear down a dynamic address: unlink it from its kif's list, drop the
  * kif's rule reference, detach the backing table and free dyn.  Both
  * pfid_kif and pfid_kt must be set.  pfi_kkif_unref() asserts the rules
  * write lock, so the caller must hold it.
  */
 void
 pfi_dynaddr_remove(struct pfi_dynaddr *dyn)
 {
 
 	KASSERT(dyn->pfid_kif != NULL, ("%s: null pfid_kif", __func__));
 	KASSERT(dyn->pfid_kt != NULL, ("%s: null pfid_kt", __func__));
 
 	TAILQ_REMOVE(&dyn->pfid_kif->pfik_dynaddrs, dyn, entry);
 	pfi_kkif_unref(dyn->pfid_kif);
 	pfr_detach_table(dyn->pfid_kt);
 	free(dyn, PFI_MTYPE);
 }
 
 void
 pfi_dynaddr_copyout(struct pf_addr_wrap *aw)
 {
 
 	KASSERT(aw->type == PF_ADDR_DYNIFTL,
 	    ("%s: type %u", __func__, aw->type));
 
 	if (aw->p.dyn == NULL || aw->p.dyn->pfid_kif == NULL)
 		return;
 	aw->p.dyncnt = aw->p.dyn->pfid_acnt4 + aw->p.dyn->pfid_acnt6;
 }
 
 /*
  * Order two kifs by name, comparing at most IFNAMSIZ bytes of
  * pfik_name.  Returns the strncmp() result.
  */
 static int
 pfi_kkif_compare(struct pfi_kkif *p, struct pfi_kkif *q)
 {
 	return (strncmp(p->pfik_name, q->pfik_name, IFNAMSIZ));
 }
 
 /*
  * Collect or clear the statistics of the kif called name.
  *
  * With pfs != NULL its pcounters/bcounters are zeroed and then filled
  * with the sums over the interface, or over all member interfaces if
  * name is a group.  With pfs == NULL the same kifs have their counters
  * reset through pf_kkif_zero() instead.  An unknown name does nothing
  * beyond the initial zeroing of pfs.
  */
 void
 pfi_update_status(const char *name, struct pf_status *pfs)
 {
 	struct pfi_kkif		*p;
 	struct pfi_kif_cmp	 key;
 	struct ifg_member	 p_member, *ifgm;
 	CK_STAILQ_HEAD(, ifg_member) ifg_members;
 	int			 i, j, k;
 
 	if (pfs) {
 		memset(pfs->pcounters, 0, sizeof(pfs->pcounters));
 		memset(pfs->bcounters, 0, sizeof(pfs->bcounters));
 	}
 
 	/* Only the name is filled in; the rest of key is left unset. */
 	strlcpy(key.pfik_name, name, sizeof(key.pfik_name));
 	p = RB_FIND(pfi_ifhead, &V_pfi_ifs, (struct pfi_kkif *)&key);
 	if (p == NULL) {
 		return;
 	}
 
 	if (p->pfik_group != NULL) {
 		/* Walk the group's members through a copy of its list head. */
 		memcpy(&ifg_members, &p->pfik_group->ifg_members,
 		    sizeof(ifg_members));
 	} else {
 		/* build a temporary list for p only */
 		memset(&p_member, 0, sizeof(p_member));
 		p_member.ifgm_ifp = p->pfik_ifp;
 		CK_STAILQ_INIT(&ifg_members);
 		CK_STAILQ_INSERT_TAIL(&ifg_members, &p_member, ifgm_next);
 	}
 	CK_STAILQ_FOREACH(ifgm, &ifg_members, ifgm_next) {
 		if (ifgm->ifgm_ifp == NULL || ifgm->ifgm_ifp->if_pf_kif == NULL)
 			continue;
 		p = (struct pfi_kkif *)ifgm->ifgm_ifp->if_pf_kif;
 
 		/* just clear statistics */
 		if (pfs == NULL) {
 			pf_kkif_zero(p);
 			continue;
 		}
 		/*
 		 * bcounters is indexed by [i][j] only, so the byte counts
 		 * of both k values accumulate into the same slot.
 		 */
 		for (i = 0; i < 2; i++)
 			for (j = 0; j < 2; j++)
 				for (k = 0; k < 2; k++) {
 					pfs->pcounters[i][j][k] +=
 					    pf_counter_u64_fetch(&p->pfik_packets[i][j][k]);
 					pfs->bcounters[i][j] +=
 					    pf_counter_u64_fetch(&p->pfik_bytes[i][j][k]);
 				}
 	}
 }
 
 /*
  * Fill the exported representation *kif from the in-kernel *kkif:
  * name, a snapshot of every packet/byte counter, flags, clear time and
  * rule reference count.  *kif is zeroed first.
  */
 static void
 pf_kkif_to_kif(struct pfi_kkif *kkif, struct pfi_kif *kif)
 {
 
 	memset(kif, 0, sizeof(*kif));
 	strlcpy(kif->pfik_name, kkif->pfik_name, sizeof(kif->pfik_name));
 	for (int i = 0; i < 2; i++) {
 		for (int j = 0; j < 2; j++) {
 			for (int k = 0; k < 2; k++) {
 				kif->pfik_packets[i][j][k] =
 				    pf_counter_u64_fetch(&kkif->pfik_packets[i][j][k]);
 				kif->pfik_bytes[i][j][k] =
 				    pf_counter_u64_fetch(&kkif->pfik_bytes[i][j][k]);
 			}
 		}
 	}
 	kif->pfik_flags = kkif->pfik_flags;
 	kif->pfik_tzero = kkif->pfik_tzero;
 	kif->pfik_rulerefs = kkif->pfik_rulerefs;
 	/*
 	 * Userspace relies on this pointer to decide if this is a group or
 	 * not. We don't want to share the actual pointer, because it's
 	 * useless to userspace and leaks kernel memory layout information.
 	 * So instead we provide 0xfeedc0de as 'true' and NULL as 'false'.
 	 */
 	kif->pfik_group =
 	    kkif->pfik_group ? (struct ifg_group *)0xfeedc0de : NULL;
 }
 
 void
 pfi_get_ifaces(const char *name, struct pfi_kif *buf, int *size)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif	*p, *nextp;
 	int		 n = 0;
 
 	NET_EPOCH_ENTER(et);
 	for (p = RB_MIN(pfi_ifhead, &V_pfi_ifs); p; p = nextp) {
 		nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p);
 		if (pfi_skip_if(name, p))
 			continue;
 		if (*size <= n++)
 			break;
 		if (!p->pfik_tzero)
 			p->pfik_tzero = time_second;
 		pf_kkif_to_kif(p, buf++);
 		nextp = RB_NEXT(pfi_ifhead, &V_pfi_ifs, p);
 	}
 	*size = n;
 	NET_EPOCH_EXIT(et);
 }
 
 static int
 pfi_skip_if(const char *filter, struct pfi_kkif *p)
 {
 	struct ifg_list *i;
 	int	n;
 
 	NET_EPOCH_ASSERT();
 
 	if (filter == NULL || !*filter)
 		return (0);
 	if (!strcmp(p->pfik_name, filter))
 		return (0);	/* exact match */
 	n = strlen(filter);
 	if (n < 1 || n >= IFNAMSIZ)
 		return (1);	/* sanity check */
 	if (filter[n-1] >= '0' && filter[n-1] <= '9')
 		return (1);	/* group names may not end in a digit */
 	if (p->pfik_ifp == NULL)
 		return (1);
 	CK_STAILQ_FOREACH(i, &p->pfik_ifp->if_groups, ifgl_next)
 		if (!strncmp(i->ifgl_group->ifg_group, filter, IFNAMSIZ))
 			return (0); /* iface is in group "filter" */
 	return (1);
 }
 
 /*
  * OR flags into every kif selected by name (see pfi_skip_if()).  A kif
  * for name itself is created first if none exists, so the flags stick
  * even when no such interface is present yet.  Returns 0, or ENOMEM if
  * the spare kif cannot be allocated.  pfi_kkif_attach() asserts the
  * rules write lock, so the caller must hold it.
  */
 int
 pfi_set_flags(const char *name, int flags)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif	*p, *kif;
 
 	kif = pf_kkif_create(M_NOWAIT);
 	if (kif == NULL)
 		return (ENOMEM);
 
 	NET_EPOCH_ENTER(et);
 
 	/* Consumes the spare; the result is picked up by the walk below. */
 	kif = pfi_kkif_attach(kif, name);
 
 	RB_FOREACH(p, pfi_ifhead, &V_pfi_ifs) {
 		if (pfi_skip_if(name, p))
 			continue;
 		p->pfik_flags |= flags;
 	}
 	NET_EPOCH_EXIT(et);
 	return (0);
 }
 
 /*
  * Clear flags on every kif selected by name (see pfi_skip_if()).  A
  * kif that ends up with no interface, no group, no flags and no rule
  * references is removed from the tree and freed on the spot.  Always
  * returns 0.
  */
 int
 pfi_clear_flags(const char *name, int flags)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif *p, *tmp;
 
 	NET_EPOCH_ENTER(et);
 	/* _SAFE variant: the current node may be removed inside the loop. */
 	RB_FOREACH_SAFE(p, pfi_ifhead, &V_pfi_ifs, tmp) {
 		if (pfi_skip_if(name, p))
 			continue;
 		p->pfik_flags &= ~flags;
 
 		if (p->pfik_ifp == NULL && p->pfik_group == NULL &&
 		    p->pfik_flags == 0 && p->pfik_rulerefs == 0) {
 			/* Delete this kif. */
 			RB_REMOVE(pfi_ifhead, &V_pfi_ifs, p);
 			pf_kkif_free(p);
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	return (0);
 }
 
 /* from pf_print_state.c */
 /*
  * Convert a network mask to a prefix length: count the leading one
  * bits of the mask that addr points to, read as up to four 32-bit
  * words in network byte order.  Counting stops at the first word that
  * is not all ones.
  */
 static int
 pfi_unmask(void *addr)
 {
 	struct pf_addr *m = addr;
 	int i, j = 0, b = 0;
 	u_int32_t tmp;
 
 	/* Whole words of ones contribute 32 bits each. */
 	while (j < 4 && m->addr32[j] == 0xffffffff) {
 		b += 32;
 		j++;
 	}
 	if (j < 4) {
 		/*
 		 * This word has at least one zero bit, so the scan stops
 		 * before i goes negative.  The probe bit is unsigned:
 		 * shifting a signed 1 into bit 31 is undefined.
 		 */
 		tmp = ntohl(m->addr32[j]);
 		for (i = 31; tmp & (1U << i); --i)
 			b++;
 	}
 	return (b);
 }
 
 /*
  * ifnet arrival hook: bind the new interface to a kif and, with ALTQ,
  * notify the queueing code.  Does nothing while the vnet's pf instance
  * is not active.
  */
 static void
 pfi_attach_ifnet_event(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif *kif;
 
 	if (V_pf_vnet_active == 0) {
 		/* Avoid teardown race in the least expensive way. */
 		return;
 	}
 	kif = pf_kkif_create(M_NOWAIT);
 	NET_EPOCH_ENTER(et);
 	PF_RULES_WLOCK();
 	/*
 	 * The allocation above is M_NOWAIT and may fail.  pfi_kkif_attach()
 	 * requires a non-NULL kif, so skip the attach in that case rather
 	 * than hand it a NULL pointer; the interface is then left without
 	 * a kif.
 	 */
 	if (kif != NULL)
 		pfi_attach_ifnet(ifp, kif);
 	else
 		printf("%s: cannot allocate kif for %s\n", __func__,
 		    ifp->if_xname);
 #ifdef ALTQ
 	pf_altq_ifnet_event(ifp, 0);
 #endif
 	PF_RULES_WUNLOCK();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * ifnet departure hook: let pfsync know (if its hook is set), then
  * break the link between ifp and its kif, release the interface
  * reference held by the kif and retire the kif if nothing else pins
  * it.  Does nothing further when ifp has no kif or the vnet's pf
  * instance is not active.
  */
 static void
 pfi_detach_ifnet_event(void *arg __unused, struct ifnet *ifp)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif *kif = (struct pfi_kkif *)ifp->if_pf_kif;
 
 	if (pfsync_detach_ifnet_ptr)
 		pfsync_detach_ifnet_ptr(ifp);
 
 	if (kif == NULL)
 		return;
 
 	if (V_pf_vnet_active == 0) {
 		/* Avoid teardown race in the least expensive way. */
 		return;
 	}
 
 	NET_EPOCH_ENTER(et);
 	PF_RULES_WLOCK();
 	/* New generation so dependent tables are rebuilt by the update. */
 	V_pfi_update++;
 	pfi_kkif_update(kif);
 
 	if (kif->pfik_ifp)
 		if_rele(kif->pfik_ifp);
 
 	kif->pfik_ifp = NULL;
 	ifp->if_pf_kif = NULL;
 #ifdef ALTQ
 	pf_altq_ifnet_event(ifp, 1);
 #endif
 	pfi_kkif_remove_if_unref(kif);
 
 	PF_RULES_WUNLOCK();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Group attach hook: bind the new interface group to a kif.  The spare
  * kif is allocated with M_WAITOK before the epoch and the rules write
  * lock are entered.  Does nothing while the vnet's pf instance is not
  * active.
  */
 static void
 pfi_attach_group_event(void *arg __unused, struct ifg_group *ifg)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif *kif;
 
 	if (V_pf_vnet_active == 0) {
 		/* Avoid teardown race in the least expensive way. */
 		return;
 	}
 	kif = pf_kkif_create(M_WAITOK);
 	NET_EPOCH_ENTER(et);
 	PF_RULES_WLOCK();
 	pfi_attach_ifgroup(ifg, kif);
 	PF_RULES_WUNLOCK();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Group change hook: make sure a kif exists for group gname and refresh
  * it under a new V_pfi_update generation.  Does nothing while the
  * vnet's pf instance is not active.
  */
 static void
 pfi_change_group_event(void *arg __unused, char *gname)
 {
 	struct epoch_tracker et;
 	struct pfi_kkif *kif;
 
 	if (V_pf_vnet_active == 0) {
 		/* Avoid teardown race in the least expensive way. */
 		return;
 	}
 
 	kif = pf_kkif_create(M_WAITOK);
 	NET_EPOCH_ENTER(et);
 	PF_RULES_WLOCK();
 	V_pfi_update++;
 	/* Consumes the spare; continue with whichever kif is registered. */
 	kif = pfi_kkif_attach(kif, gname);
 	pfi_kkif_update(kif);
 	PF_RULES_WUNLOCK();
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * Group detach hook: break the link between ifg and its kif and retire
  * the kif if nothing else pins it.  Does nothing when the group has no
  * kif or the vnet's pf instance is not active.
  */
 static void
 pfi_detach_group_event(void *arg __unused, struct ifg_group *ifg)
 {
 	struct pfi_kkif *kif = (struct pfi_kkif *)ifg->ifg_pf_kif;
 
 	if (kif == NULL)
 		return;
 
 	if (V_pf_vnet_active == 0) {
 		/* Avoid teardown race in the least expensive way. */
 		return;
 	}
 	PF_RULES_WLOCK();
 	V_pfi_update++;
 
 	kif->pfik_group = NULL;
 	ifg->ifg_pf_kif = NULL;
 
 	pfi_kkif_remove_if_unref(kif);
 	PF_RULES_WUNLOCK();
 }
 
 /*
  * Interface address change hook: refresh the kif of ifp under a new
  * V_pfi_update generation.  Does nothing when ifp has no kif or the
  * vnet's pf instance is not active.
  */
 static void
 pfi_ifaddr_event(void *arg __unused, struct ifnet *ifp)
 {
 
 	KASSERT(ifp, ("ifp == NULL"));
 
 	/* Cheap unlocked pre-check; repeated below under the lock. */
 	if (ifp->if_pf_kif == NULL)
 		return;
 
 	if (V_pf_vnet_active == 0) {
 		/* Avoid teardown race in the least expensive way. */
 		return;
 	}
 	PF_RULES_WLOCK();
 	if (ifp->if_pf_kif) {
 		struct epoch_tracker et;
 
 		V_pfi_update++;
 		NET_EPOCH_ENTER(et);
 		pfi_kkif_update(ifp->if_pf_kif);
 		NET_EPOCH_EXIT(et);
 	}
 	PF_RULES_WUNLOCK();
 }
diff --git a/sys/netpfil/pf/pf_ioctl.c b/sys/netpfil/pf/pf_ioctl.c
index dc62388f8da4..76742aebf01a 100644
--- a/sys/netpfil/pf/pf_ioctl.c
+++ b/sys/netpfil/pf/pf_ioctl.c
@@ -1,6864 +1,6865 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2001 Daniel Hartmeier
  * Copyright (c) 2002,2003 Henning Brauer
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  *    - Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *    - Redistributions in binary form must reproduce the above
  *      copyright notice, this list of conditions and the following
  *      disclaimer in the documentation and/or other materials provided
  *      with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Effort sponsored in part by the Defense Advanced Research Projects
  * Agency (DARPA) and Air Force Research Laboratory, Air Force
  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
  *
  *	$OpenBSD: pf_ioctl.c,v 1.213 2009/02/15 21:46:12 mbalmer Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_bpf.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/_bitset.h>
 #include <sys/bitset.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/endian.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/hash.h>
 #include <sys/interrupt.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/nv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/md5.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
+#include <net/if_private.h>
 #include <net/vnet.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/pfvar.h>
 #include <net/if_pfsync.h>
 #include <net/if_pflog.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/ip_icmp.h>
 #include <netpfil/pf/pf_nv.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif /* INET6 */
 
 #ifdef ALTQ
 #include <net/altq/altq.h>
 #endif
 
 SDT_PROBE_DEFINE3(pf, ioctl, ioctl, error, "int", "int", "int");
 SDT_PROBE_DEFINE3(pf, ioctl, function, error, "char *", "int", "int");
 SDT_PROBE_DEFINE2(pf, ioctl, addrule, error, "int", "int");
 SDT_PROBE_DEFINE2(pf, ioctl, nvchk, error, "int", "int");
 
 static struct pf_kpool	*pf_get_kpool(const char *, u_int32_t, u_int8_t,
 			    u_int32_t, u_int8_t, u_int8_t, u_int8_t);
 
 static void		 pf_mv_kpool(struct pf_kpalist *, struct pf_kpalist *);
 static void		 pf_empty_kpool(struct pf_kpalist *);
 static int		 pfioctl(struct cdev *, u_long, caddr_t, int,
 			    struct thread *);
 static int		 pf_begin_eth(uint32_t *, const char *);
 static void		 pf_rollback_eth_cb(struct epoch_context *);
 static int		 pf_rollback_eth(uint32_t, const char *);
 static int		 pf_commit_eth(uint32_t, const char *);
 static void		 pf_free_eth_rule(struct pf_keth_rule *);
 #ifdef ALTQ
 static int		 pf_begin_altq(u_int32_t *);
 static int		 pf_rollback_altq(u_int32_t);
 static int		 pf_commit_altq(u_int32_t);
 static int		 pf_enable_altq(struct pf_altq *);
 static int		 pf_disable_altq(struct pf_altq *);
 static uint16_t		 pf_qname2qid(const char *);
 static void		 pf_qid_unref(uint16_t);
 #endif /* ALTQ */
 static int		 pf_begin_rules(u_int32_t *, int, const char *);
 static int		 pf_rollback_rules(u_int32_t, int, char *);
 static int		 pf_setup_pfsync_matching(struct pf_kruleset *);
 static void		 pf_hash_rule_rolling(MD5_CTX *, struct pf_krule *);
 static void		 pf_hash_rule(struct pf_krule *);
 static void		 pf_hash_rule_addr(MD5_CTX *, struct pf_rule_addr *);
 static int		 pf_commit_rules(u_int32_t, int, char *);
 static int		 pf_addr_setup(struct pf_kruleset *,
 			    struct pf_addr_wrap *, sa_family_t);
 static void		 pf_addr_copyout(struct pf_addr_wrap *);
 static void		 pf_src_node_copy(const struct pf_ksrc_node *,
 			    struct pf_src_node *);
 #ifdef ALTQ
 static int		 pf_export_kaltq(struct pf_altq *,
 			    struct pfioc_altq_v1 *, size_t);
 static int		 pf_import_kaltq(struct pfioc_altq_v1 *,
 			    struct pf_altq *, size_t);
 #endif /* ALTQ */
 
 VNET_DEFINE(struct pf_krule,	pf_default_rule);
 
 static __inline int             pf_krule_compare(struct pf_krule *,
 				    struct pf_krule *);
 
 RB_GENERATE(pf_krule_global, pf_krule, entry_global, pf_krule_compare);
 
 #ifdef ALTQ
 VNET_DEFINE_STATIC(int,		pf_altq_running);
 #define	V_pf_altq_running	VNET(pf_altq_running)
 #endif
 
 #define	TAGID_MAX	 50000
 /*
  * One name <-> numeric tag pairing.  The two TAILQ linkages let an
  * entry sit on two lists at once; presumably one bucket each of the
  * namehash and taghash tables of a struct pf_tagset -- TODO confirm
  * against the tag lookup code.
  */
 struct pf_tagname {
 	TAILQ_ENTRY(pf_tagname)	namehash_entries;
 	TAILQ_ENTRY(pf_tagname)	taghash_entries;
 	char			name[PF_TAG_NAME_SIZE];
 	uint16_t		tag;
 	int			ref;	/* presumably a use count -- confirm */
 };
 
 /*
  * A set of tags: two arrays of pf_tagname list heads (namehash and
  * taghash), plus a mask, a seed and a bitset with TAGID_MAX bits.
  * NOTE(review): mask and seed look like hash parameters and avail like
  * a free-tag-id map; confirm against pf_init_tagset().
  */
 struct pf_tagset {
 	TAILQ_HEAD(, pf_tagname)	*namehash;
 	TAILQ_HEAD(, pf_tagname)	*taghash;
 	unsigned int			 mask;
 	uint32_t			 seed;
 	BITSET_DEFINE(, TAGID_MAX)	 avail;
 };
 
 VNET_DEFINE(struct pf_tagset, pf_tags);
 #define	V_pf_tags	VNET(pf_tags)
 static unsigned int	pf_rule_tag_hashsize;
 #define	PF_RULE_TAG_HASH_SIZE_DEFAULT	128
 SYSCTL_UINT(_net_pf, OID_AUTO, rule_tag_hashsize, CTLFLAG_RDTUN,
     &pf_rule_tag_hashsize, PF_RULE_TAG_HASH_SIZE_DEFAULT,
     "Size of pf(4) rule tag hashtable");
 
 #ifdef ALTQ
 VNET_DEFINE(struct pf_tagset, pf_qids);
 #define	V_pf_qids	VNET(pf_qids)
 static unsigned int	pf_queue_tag_hashsize;
 #define	PF_QUEUE_TAG_HASH_SIZE_DEFAULT	128
 SYSCTL_UINT(_net_pf, OID_AUTO, queue_tag_hashsize, CTLFLAG_RDTUN,
     &pf_queue_tag_hashsize, PF_QUEUE_TAG_HASH_SIZE_DEFAULT,
     "Size of pf(4) queue tag hashtable");
 #endif
 VNET_DEFINE(uma_zone_t,	 pf_tag_z);
 #define	V_pf_tag_z		 VNET(pf_tag_z)
 static MALLOC_DEFINE(M_PFALTQ, "pf_altq", "pf(4) altq configuration db");
 static MALLOC_DEFINE(M_PFRULE, "pf_rule", "pf(4) rules");
 
 #if (PF_QNAME_SIZE != PF_TAG_NAME_SIZE)
 #error PF_QNAME_SIZE must be equal to PF_TAG_NAME_SIZE
 #endif
 
 static void		 pf_init_tagset(struct pf_tagset *, unsigned int *,
 			    unsigned int);
 static void		 pf_cleanup_tagset(struct pf_tagset *);
 static uint16_t		 tagname2hashindex(const struct pf_tagset *, const char *);
 static uint16_t		 tag2hashindex(const struct pf_tagset *, uint16_t);
 static u_int16_t	 tagname2tag(struct pf_tagset *, const char *);
 static u_int16_t	 pf_tagname2tag(const char *);
 static void		 tag_unref(struct pf_tagset *, u_int16_t);
 
 #define DPFPRINTF(n, x) if (V_pf_status.debug >= (n)) printf x
 
 struct cdev *pf_dev;
 
 /*
  * XXX - These are new and need to be checked when moving to a new version
  */
 static void		 pf_clear_all_states(void);
 static unsigned int	 pf_clear_states(const struct pf_kstate_kill *);
 static void		 pf_killstates(struct pf_kstate_kill *,
 			    unsigned int *);
 static int		 pf_killstates_row(struct pf_kstate_kill *,
 			    struct pf_idhash *);
 static int		 pf_killstates_nv(struct pfioc_nv *);
 static int		 pf_clearstates_nv(struct pfioc_nv *);
 static int		 pf_getstate(struct pfioc_nv *);
 static int		 pf_getstatus(struct pfioc_nv *);
 static int		 pf_clear_tables(void);
 static void		 pf_clear_srcnodes(struct pf_ksrc_node *);
 static void		 pf_kill_srcnodes(struct pfioc_src_node_kill *);
 static int		 pf_keepcounters(struct pfioc_nv *);
 static void		 pf_tbladdr_copyout(struct pf_addr_wrap *);
 
 /*
  * Wrapper functions for pfil(9) hooks
  */
 static pfil_return_t pf_eth_check_in(struct mbuf **m, struct ifnet *ifp,
     int flags, void *ruleset __unused, struct inpcb *inp);
 static pfil_return_t pf_eth_check_out(struct mbuf **m, struct ifnet *ifp,
     int flags, void *ruleset __unused, struct inpcb *inp);
 #ifdef INET
 static pfil_return_t pf_check_in(struct mbuf **m, struct ifnet *ifp,
     int flags, void *ruleset __unused, struct inpcb *inp);
 static pfil_return_t pf_check_out(struct mbuf **m, struct ifnet *ifp,
     int flags, void *ruleset __unused, struct inpcb *inp);
 #endif
 #ifdef INET6
 static pfil_return_t pf_check6_in(struct mbuf **m, struct ifnet *ifp,
     int flags, void *ruleset __unused, struct inpcb *inp);
 static pfil_return_t pf_check6_out(struct mbuf **m, struct ifnet *ifp,
     int flags, void *ruleset __unused, struct inpcb *inp);
 #endif
 
 static void		hook_pf_eth(void);
 static void		hook_pf(void);
 static void		dehook_pf_eth(void);
 static void		dehook_pf(void);
 static int		shutdown_pf(void);
 static int		pf_load(void);
 static void		pf_unload(void);
 
 static struct cdevsw pf_cdevsw = {
 	.d_ioctl =	pfioctl,
 	.d_name =	PF_NAME,
 	.d_version =	D_VERSION,
 };
 
 VNET_DEFINE_STATIC(bool, pf_pfil_hooked);
 #define V_pf_pfil_hooked	VNET(pf_pfil_hooked)
 VNET_DEFINE_STATIC(bool, pf_pfil_eth_hooked);
 #define V_pf_pfil_eth_hooked	VNET(pf_pfil_eth_hooked)
 
 /*
  * We need a flag that is neither hooked nor running to know when
  * the VNET is "valid".  We primarily need this to control (global)
  * external event, e.g., eventhandlers.
  */
 VNET_DEFINE(int, pf_vnet_active);
 #define V_pf_vnet_active	VNET(pf_vnet_active)
 
 int pf_end_threads;
 struct proc *pf_purge_proc;
 
 struct rmlock			pf_rules_lock;
 struct sx			pf_ioctl_lock;
 struct sx			pf_end_lock;
 
 /* pfsync */
 VNET_DEFINE(pfsync_state_import_t *, pfsync_state_import_ptr);
 VNET_DEFINE(pfsync_insert_state_t *, pfsync_insert_state_ptr);
 VNET_DEFINE(pfsync_update_state_t *, pfsync_update_state_ptr);
 VNET_DEFINE(pfsync_delete_state_t *, pfsync_delete_state_ptr);
 VNET_DEFINE(pfsync_clear_states_t *, pfsync_clear_states_ptr);
 VNET_DEFINE(pfsync_defer_t *, pfsync_defer_ptr);
 pfsync_detach_ifnet_t *pfsync_detach_ifnet_ptr;
 
 /* pflog */
 pflog_packet_t			*pflog_packet_ptr = NULL;
 
 /*
  * Copy a user-provided string, returning an error if truncation would occur.
  * Avoid scanning past "sz" bytes in the source string since there's no
  * guarantee that it's nul-terminated.
  */
 static int
 pf_user_strcpy(char *dst, const char *src, size_t sz)
 {
 	if (strnlen(src, sz) == sz)
 		return (EINVAL);
 	(void)strlcpy(dst, src, sz);
 	return (0);
 }
 
 /*
  * Per-VNET attach routine.
  *
  * Zeroes V_pf_status, runs the pf subsystem initializers, sets the default
  * state and source-node limits, initializes the main ruleset and the
  * Ethernet ruleset, prepares the default rule (action, counters, timeouts),
  * allocates the status counters and finally registers the "pf send"
  * software interrupt handler.
  *
  * All allocations use M_WAITOK.  If swi_add() fails the function simply
  * returns, leaving everything allocated above in place (see the XXXGL note).
  */
 static void
 pfattach_vnet(void)
 {
 	u_int32_t *my_timeout = V_pf_default_rule.timeout;
 
 	bzero(&V_pf_status, sizeof(V_pf_status));
 
 	pf_initialize();
 	pfr_initialize();
 	pfi_initialize_vnet();
 	pf_normalize_init();
 	pf_syncookies_init();
 
 	V_pf_limits[PF_LIMIT_STATES].limit = PFSTATE_HIWAT;
 	V_pf_limits[PF_LIMIT_SRC_NODES].limit = PFSNODE_HIWAT;
 
 	RB_INIT(&V_pf_anchors);
 	pf_init_kruleset(&pf_main_ruleset);
 
 	pf_init_keth(V_pf_keth);
 
 	/* default rule should never be garbage collected */
 	V_pf_default_rule.entries.tqe_prev = &V_pf_default_rule.entries.tqe_next;
 	/* The default action is chosen at build time. */
 #ifdef PF_DEFAULT_TO_DROP
 	V_pf_default_rule.action = PF_DROP;
 #else
 	V_pf_default_rule.action = PF_PASS;
 #endif
 	V_pf_default_rule.nr = -1;
 	V_pf_default_rule.rtableid = -1;
 
 	/* Allocate the default rule's counters (index 0/1 of packets/bytes). */
 	pf_counter_u64_init(&V_pf_default_rule.evaluations, M_WAITOK);
 	for (int i = 0; i < 2; i++) {
 		pf_counter_u64_init(&V_pf_default_rule.packets[i], M_WAITOK);
 		pf_counter_u64_init(&V_pf_default_rule.bytes[i], M_WAITOK);
 	}
 	V_pf_default_rule.states_cur = counter_u64_alloc(M_WAITOK);
 	V_pf_default_rule.states_tot = counter_u64_alloc(M_WAITOK);
 	V_pf_default_rule.src_nodes = counter_u64_alloc(M_WAITOK);
 
 	V_pf_default_rule.timestamp = uma_zalloc_pcpu(pf_timestamp_pcpu_zone,
 	    M_WAITOK | M_ZERO);
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 	/*
 	 * Allocate the kif and rule marker entries and link them, together
 	 * with the default rule, onto the global kif/rule lists.
 	 */
 	V_pf_kifmarker = malloc(sizeof(*V_pf_kifmarker), PFI_MTYPE, M_WAITOK | M_ZERO);
 	V_pf_rulemarker = malloc(sizeof(*V_pf_rulemarker), M_PFRULE, M_WAITOK | M_ZERO);
 	PF_RULES_WLOCK();
 	LIST_INSERT_HEAD(&V_pf_allkiflist, V_pf_kifmarker, pfik_allkiflist);
 	LIST_INSERT_HEAD(&V_pf_allrulelist, &V_pf_default_rule, allrulelist);
 	V_pf_allrulecount++;
 	LIST_INSERT_HEAD(&V_pf_allrulelist, V_pf_rulemarker, allrulelist);
 	PF_RULES_WUNLOCK();
 #endif
 
 	/* initialize default timeouts */
 	my_timeout[PFTM_TCP_FIRST_PACKET] = PFTM_TCP_FIRST_PACKET_VAL;
 	my_timeout[PFTM_TCP_OPENING] = PFTM_TCP_OPENING_VAL;
 	my_timeout[PFTM_TCP_ESTABLISHED] = PFTM_TCP_ESTABLISHED_VAL;
 	my_timeout[PFTM_TCP_CLOSING] = PFTM_TCP_CLOSING_VAL;
 	my_timeout[PFTM_TCP_FIN_WAIT] = PFTM_TCP_FIN_WAIT_VAL;
 	my_timeout[PFTM_TCP_CLOSED] = PFTM_TCP_CLOSED_VAL;
 	my_timeout[PFTM_UDP_FIRST_PACKET] = PFTM_UDP_FIRST_PACKET_VAL;
 	my_timeout[PFTM_UDP_SINGLE] = PFTM_UDP_SINGLE_VAL;
 	my_timeout[PFTM_UDP_MULTIPLE] = PFTM_UDP_MULTIPLE_VAL;
 	my_timeout[PFTM_ICMP_FIRST_PACKET] = PFTM_ICMP_FIRST_PACKET_VAL;
 	my_timeout[PFTM_ICMP_ERROR_REPLY] = PFTM_ICMP_ERROR_REPLY_VAL;
 	my_timeout[PFTM_OTHER_FIRST_PACKET] = PFTM_OTHER_FIRST_PACKET_VAL;
 	my_timeout[PFTM_OTHER_SINGLE] = PFTM_OTHER_SINGLE_VAL;
 	my_timeout[PFTM_OTHER_MULTIPLE] = PFTM_OTHER_MULTIPLE_VAL;
 	my_timeout[PFTM_FRAG] = PFTM_FRAG_VAL;
 	my_timeout[PFTM_INTERVAL] = PFTM_INTERVAL_VAL;
 	my_timeout[PFTM_SRC_NODE] = PFTM_SRC_NODE_VAL;
 	my_timeout[PFTM_TS_DIFF] = PFTM_TS_DIFF_VAL;
 	my_timeout[PFTM_ADAPTIVE_START] = PFSTATE_ADAPT_START;
 	my_timeout[PFTM_ADAPTIVE_END] = PFSTATE_ADAPT_END;
 
 	V_pf_status.debug = PF_DEBUG_URGENT;
 
 	V_pf_pfil_hooked = false;
 	V_pf_pfil_eth_hooked = false;
 
 	/* XXX do our best to avoid a conflict */
 	V_pf_status.hostid = arc4random();
 
 	/* Allocate the per-category status counters. */
 	for (int i = 0; i < PFRES_MAX; i++)
 		V_pf_status.counters[i] = counter_u64_alloc(M_WAITOK);
 	for (int i = 0; i < KLCNT_MAX; i++)
 		V_pf_status.lcounters[i] = counter_u64_alloc(M_WAITOK);
 	for (int i = 0; i < FCNT_MAX; i++)
 		pf_counter_u64_init(&V_pf_status.fcounters[i], M_WAITOK);
 	for (int i = 0; i < SCNT_MAX; i++)
 		V_pf_status.scounters[i] = counter_u64_alloc(M_WAITOK);
 
 	/* Register pf_intr() as the "pf send" software interrupt handler. */
 	if (swi_add(&V_pf_swi_ie, "pf send", pf_intr, curvnet, SWI_NET,
 	    INTR_MPSAFE, &V_pf_swi_cookie) != 0)
 		/* XXXGL: leaked all above. */
 		return;
 }
 
 /*
  * Locate the address pool of a rule.
  *
  * "anchor" names the ruleset and "rule_action" selects the ruleset number
  * within it.  "active" chooses between the active and the inactive rule
  * queue.  When "check_ticket" is set, "ticket" must match the ticket of the
  * chosen queue.  With "r_last" set the last rule of the queue is used,
  * otherwise the rule whose number equals "rule_number".
  *
  * Returns a pointer to the rule's rpool, or NULL if the ruleset, the
  * ruleset number, the ticket or the rule cannot be matched.
  */
 static struct pf_kpool *
 pf_get_kpool(const char *anchor, u_int32_t ticket, u_int8_t rule_action,
     u_int32_t rule_number, u_int8_t r_last, u_int8_t active,
     u_int8_t check_ticket)
 {
 	struct pf_kruleset	*rs;
 	struct pf_krulequeue	*queue;
 	struct pf_krule		*r;
 	int			 idx;
 
 	if ((rs = pf_find_kruleset(anchor)) == NULL)
 		return (NULL);
 
 	idx = pf_get_ruleset_number(rule_action);
 	if (idx >= PF_RULESET_MAX)
 		return (NULL);
 
 	/* Validate the ticket against whichever queue was asked for. */
 	if (check_ticket && ticket != (active ?
 	    rs->rules[idx].active.ticket : rs->rules[idx].inactive.ticket))
 		return (NULL);
 
 	queue = active ? rs->rules[idx].active.ptr :
 	    rs->rules[idx].inactive.ptr;
 
 	if (r_last) {
 		r = TAILQ_LAST(queue, pf_krulequeue);
 	} else {
 		/* Walk the queue until the requested rule number shows up. */
 		for (r = TAILQ_FIRST(queue);
 		    r != NULL && r->nr != rule_number;
 		    r = TAILQ_NEXT(r, entries))
 			continue;
 	}
 
 	return (r != NULL ? &r->rpool : NULL);
 }
 
 /*
  * Move every pool address from "poola" to the tail of "poolb", keeping
  * the relative order.  "poola" is empty afterwards.
  */
 static void
 pf_mv_kpool(struct pf_kpalist *poola, struct pf_kpalist *poolb)
 {
 	struct pf_kpooladdr	*pa;
 
 	for (;;) {
 		pa = TAILQ_FIRST(poola);
 		if (pa == NULL)
 			break;
 		TAILQ_REMOVE(poola, pa, entries);
 		TAILQ_INSERT_TAIL(poolb, pa, entries);
 	}
 }
 
 static void
 pf_empty_kpool(struct pf_kpalist *poola)
 {
 	struct pf_kpooladdr *pa;
 
 	while ((pa = TAILQ_FIRST(poola)) != NULL) {
 		switch (pa->addr.type) {
 		case PF_ADDR_DYNIFTL:
 			pfi_dynaddr_remove(pa->addr.p.dyn);
 			break;
 		case PF_ADDR_TABLE:
 			/* XXX: this could be unfinished pooladdr on pabuf */
 			if (pa->addr.p.tbl != NULL)
 				pfr_detach_table(pa->addr.p.tbl);
 			break;
 		}
 		if (pa->kif)
 			pfi_kkif_unref(pa->kif);
 		TAILQ_REMOVE(poola, pa, entries);
 		free(pa, M_PFRULE);
 	}
 }
 
 /*
  * Move "rule" from "rulequeue" to the tail of the per-VNET list of unlinked
  * rules and set PFRULE_REFS in its rule_ref field.
  *
  * The caller must hold the rules write lock and the unlinked-rules lock
  * (both are asserted).
  */
 static void
 pf_unlink_rule_locked(struct pf_krulequeue *rulequeue, struct pf_krule *rule)
 {
 
 	PF_RULES_WASSERT();
 	PF_UNLNKDRULES_ASSERT();
 
 	TAILQ_REMOVE(rulequeue, rule, entries);
 
 	rule->rule_ref |= PFRULE_REFS;
 	TAILQ_INSERT_TAIL(&V_pf_unlinked_rules, rule, entries);
 }
 
 /*
  * Convenience wrapper around pf_unlink_rule_locked() that takes and drops
  * the unlinked-rules lock itself.  The caller must hold the rules write
  * lock (asserted).
  */
 static void
 pf_unlink_rule(struct pf_krulequeue *rulequeue, struct pf_krule *rule)
 {
 
 	PF_RULES_WASSERT();
 
 	PF_UNLNKDRULES_LOCK();
 	pf_unlink_rule_locked(rulequeue, rule);
 	PF_UNLNKDRULES_UNLOCK();
 }
 
 /*
  * Release an Ethernet rule and everything it references: tags, the ALTQ
  * queue id (when built with ALTQ), interface references, attached tables,
  * counters, the per-CPU timestamp and its anchor linkage.
  *
  * A NULL "rule" is accepted and ignored.  The rules write lock must be
  * held (asserted before the NULL check).
  */
 static void
 pf_free_eth_rule(struct pf_keth_rule *rule)
 {
 	int dir;
 
 	PF_RULES_WASSERT();
 
 	if (rule == NULL)
 		return;
 
 	/* Tag references. */
 	if (rule->tag != 0)
 		tag_unref(&V_pf_tags, rule->tag);
 	if (rule->match_tag != 0)
 		tag_unref(&V_pf_tags, rule->match_tag);
 #ifdef ALTQ
 	pf_qid_unref(rule->qid);
 #endif
 
 	/* Interface references. */
 	if (rule->bridge_to != NULL)
 		pfi_kkif_unref(rule->bridge_to);
 	if (rule->kif != NULL)
 		pfi_kkif_unref(rule->kif);
 
 	/* Table attachments of the IP source/destination matches. */
 	if (rule->ipsrc.addr.type == PF_ADDR_TABLE)
 		pfr_detach_table(rule->ipsrc.addr.p.tbl);
 	if (rule->ipdst.addr.type == PF_ADDR_TABLE)
 		pfr_detach_table(rule->ipdst.addr.p.tbl);
 
 	/* Counters and timestamp. */
 	counter_u64_free(rule->evaluations);
 	for (dir = 0; dir < 2; dir++) {
 		counter_u64_free(rule->packets[dir]);
 		counter_u64_free(rule->bytes[dir]);
 	}
 	uma_zfree_pcpu(pf_timestamp_pcpu_zone, rule->timestamp);
 
 	pf_keth_anchor_remove(rule);
 
 	free(rule, M_PFRULE);
 }
 
 void
 pf_free_rule(struct pf_krule *rule)
 {
 
 	PF_RULES_WASSERT();
 	PF_CONFIG_ASSERT();
 
 	if (rule->tag)
 		tag_unref(&V_pf_tags, rule->tag);
 	if (rule->match_tag)
 		tag_unref(&V_pf_tags, rule->match_tag);
 #ifdef ALTQ
 	if (rule->pqid != rule->qid)
 		pf_qid_unref(rule->pqid);
 	pf_qid_unref(rule->qid);
 #endif
 	switch (rule->src.addr.type) {
 	case PF_ADDR_DYNIFTL:
 		pfi_dynaddr_remove(rule->src.addr.p.dyn);
 		break;
 	case PF_ADDR_TABLE:
 		pfr_detach_table(rule->src.addr.p.tbl);
 		break;
 	}
 	switch (rule->dst.addr.type) {
 	case PF_ADDR_DYNIFTL:
 		pfi_dynaddr_remove(rule->dst.addr.p.dyn);
 		break;
 	case PF_ADDR_TABLE:
 		pfr_detach_table(rule->dst.addr.p.tbl);
 		break;
 	}
 	if (rule->overload_tbl)
 		pfr_detach_table(rule->overload_tbl);
 	if (rule->kif)
 		pfi_kkif_unref(rule->kif);
 	pf_kanchor_remove(rule);
 	pf_empty_kpool(&rule->rpool.list);
 
 	pf_krule_free(rule);
 }
 
 /*
  * Initialize tagset "ts".
  *
  * "*tunable_size" is the requested number of hash buckets; if it is zero
  * or not a power of two it is overwritten with "default_size".  Two hash
  * tables of that size are allocated (by name and by tag), the hash seed is
  * randomized and every tag ID is marked available.
  */
 static void
 pf_init_tagset(struct pf_tagset *ts, unsigned int *tunable_size,
     unsigned int default_size)
 {
 	unsigned int nbuckets;
 	unsigned int b;
 
 	/* Fall back to the default for unusable tunable values. */
 	if (!(*tunable_size != 0 && powerof2(*tunable_size)))
 		*tunable_size = default_size;
 	nbuckets = *tunable_size;
 
 	ts->namehash = mallocarray(nbuckets, sizeof(*ts->namehash), M_PFHASH,
 	    M_WAITOK);
 	ts->taghash = mallocarray(nbuckets, sizeof(*ts->taghash), M_PFHASH,
 	    M_WAITOK);
 	ts->mask = nbuckets - 1;
 	ts->seed = arc4random();
 
 	for (b = 0; b < nbuckets; b++) {
 		TAILQ_INIT(&ts->namehash[b]);
 		TAILQ_INIT(&ts->taghash[b]);
 	}
 
 	BIT_FILL(TAGID_MAX, &ts->avail);
 }
 
 static void
 pf_cleanup_tagset(struct pf_tagset *ts)
 {
 	unsigned int i;
 	unsigned int hashsize;
 	struct pf_tagname *t, *tmp;
 
 	/*
 	 * Only need to clean up one of the hashes as each tag is hashed
 	 * into each table.
 	 */
 	hashsize = ts->mask + 1;
 	for (i = 0; i < hashsize; i++)
 		TAILQ_FOREACH_SAFE(t, &ts->namehash[i], namehash_entries, tmp)
 			uma_zfree(V_pf_tag_z, t);
 
 	free(ts->namehash, M_PFHASH);
 	free(ts->taghash, M_PFHASH);
 }
 
 /*
  * Compute the name-hash bucket for "tagname" in tagset "ts": a seeded
  * murmur3 hash over at most PF_TAG_NAME_SIZE - 1 bytes of the name,
  * reduced with the tagset's mask.
  */
 static uint16_t
 tagname2hashindex(const struct pf_tagset *ts, const char *tagname)
 {
 
 	return (murmur3_32_hash(tagname,
 	    strnlen(tagname, PF_TAG_NAME_SIZE - 1), ts->seed) & ts->mask);
 }
 
 /*
  * Compute the tag-hash bucket for "tag" in tagset "ts": the tag value
  * masked with the tagset's mask.
  */
 static uint16_t
 tag2hashindex(const struct pf_tagset *ts, uint16_t tag)
 {
 
 	return (tag & ts->mask);
 }
 
 /*
  * Translate "tagname" into a tag ID within tagset "ts".
  *
  * If the name is already present its reference count is bumped and the
  * existing ID is returned.  Otherwise the lowest available ID is claimed,
  * a new entry with one reference is allocated and inserted into both the
  * name hash and the tag hash.
  *
  * Returns the tag ID (1..TAGID_MAX), or 0 when no ID is available or the
  * entry cannot be allocated.  The rules write lock must be held (asserted).
  */
 static u_int16_t
 tagname2tag(struct pf_tagset *ts, const char *tagname)
 {
 	struct pf_tagname	*tag;
 	u_int32_t		 index;
 	u_int16_t		 new_tagid;
 
 	PF_RULES_WASSERT();
 
 	index = tagname2hashindex(ts, tagname);
 	TAILQ_FOREACH(tag, &ts->namehash[index], namehash_entries)
 		if (strcmp(tagname, tag->name) == 0) {
 			tag->ref++;
 			return (tag->tag);
 		}
 
 	/*
 	 * new entry
 	 *
 	 * to avoid fragmentation, we do a linear search from the beginning
 	 * and take the first free slot we find.
 	 */
 	new_tagid = BIT_FFS(TAGID_MAX, &ts->avail);
 	/*
 	 * Tags are 1-based, with valid tags in the range [1..TAGID_MAX].
 	 * BIT_FFS() returns a 1-based bit number, with 0 indicating no bits
 	 * set.  It may also return a bit number greater than TAGID_MAX due
 	 * to rounding of the number of bits in the vector up to a multiple
 	 * of the vector word size at declaration/allocation time.
 	 */
 	if ((new_tagid == 0) || (new_tagid > TAGID_MAX))
 		return (0);
 
 	/*
 	 * Allocate the entry before claiming the ID.  The allocation is
 	 * M_NOWAIT and may fail; claiming the ID first would leave it marked
 	 * in use forever with no entry to release it through tag_unref().
 	 */
 	tag = uma_zalloc(V_pf_tag_z, M_NOWAIT);
 	if (tag == NULL)
 		return (0);
 
 	/* Mark the tag as in use.  Bits are 0-based for BIT_CLR() */
 	BIT_CLR(TAGID_MAX, new_tagid - 1, &ts->avail);
 
 	/* fill new struct pf_tagname */
 	strlcpy(tag->name, tagname, sizeof(tag->name));
 	tag->tag = new_tagid;
 	tag->ref = 1;
 
 	/* Insert into namehash */
 	TAILQ_INSERT_TAIL(&ts->namehash[index], tag, namehash_entries);
 
 	/* Insert into taghash */
 	index = tag2hashindex(ts, new_tagid);
 	TAILQ_INSERT_TAIL(&ts->taghash[index], tag, taghash_entries);
 
 	return (tag->tag);
 }
 
 /*
  * Drop one reference on tag ID "tag" in tagset "ts".  When the last
  * reference goes away the entry is removed from both hash tables, its ID
  * is marked available again and the entry is freed.  An unknown tag is
  * silently ignored.  The rules write lock must be held (asserted).
  */
 static void
 tag_unref(struct pf_tagset *ts, u_int16_t tag)
 {
 	struct pf_tagname	*entry;
 	uint16_t		 tagidx, nameidx;
 
 	PF_RULES_WASSERT();
 
 	tagidx = tag2hashindex(ts, tag);
 	TAILQ_FOREACH(entry, &ts->taghash[tagidx], taghash_entries) {
 		if (entry->tag != tag)
 			continue;
 
 		/* Found it; anything but the last reference just counts down. */
 		if (--entry->ref != 0)
 			return;
 
 		TAILQ_REMOVE(&ts->taghash[tagidx], entry, taghash_entries);
 		nameidx = tagname2hashindex(ts, entry->name);
 		TAILQ_REMOVE(&ts->namehash[nameidx], entry, namehash_entries);
 		/* Bits are 0-based for BIT_SET() */
 		BIT_SET(TAGID_MAX, tag - 1, &ts->avail);
 		uma_zfree(V_pf_tag_z, entry);
 		return;
 	}
 }
 
 /*
  * Translate "tagname" into a tag ID using the per-VNET packet tag set
  * (V_pf_tags).  See tagname2tag() for the return value.
  */
 static uint16_t
 pf_tagname2tag(const char *tagname)
 {
 	return (tagname2tag(&V_pf_tags, tagname));
 }
 
 /*
  * Open a new transaction on the inactive Ethernet ruleset of "anchor",
  * creating the ruleset if needed.  Any rules left on the inactive list are
  * freed, the inactive ticket is incremented and returned through "ticket",
  * and the ruleset is marked open.
  *
  * Returns 0 on success or EINVAL if the ruleset cannot be found or
  * created.  The rules write lock must be held (asserted).
  */
 static int
 pf_begin_eth(uint32_t *ticket, const char *anchor)
 {
 	struct pf_keth_ruleset *ruleset;
 	struct pf_keth_rule *r, *next;
 
 	PF_RULES_WASSERT();
 
 	ruleset = pf_find_or_create_keth_ruleset(anchor);
 	if (ruleset == NULL)
 		return (EINVAL);
 
 	/* Throw away whatever a previous transaction left behind. */
 	TAILQ_FOREACH_SAFE(r, ruleset->inactive.rules, entries, next) {
 		TAILQ_REMOVE(ruleset->inactive.rules, r, entries);
 		pf_free_eth_rule(r);
 	}
 
 	ruleset->inactive.ticket++;
 	*ticket = ruleset->inactive.ticket;
 	ruleset->inactive.open = 1;
 
 	return (0);
 }
 
 /*
  * Epoch callback scheduled by pf_commit_eth().  Recovers the Ethernet
  * ruleset from its embedded epoch context, switches to the ruleset's VNET
  * and, under the rules write lock, rolls back the ruleset's inactive list
  * using its current inactive ticket and its anchor path (or "" when the
  * ruleset has no anchor).
  */
 static void
 pf_rollback_eth_cb(struct epoch_context *ctx)
 {
 	struct pf_keth_ruleset *rs;
 
 	rs = __containerof(ctx, struct pf_keth_ruleset, epoch_ctx);
 
 	CURVNET_SET(rs->vnet);
 
 	PF_RULES_WLOCK();
 	pf_rollback_eth(rs->inactive.ticket,
 	    rs->anchor ? rs->anchor->path : "");
 	PF_RULES_WUNLOCK();
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Abort the transaction identified by "ticket" on the inactive Ethernet
  * ruleset of "anchor": free all inactive rules, mark the ruleset closed
  * and remove it if it has become empty.
  *
  * Returns EINVAL if the ruleset does not exist.  Returns 0 otherwise,
  * including when the ruleset is not open or the ticket does not match (in
  * which case nothing is changed).  The rules write lock must be held
  * (asserted).
  */
 static int
 pf_rollback_eth(uint32_t ticket, const char *anchor)
 {
 	struct pf_keth_ruleset *ruleset;
 	struct pf_keth_rule *r, *next;
 
 	PF_RULES_WASSERT();
 
 	ruleset = pf_find_keth_ruleset(anchor);
 	if (ruleset == NULL)
 		return (EINVAL);
 
 	/* Only the holder of the current open ticket may roll back. */
 	if (!(ruleset->inactive.open && ticket == ruleset->inactive.ticket))
 		return (0);
 
 	/* Purge old inactive rules. */
 	TAILQ_FOREACH_SAFE(r, ruleset->inactive.rules, entries, next) {
 		TAILQ_REMOVE(ruleset->inactive.rules, r, entries);
 		pf_free_eth_rule(r);
 	}
 
 	ruleset->inactive.open = 0;
 
 	pf_remove_if_empty_keth_ruleset(ruleset);
 
 	return (0);
 }
 
 /*
  * For skip criterion "i", point the skip pointer of every rule from
  * head[i] up to (but excluding) "cur" at "cur", leaving head[i] == cur.
  * Relies on variables named "head" and "cur" in the expanding scope.
  */
 #define	PF_SET_SKIP_STEPS(i)					\
 	do {							\
 		while (head[i] != cur) {			\
 			head[i]->skip[i].ptr = cur;		\
 			head[i] = TAILQ_NEXT(head[i], entries);	\
 		}						\
 	} while (0)
 
 /*
  * Compute the skip pointers for the Ethernet rule list "rules".
  *
  * For each criterion (interface, direction, protocol, source address,
  * destination address) consecutive rules with the same value form a run;
  * each rule in a run gets a skip pointer to the first rule after the run.
  * Rules in the final run of each criterion point to NULL.
  */
 static void
 pf_eth_calc_skip_steps(struct pf_keth_ruleq *rules)
 {
 	struct pf_keth_rule *cur, *prev, *head[PFE_SKIP_COUNT];
 	int i;
 
 	cur = TAILQ_FIRST(rules);
 	prev = cur;
 	for (i = 0; i < PFE_SKIP_COUNT; ++i)
 		head[i] = cur;
 	while (cur != NULL) {
 		/* A change versus the previous rule closes the current run. */
 		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
 			PF_SET_SKIP_STEPS(PFE_SKIP_IFP);
 		if (cur->direction != prev->direction)
 			PF_SET_SKIP_STEPS(PFE_SKIP_DIR);
 		if (cur->proto != prev->proto)
 			PF_SET_SKIP_STEPS(PFE_SKIP_PROTO);
 		if (memcmp(&cur->src, &prev->src, sizeof(cur->src)) != 0)
 			PF_SET_SKIP_STEPS(PFE_SKIP_SRC_ADDR);
 		if (memcmp(&cur->dst, &prev->dst, sizeof(cur->dst)) != 0)
 			PF_SET_SKIP_STEPS(PFE_SKIP_DST_ADDR);
 
 		prev = cur;
 		cur = TAILQ_NEXT(cur, entries);
 	}
 	/* cur is NULL here: terminate the trailing run of every criterion. */
 	for (i = 0; i < PFE_SKIP_COUNT; ++i)
 		PF_SET_SKIP_STEPS(i);
 }
 
 /*
  * Commit the transaction identified by "ticket" on the Ethernet ruleset of
  * "anchor": compute skip steps for the inactive rules, swap the inactive
  * and active rule lists, and schedule an epoch callback that frees the
  * previously active rules.
  *
  * Returns EINVAL if the ruleset does not exist, EBUSY if it is not open or
  * the ticket does not match, 0 on success.  The rules write lock is
  * asserted once the ticket has been validated.
  */
 static int
 pf_commit_eth(uint32_t ticket, const char *anchor)
 {
 	struct pf_keth_ruleq *rules;
 	struct pf_keth_ruleset *rs;
 
 	rs = pf_find_keth_ruleset(anchor);
 	if (rs == NULL) {
 		return (EINVAL);
 	}
 
 	if (!rs->inactive.open ||
 	    ticket != rs->inactive.ticket)
 		return (EBUSY);
 
 	PF_RULES_WASSERT();
 
 	pf_eth_calc_skip_steps(rs->inactive.rules);
 
 	/* Publish the new list with an explicit pointer store, then swap. */
 	rules = rs->active.rules;
 	ck_pr_store_ptr(&rs->active.rules, rs->inactive.rules);
 	rs->inactive.rules = rules;
 	rs->inactive.ticket = rs->active.ticket;
 
 	/* Clean up inactive rules (i.e. previously active rules), only when
 	 * we're sure they're no longer used. */
 	NET_EPOCH_CALL(pf_rollback_eth_cb, &rs->epoch_ctx);
 
 	return (0);
 }
 
 #ifdef ALTQ
 /*
  * Translate queue name "qname" into a queue ID using the per-VNET queue ID
  * tag set (V_pf_qids).  See tagname2tag() for the return value.
  */
 static uint16_t
 pf_qname2qid(const char *qname)
 {
 	return (tagname2tag(&V_pf_qids, qname));
 }
 
 /* Drop one reference on queue ID "qid" in the per-VNET queue ID tag set. */
 static void
 pf_qid_unref(uint16_t qid)
 {
 	tag_unref(&V_pf_qids, qid);
 }
 
 /*
  * Open a new ALTQ transaction.  Both inactive lists are emptied: interface
  * entries not flagged as removed have their discipline removed, queue
  * entries give back their queue ID.  If removing a discipline failed, the
  * error of the last failing/succeeding altq_remove() call is returned and
  * no ticket is issued.  Otherwise the inactive ticket is incremented,
  * stored in "*ticket", and the inactive set is marked open.
  *
  * The rules write lock must be held (asserted).
  */
 static int
 pf_begin_altq(u_int32_t *ticket)
 {
 	struct pf_altq	*q, *next;
 	int		 error;
 
 	PF_RULES_WASSERT();
 
 	error = 0;
 
 	/* Purge the old altq lists */
 	TAILQ_FOREACH_SAFE(q, V_pf_altq_ifs_inactive, entries, next) {
 		/* detach and destroy the discipline */
 		if (!(q->local_flags & PFALTQ_FLAG_IF_REMOVED))
 			error = altq_remove(q);
 		free(q, M_PFALTQ);
 	}
 	TAILQ_INIT(V_pf_altq_ifs_inactive);
 
 	TAILQ_FOREACH_SAFE(q, V_pf_altqs_inactive, entries, next) {
 		pf_qid_unref(q->qid);
 		free(q, M_PFALTQ);
 	}
 	TAILQ_INIT(V_pf_altqs_inactive);
 
 	if (error != 0)
 		return (error);
 
 	*ticket = ++V_ticket_altqs_inactive;
 	V_altqs_inactive_open = 1;
 
 	return (0);
 }
 
 /*
  * Abort the ALTQ transaction identified by "ticket".  Nothing happens (and
  * 0 is returned) unless the inactive set is open and the ticket matches.
  * Otherwise both inactive lists are emptied as in pf_begin_altq(), the
  * inactive set is marked closed and the result of the last
  * altq_remove() call (0 if none was made) is returned.
  *
  * The rules write lock must be held (asserted).
  */
 static int
 pf_rollback_altq(u_int32_t ticket)
 {
 	struct pf_altq	*q, *next;
 	int		 error;
 
 	PF_RULES_WASSERT();
 
 	if (!(V_altqs_inactive_open && ticket == V_ticket_altqs_inactive))
 		return (0);
 
 	error = 0;
 
 	/* Purge the old altq lists */
 	TAILQ_FOREACH_SAFE(q, V_pf_altq_ifs_inactive, entries, next) {
 		/* detach and destroy the discipline */
 		if (!(q->local_flags & PFALTQ_FLAG_IF_REMOVED))
 			error = altq_remove(q);
 		free(q, M_PFALTQ);
 	}
 	TAILQ_INIT(V_pf_altq_ifs_inactive);
 
 	TAILQ_FOREACH_SAFE(q, V_pf_altqs_inactive, entries, next) {
 		pf_qid_unref(q->qid);
 		free(q, M_PFALTQ);
 	}
 	TAILQ_INIT(V_pf_altqs_inactive);
 
 	V_altqs_inactive_open = 0;
 
 	return (error);
 }
 
 /*
  * Commit the ALTQ transaction identified by "ticket".
  *
  * The active and inactive list pointers are swapped, the new disciplines
  * are attached (and enabled when ALTQ is running), and the previously
  * active entries, now on the inactive lists, are detached, removed and
  * freed.
  *
  * Returns EBUSY if the inactive set is not open or the ticket does not
  * match.  A failure while attaching/enabling a new discipline is returned
  * immediately, before the old lists are purged.  Errors from the purge
  * phase are also returned, but the purge still runs to completion.
  *
  * The rules write lock must be held (asserted).
  */
 static int
 pf_commit_altq(u_int32_t ticket)
 {
 	struct pf_altqqueue	*old_altqs, *old_altq_ifs;
 	struct pf_altq		*altq, *tmp;
 	int			 err, error = 0;
 
 	PF_RULES_WASSERT();
 
 	if (!V_altqs_inactive_open || ticket != V_ticket_altqs_inactive)
 		return (EBUSY);
 
 	/* swap altqs, keep the old. */
 	old_altqs = V_pf_altqs_active;
 	old_altq_ifs = V_pf_altq_ifs_active;
 	V_pf_altqs_active = V_pf_altqs_inactive;
 	V_pf_altq_ifs_active = V_pf_altq_ifs_inactive;
 	V_pf_altqs_inactive = old_altqs;
 	V_pf_altq_ifs_inactive = old_altq_ifs;
 	V_ticket_altqs_active = V_ticket_altqs_inactive;
 
 	/* Attach new disciplines */
 	TAILQ_FOREACH(altq, V_pf_altq_ifs_active, entries) {
 		if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
 			/* attach the discipline */
 			error = altq_pfattach(altq);
 			if (error == 0 && V_pf_altq_running)
 				error = pf_enable_altq(altq);
 			if (error != 0)
 				return (error);
 		}
 	}
 
 	/* Purge the old altq lists */
 	TAILQ_FOREACH_SAFE(altq, V_pf_altq_ifs_inactive, entries, tmp) {
 		if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) == 0) {
 			/* detach and destroy the discipline */
 			if (V_pf_altq_running)
 				error = pf_disable_altq(altq);
 			/* Record a detach/remove error only if none is pending. */
 			err = altq_pfdetach(altq);
 			if (err != 0 && error == 0)
 				error = err;
 			err = altq_remove(altq);
 			if (err != 0 && error == 0)
 				error = err;
 		}
 		free(altq, M_PFALTQ);
 	}
 	TAILQ_INIT(V_pf_altq_ifs_inactive);
 	TAILQ_FOREACH_SAFE(altq, V_pf_altqs_inactive, entries, tmp) {
 		pf_qid_unref(altq->qid);
 		free(altq, M_PFALTQ);
 	}
 	TAILQ_INIT(V_pf_altqs_inactive);
 
 	V_altqs_inactive_open = 0;
 	return (error);
 }
 
 static int
 pf_enable_altq(struct pf_altq *altq)
 {
 	struct ifnet		*ifp;
 	struct tb_profile	 tb;
 	int			 error = 0;
 
 	if ((ifp = ifunit(altq->ifname)) == NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd.altq_type != ALTQT_NONE)
 		error = altq_enable(&ifp->if_snd);
 
 	/* set tokenbucket regulator */
 	if (error == 0 && ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		tb.rate = altq->ifbandwidth;
 		tb.depth = altq->tbrsize;
 		error = tbr_set(&ifp->if_snd, &tb);
 	}
 
 	return (error);
 }
 
 /*
  * Disable queueing on the interface named by "altq" and clear its token
  * bucket regulator (by setting a rate of 0).
  *
  * Returns EINVAL if the interface does not exist, 0 without doing
  * anything if the interface's current discipline is not the one recorded
  * in "altq", otherwise the first error from altq_disable()/tbr_set(), or 0.
  */
 static int
 pf_disable_altq(struct pf_altq *altq)
 {
 	struct ifnet		*ifp;
 	struct tb_profile	 tb;
 	int			 error;
 
 	ifp = ifunit(altq->ifname);
 	if (ifp == NULL)
 		return (EINVAL);
 
 	/*
 	 * when the discipline is no longer referenced, it was overridden
 	 * by a new one.  if so, just return.
 	 */
 	if (altq->altq_disc != ifp->if_snd.altq_disc)
 		return (0);
 
 	error = altq_disable(&ifp->if_snd);
 	if (error != 0)
 		return (error);
 
 	/* clear tokenbucket regulator */
 	tb.rate = 0;
 
 	return (tbr_set(&ifp->if_snd, &tb));
 }
 
 /*
  * Prepare the copied entry "altq" for insertion into an inactive list
  * while handling an interface event on "ifp".
  *
  * If the entry's interface does not exist, or it is "ifp" and "remove" is
  * set, the entry is only flagged PFALTQ_FLAG_IF_REMOVED and 0 is returned.
  * Otherwise the flag is cleared and altq_add() is called; a ticket that no
  * longer matches the inactive ticket turns the result into EBUSY.
  *
  * On a non-zero return "altq" has been freed and must not be used.
  */
 static int
 pf_altq_ifnet_event_add(struct ifnet *ifp, int remove, u_int32_t ticket,
     struct pf_altq *altq)
 {
 	struct ifnet	*target;
 	int		 error;
 
 	/* Deactivate the interface in question */
 	altq->local_flags &= ~PFALTQ_FLAG_IF_REMOVED;
 
 	target = ifunit(altq->ifname);
 	if (target == NULL || (remove && target == ifp)) {
 		altq->local_flags |= PFALTQ_FLAG_IF_REMOVED;
 		return (0);
 	}
 
 	error = altq_add(target, altq);
 	if (ticket != V_ticket_altqs_inactive)
 		error = EBUSY;
 	if (error != 0)
 		free(altq, M_PFALTQ);
 
 	return (error);
 }
 
 /*
  * Rebuild the ALTQ configuration in response to an event on interface
  * "ifp" ("remove" non-zero meaning the interface is going away).
  *
  * Any open userland transaction is rolled back, a new transaction is
  * started, and the active interface and queue lists are copied into the
  * inactive lists via pf_altq_ifnet_event_add().  The copy is committed on
  * success and rolled back on any error.  Interfaces whose send queue is
  * not ALTQ-ready are ignored.
  */
 void
 pf_altq_ifnet_event(struct ifnet *ifp, int remove)
 {
 	struct pf_altq	*a1, *a2, *a3;
 	u_int32_t	 ticket;
 	int		 error = 0;
 
 	/*
 	 * No need to re-evaluate the configuration for events on interfaces
 	 * that do not support ALTQ, as it's not possible for such
 	 * interfaces to be part of the configuration.
 	 */
 	if (!ALTQ_IS_READY(&ifp->if_snd))
 		return;
 
 	/* Interrupt userland queue modifications */
 	if (V_altqs_inactive_open)
 		pf_rollback_altq(V_ticket_altqs_inactive);
 
 	/* Start new altq ruleset */
 	if (pf_begin_altq(&ticket))
 		return;
 
 	/* Copy the current active set */
 	TAILQ_FOREACH(a1, V_pf_altq_ifs_active, entries) {
 		a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT);
 		if (a2 == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		bcopy(a1, a2, sizeof(struct pf_altq));
 
 		/* On error a2 has already been freed by the callee. */
 		error = pf_altq_ifnet_event_add(ifp, remove, ticket, a2);
 		if (error)
 			break;
 
 		TAILQ_INSERT_TAIL(V_pf_altq_ifs_inactive, a2, entries);
 	}
 	if (error)
 		goto out;
 	TAILQ_FOREACH(a1, V_pf_altqs_active, entries) {
 		a2 = malloc(sizeof(*a2), M_PFALTQ, M_NOWAIT);
 		if (a2 == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		bcopy(a1, a2, sizeof(struct pf_altq));
 
 		/* Take a queue ID reference for the copy. */
 		if ((a2->qid = pf_qname2qid(a2->qname)) == 0) {
 			error = EBUSY;
 			free(a2, M_PFALTQ);
 			break;
 		}
 		/* Use the discipline of the copied entry for the same interface. */
 		a2->altq_disc = NULL;
 		TAILQ_FOREACH(a3, V_pf_altq_ifs_inactive, entries) {
 			if (strncmp(a3->ifname, a2->ifname,
 				IFNAMSIZ) == 0) {
 				a2->altq_disc = a3->altq_disc;
 				break;
 			}
 		}
 		error = pf_altq_ifnet_event_add(ifp, remove, ticket, a2);
 		if (error)
 			break;
 
 		TAILQ_INSERT_TAIL(V_pf_altqs_inactive, a2, entries);
 	}
 
 out:
 	if (error != 0)
 		pf_rollback_altq(ticket);
 	else
 		pf_commit_altq(ticket);
 }
 #endif /* ALTQ */
 
 /*
  * Allocate and initialize an empty rule tree.  "flags" are passed to
  * malloc(); NULL is returned if the allocation fails.
  */
 static struct pf_krule_global *
 pf_rule_tree_alloc(int flags)
 {
 	struct pf_krule_global *tree;
 
 	tree = malloc(sizeof(*tree), M_TEMP, flags);
 	if (tree != NULL)
 		RB_INIT(tree);
 
 	return (tree);
 }
 
 /*
  * Free a rule tree obtained from pf_rule_tree_alloc().  Only the tree head
  * is released; the rules themselves are not touched.
  */
 static void
 pf_rule_tree_free(struct pf_krule_global *tree)
 {
 
 	free(tree, M_TEMP);
 }
 
 /*
  * Open a new transaction on ruleset number "rs_num" of "anchor", creating
  * the ruleset if needed.  The inactive rule tree is replaced by a fresh
  * empty one, all inactive rules are unlinked, the inactive ticket is
  * incremented and returned through "ticket", and the inactive set is
  * marked open.
  *
  * Returns EINVAL for an out-of-range "rs_num" or if the ruleset cannot be
  * found or created, ENOMEM if the tree cannot be allocated, 0 on success.
  * The rules write lock must be held (asserted).
  */
 static int
 pf_begin_rules(u_int32_t *ticket, int rs_num, const char *anchor)
 {
 	struct pf_krule_global	*newtree;
 	struct pf_kruleset	*ruleset;
 	struct pf_krule		*r;
 
 	PF_RULES_WASSERT();
 
 	if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
 		return (EINVAL);
 
 	newtree = pf_rule_tree_alloc(M_NOWAIT);
 	if (newtree == NULL)
 		return (ENOMEM);
 
 	ruleset = pf_find_or_create_kruleset(anchor);
 	if (ruleset == NULL) {
 		pf_rule_tree_free(newtree);
 		return (EINVAL);
 	}
 
 	/* Swap in the fresh tree for the new transaction. */
 	pf_rule_tree_free(ruleset->rules[rs_num].inactive.tree);
 	ruleset->rules[rs_num].inactive.tree = newtree;
 
 	for (r = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr); r != NULL;
 	    r = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr)) {
 		pf_unlink_rule(ruleset->rules[rs_num].inactive.ptr, r);
 		ruleset->rules[rs_num].inactive.rcount--;
 	}
 
 	*ticket = ++ruleset->rules[rs_num].inactive.ticket;
 	ruleset->rules[rs_num].inactive.open = 1;
 
 	return (0);
 }
 
 /*
  * Abort the transaction identified by "ticket" on ruleset number "rs_num"
  * of "anchor": unlink all inactive rules and mark the inactive set closed.
  *
  * Returns EINVAL for an out-of-range "rs_num".  Returns 0 otherwise,
  * including when the ruleset does not exist, is not open, or the ticket
  * does not match (nothing is changed in those cases).  The rules write
  * lock must be held (asserted).
  */
 static int
 pf_rollback_rules(u_int32_t ticket, int rs_num, char *anchor)
 {
 	struct pf_kruleset	*ruleset;
 	struct pf_krule		*r;
 
 	PF_RULES_WASSERT();
 
 	if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
 		return (EINVAL);
 
 	ruleset = pf_find_kruleset(anchor);
 	if (ruleset == NULL)
 		return (0);
 	if (!ruleset->rules[rs_num].inactive.open ||
 	    ruleset->rules[rs_num].inactive.ticket != ticket)
 		return (0);
 
 	for (r = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr); r != NULL;
 	    r = TAILQ_FIRST(ruleset->rules[rs_num].inactive.ptr)) {
 		pf_unlink_rule(ruleset->rules[rs_num].inactive.ptr, r);
 		ruleset->rules[rs_num].inactive.rcount--;
 	}
 	ruleset->rules[rs_num].inactive.open = 0;
 
 	return (0);
 }
 
 #define PF_MD5_UPD(st, elm)						\
 		MD5Update(ctx, (u_int8_t *) &(st)->elm, sizeof((st)->elm))
 
 #define PF_MD5_UPD_STR(st, elm)						\
 		MD5Update(ctx, (u_int8_t *) (st)->elm, strlen((st)->elm))
 
 #define PF_MD5_UPD_HTONL(st, elm, stor) do {				\
 		(stor) = htonl((st)->elm);				\
 		MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int32_t));\
 } while (0)
 
 #define PF_MD5_UPD_HTONS(st, elm, stor) do {				\
 		(stor) = htons((st)->elm);				\
 		MD5Update(ctx, (u_int8_t *) &(stor), sizeof(u_int16_t));\
 } while (0)
 
 /*
  * Feed a rule address into the MD5 context "ctx": the address type, the
  * type-specific payload (interface name and flags, table name, or
  * address and mask), then both ports, the negation flag and the port
  * operator.  Other address types contribute no payload.
  */
 static void
 pf_hash_rule_addr(MD5_CTX *ctx, struct pf_rule_addr *pfr)
 {
 	PF_MD5_UPD(pfr, addr.type);
 
 	if (pfr->addr.type == PF_ADDR_DYNIFTL) {
 		PF_MD5_UPD(pfr, addr.v.ifname);
 		PF_MD5_UPD(pfr, addr.iflags);
 	} else if (pfr->addr.type == PF_ADDR_TABLE) {
 		PF_MD5_UPD(pfr, addr.v.tblname);
 	} else if (pfr->addr.type == PF_ADDR_ADDRMASK) {
 		/* XXX ignore af? */
 		PF_MD5_UPD(pfr, addr.v.a.addr.addr32);
 		PF_MD5_UPD(pfr, addr.v.a.mask.addr32);
 	}
 
 	PF_MD5_UPD(pfr, port[0]);
 	PF_MD5_UPD(pfr, port[1]);
 	PF_MD5_UPD(pfr, neg);
 	PF_MD5_UPD(pfr, port_op);
 }
 
 /*
  * Feed the identifying fields of "rule" into the running MD5 context
  * "ctx".  The context is neither initialized nor finalized here, so the
  * function can be called repeatedly to hash a sequence of rules.
  *
  * The order and encoding of the fields determine the resulting digest
  * and must not be changed casually.
  */
 static void
 pf_hash_rule_rolling(MD5_CTX *ctx, struct pf_krule *rule)
 {
 	/* Scratch storage for the byte-swapped 16- and 32-bit fields. */
 	u_int16_t x;
 	u_int32_t y;
 
 	pf_hash_rule_addr(ctx, &rule->src);
 	pf_hash_rule_addr(ctx, &rule->dst);
 	for (int i = 0; i < PF_RULE_MAX_LABEL_COUNT; i++)
 		PF_MD5_UPD_STR(rule, label[i]);
 	PF_MD5_UPD_STR(rule, ifname);
 	PF_MD5_UPD_STR(rule, match_tagname);
 	PF_MD5_UPD_HTONS(rule, match_tag, x); /* dup? */
 	PF_MD5_UPD_HTONL(rule, os_fingerprint, y);
 	PF_MD5_UPD_HTONL(rule, prob, y);
 	PF_MD5_UPD_HTONL(rule, uid.uid[0], y);
 	PF_MD5_UPD_HTONL(rule, uid.uid[1], y);
 	PF_MD5_UPD(rule, uid.op);
 	PF_MD5_UPD_HTONL(rule, gid.gid[0], y);
 	PF_MD5_UPD_HTONL(rule, gid.gid[1], y);
 	PF_MD5_UPD(rule, gid.op);
 	PF_MD5_UPD_HTONL(rule, rule_flag, y);
 	PF_MD5_UPD(rule, action);
 	PF_MD5_UPD(rule, direction);
 	PF_MD5_UPD(rule, af);
 	PF_MD5_UPD(rule, quick);
 	PF_MD5_UPD(rule, ifnot);
 	PF_MD5_UPD(rule, match_tag_not);
 	PF_MD5_UPD(rule, natpass);
 	PF_MD5_UPD(rule, keep_state);
 	PF_MD5_UPD(rule, proto);
 	PF_MD5_UPD(rule, type);
 	PF_MD5_UPD(rule, code);
 	PF_MD5_UPD(rule, flags);
 	PF_MD5_UPD(rule, flagset);
 	PF_MD5_UPD(rule, allow_opts);
 	PF_MD5_UPD(rule, rt);
 	PF_MD5_UPD(rule, tos);
 	/* The anchor path is only hashed for rules that have an anchor. */
 	if (rule->anchor != NULL)
 		PF_MD5_UPD_STR(rule, anchor->path);
 }
 
 /*
  * Compute the MD5 digest of a single rule (see pf_hash_rule_rolling() for
  * the fields covered) and store it in rule->md5sum.
  */
 static void
 pf_hash_rule(struct pf_krule *rule)
 {
 	MD5_CTX		ctx;
 
 	MD5Init(&ctx);
 	pf_hash_rule_rolling(&ctx, rule);
 	MD5Final(rule->md5sum, &ctx);
 }
 
 /*
  * Three-way comparison of two rules by their MD5 digests (md5sum), with
  * memcmp() semantics: negative, zero or positive.
  */
 static int
 pf_krule_compare(struct pf_krule *a, struct pf_krule *b)
 {
 
 	return (memcmp(a->md5sum, b->md5sum, PF_MD5_DIGEST_LENGTH));
 }
 
 /*
  * Commit the transaction identified by "ticket" on ruleset number "rs_num"
  * of "anchor": the inactive rule list, pointer array, tree and rule count
  * become the active ones, skip steps are recalculated, and the previously
  * active rules are unlinked.
  *
  * When V_pf_status.keep_counters is set and an old tree exists, each new
  * rule that is found in the old tree has the old rule's evaluation,
  * packet and byte counters added to its own.
  *
  * Returns EINVAL for an out-of-range "rs_num", EBUSY if the ruleset does
  * not exist, is not open, or the ticket does not match, the error from
  * pf_setup_pfsync_matching() for the main ruleset, and 0 on success.
  * The rules write lock must be held (asserted).
  */
 static int
 pf_commit_rules(u_int32_t ticket, int rs_num, char *anchor)
 {
 	struct pf_kruleset	*rs;
 	struct pf_krule		*rule, **old_array, *old_rule;
 	struct pf_krulequeue	*old_rules;
 	struct pf_krule_global  *old_tree;
 	int			 error;
 	u_int32_t		 old_rcount;
 
 	PF_RULES_WASSERT();
 
 	if (rs_num < 0 || rs_num >= PF_RULESET_MAX)
 		return (EINVAL);
 	rs = pf_find_kruleset(anchor);
 	if (rs == NULL || !rs->rules[rs_num].inactive.open ||
 	    ticket != rs->rules[rs_num].inactive.ticket)
 		return (EBUSY);
 
 	/* Calculate checksum for the main ruleset */
 	if (rs == &pf_main_ruleset) {
 		error = pf_setup_pfsync_matching(rs);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Swap rules, keep the old. */
 	old_rules = rs->rules[rs_num].active.ptr;
 	old_rcount = rs->rules[rs_num].active.rcount;
 	old_array = rs->rules[rs_num].active.ptr_array;
 	old_tree = rs->rules[rs_num].active.tree;
 
 	rs->rules[rs_num].active.ptr =
 	    rs->rules[rs_num].inactive.ptr;
 	rs->rules[rs_num].active.ptr_array =
 	    rs->rules[rs_num].inactive.ptr_array;
 	rs->rules[rs_num].active.tree =
 	    rs->rules[rs_num].inactive.tree;
 	rs->rules[rs_num].active.rcount =
 	    rs->rules[rs_num].inactive.rcount;
 
 	/* Attempt to preserve counter information. */
 	if (V_pf_status.keep_counters && old_tree != NULL) {
 		TAILQ_FOREACH(rule, rs->rules[rs_num].active.ptr,
 		    entries) {
 			/* Rules without a match in the old tree start fresh. */
 			old_rule = RB_FIND(pf_krule_global, old_tree, rule);
 			if (old_rule == NULL) {
 				continue;
 			}
 			pf_counter_u64_critical_enter();
 			pf_counter_u64_add_protected(&rule->evaluations,
 			    pf_counter_u64_fetch(&old_rule->evaluations));
 			pf_counter_u64_add_protected(&rule->packets[0],
 			    pf_counter_u64_fetch(&old_rule->packets[0]));
 			pf_counter_u64_add_protected(&rule->packets[1],
 			    pf_counter_u64_fetch(&old_rule->packets[1]));
 			pf_counter_u64_add_protected(&rule->bytes[0],
 			    pf_counter_u64_fetch(&old_rule->bytes[0]));
 			pf_counter_u64_add_protected(&rule->bytes[1],
 			    pf_counter_u64_fetch(&old_rule->bytes[1]));
 			pf_counter_u64_critical_exit();
 		}
 	}
 
 	rs->rules[rs_num].inactive.ptr = old_rules;
 	rs->rules[rs_num].inactive.ptr_array = old_array;
 	rs->rules[rs_num].inactive.tree = NULL; /* important for pf_ioctl_addrule */
 	rs->rules[rs_num].inactive.rcount = old_rcount;
 
 	rs->rules[rs_num].active.ticket =
 	    rs->rules[rs_num].inactive.ticket;
 	pf_calc_skip_steps(rs->rules[rs_num].active.ptr);
 
 	/* Purge the old rule list. */
 	PF_UNLNKDRULES_LOCK();
 	while ((rule = TAILQ_FIRST(old_rules)) != NULL)
 		pf_unlink_rule_locked(old_rules, rule);
 	PF_UNLNKDRULES_UNLOCK();
 	/* The old pointer array now lives in the inactive slot; release it. */
 	if (rs->rules[rs_num].inactive.ptr_array)
 		free(rs->rules[rs_num].inactive.ptr_array, M_TEMP);
 	rs->rules[rs_num].inactive.ptr_array = NULL;
 	rs->rules[rs_num].inactive.rcount = 0;
 	rs->rules[rs_num].inactive.open = 0;
 	pf_remove_if_empty_kruleset(rs);
 	free(old_tree, M_TEMP);
 
 	return (0);
 }
 
 /*
  * Prepare ruleset "rs" for pfsync rule matching.
  *
  * For every ruleset number except PF_RULESET_SCRUB, the inactive pointer
  * array is rebuilt so that it maps rule numbers to rules, and every
  * inactive rule is fed into a single rolling MD5.  The resulting digest is
  * stored in V_pf_status.pf_chksum.
  *
  * Returns 0 on success or ENOMEM if a pointer array cannot be allocated
  * (arrays rebuilt so far are kept and the checksum is left unchanged).
  */
 static int
 pf_setup_pfsync_matching(struct pf_kruleset *rs)
 {
 	MD5_CTX			 ctx;
 	struct pf_krule		*rule;
 	int			 rs_cnt;
 	u_int8_t		 digest[PF_MD5_DIGEST_LENGTH];
 
 	MD5Init(&ctx);
 	for (rs_cnt = 0; rs_cnt < PF_RULESET_MAX; rs_cnt++) {
 		/* XXX PF_RULESET_SCRUB as well? */
 		if (rs_cnt == PF_RULESET_SCRUB)
 			continue;
 
 		/* Drop any stale array before building the new one. */
 		if (rs->rules[rs_cnt].inactive.ptr_array)
 			free(rs->rules[rs_cnt].inactive.ptr_array, M_TEMP);
 		rs->rules[rs_cnt].inactive.ptr_array = NULL;
 
 		if (rs->rules[rs_cnt].inactive.rcount) {
 			rs->rules[rs_cnt].inactive.ptr_array =
 			    malloc(sizeof(caddr_t) *
 			    rs->rules[rs_cnt].inactive.rcount,
 			    M_TEMP, M_NOWAIT);
 
 			if (!rs->rules[rs_cnt].inactive.ptr_array)
 				return (ENOMEM);
 		}
 
 		TAILQ_FOREACH(rule, rs->rules[rs_cnt].inactive.ptr,
 		    entries) {
 			pf_hash_rule_rolling(&ctx, rule);
 			/* The array is indexed by rule number. */
 			(rs->rules[rs_cnt].inactive.ptr_array)[rule->nr] = rule;
 		}
 	}
 
 	MD5Final(digest, &ctx);
 	memcpy(V_pf_status.pf_chksum, digest, sizeof(V_pf_status.pf_chksum));
 	return (0);
 }
 
 /*
  * Resolve the kernel-side part of an Ethernet rule address.  Only table
  * addresses are supported: the named table is attached to "ruleset" and
  * stored in addr->p.tbl.
  *
  * Returns 0 on success, ENOMEM if the table cannot be attached, and
  * EINVAL for any other address type.
  */
 static int
 pf_eth_addr_setup(struct pf_keth_ruleset *ruleset, struct pf_addr_wrap *addr)
 {
 
 	if (addr->type != PF_ADDR_TABLE)
 		return (EINVAL);
 
 	addr->p.tbl = pfr_eth_attach_table(ruleset, addr->v.tblname);
 
 	return (addr->p.tbl == NULL ? ENOMEM : 0);
 }
 
 /*
  * Resolve the kernel-side part of a rule address.  Table addresses get
  * the named table attached to "ruleset"; dynamic interface addresses are
  * set up for address family "af".  Other address types need no setup.
  *
  * Returns 0 on success, ENOMEM if the table cannot be attached, or the
  * error from pfi_dynaddr_setup().
  */
 static int
 pf_addr_setup(struct pf_kruleset *ruleset, struct pf_addr_wrap *addr,
     sa_family_t af)
 {
 
 	if (addr->type == PF_ADDR_TABLE) {
 		addr->p.tbl = pfr_attach_table(ruleset, addr->v.tblname);
 		return (addr->p.tbl == NULL ? ENOMEM : 0);
 	}
 
 	if (addr->type == PF_ADDR_DYNIFTL)
 		return (pfi_dynaddr_setup(addr, af));
 
 	return (0);
 }
 
 /*
  * Prepare a rule address for export by dispatching to the copyout helper
  * that matches its type (dynamic interface address or table).  Other
  * address types are left untouched.
  */
 static void
 pf_addr_copyout(struct pf_addr_wrap *addr)
 {
 
 	if (addr->type == PF_ADDR_DYNIFTL)
 		pfi_dynaddr_copyout(addr);
 	else if (addr->type == PF_ADDR_TABLE)
 		pf_tbladdr_copyout(addr);
 }
 
 /*
  * Export kernel source node "in" into the userland representation "out".
  *
  * "out" is zeroed first, then the addresses, rule number (if the node has
  * a rule), byte/packet counters, state and connection counts, address
  * family and rule type are copied.  Times are converted relative to the
  * current uptime: "creation" becomes the node's age in seconds, "expire"
  * the seconds remaining until expiry (0 if already due), and the
  * connection rate count is decayed by the time elapsed since it was last
  * updated.
  */
 static void
 pf_src_node_copy(const struct pf_ksrc_node *in, struct pf_src_node *out)
 {
 	int	secs = time_uptime, diff;
 
 	bzero(out, sizeof(struct pf_src_node));
 
 	bcopy(&in->addr, &out->addr, sizeof(struct pf_addr));
 	bcopy(&in->raddr, &out->raddr, sizeof(struct pf_addr));
 
 	if (in->rule.ptr != NULL)
 		out->rule.nr = in->rule.ptr->nr;
 
 	for (int i = 0; i < 2; i++) {
 		out->bytes[i] = counter_u64_fetch(in->bytes[i]);
 		out->packets[i] = counter_u64_fetch(in->packets[i]);
 	}
 
 	out->states = in->states;
 	out->conn = in->conn;
 	out->af = in->af;
 	out->ruletype = in->ruletype;
 
 	out->creation = secs - in->creation;
 	/*
 	 * "out" was zeroed above, so the remaining lifetime has to be derived
 	 * from the kernel node's expiry time, not from out->expire.
 	 */
 	if (in->expire > secs)
 		out->expire = in->expire - secs;
 	else
 		out->expire = 0;
 
 	/*
 	 * Adjust the connection rate estimate.  As above, start from the
 	 * kernel node's count; out->conn_rate.count is still zero here.
 	 */
 	diff = secs - in->conn_rate.last;
 	if (diff >= in->conn_rate.seconds)
 		out->conn_rate.count = 0;
 	else
 		out->conn_rate.count = in->conn_rate.count -
 		    in->conn_rate.count * diff /
 		    in->conn_rate.seconds;
 }
 
 #ifdef ALTQ
 /*
  * Handle export of struct pf_kaltq to user binaries that may be using any
  * version of struct pf_altq.
  *
  * The layout version is 0 when ioc_size equals
  * sizeof(struct pfioc_altq_v0); otherwise it is read from pa->version.
  * Returns EINVAL if that version is greater than PFIOC_ALTQ_VERSION and
  * 0 after the queue 'q' has been written into the altq member of 'pa'.
  */
 static int
 pf_export_kaltq(struct pf_altq *q, struct pfioc_altq_v1 *pa, size_t ioc_size)
 {
 	u_int32_t version;
 
 	/* v0 callers are recognised purely by the size of their ioctl struct. */
 	if (ioc_size == sizeof(struct pfioc_altq_v0))
 		version = 0;
 	else
 		version = pa->version;
 
 	if (version > PFIOC_ALTQ_VERSION)
 		return (EINVAL);
 
 /*
  * Local helpers; all of them expect a variable named exported_q in scope.
  * ASSIGN copies a scalar, COPY copies the smaller of the two field sizes,
  * SATU16/SATU32 clamp a value to USHRT_MAX/UINT_MAX.
  */
 #define ASSIGN(x) exported_q->x = q->x
 #define COPY(x) \
 	bcopy(&q->x, &exported_q->x, min(sizeof(q->x), sizeof(exported_q->x)))
 #define SATU16(x) (u_int32_t)uqmin((x), USHRT_MAX)
 #define SATU32(x) (u_int32_t)uqmin((x), UINT_MAX)
 
 	switch (version) {
 	case 0: {
 		struct pf_altq_v0 *exported_q =
 		    &((struct pfioc_altq_v0 *)pa)->altq;
 
 		COPY(ifname);
 
 		ASSIGN(scheduler);
 		/*
 		 * The plain assignment of tbrsize is immediately replaced by
 		 * the saturated value on the next line.
 		 */
 		ASSIGN(tbrsize);
 		exported_q->tbrsize = SATU16(q->tbrsize);
 		exported_q->ifbandwidth = SATU32(q->ifbandwidth);
 
 		COPY(qname);
 		COPY(parent);
 		ASSIGN(parent_qid);
 		exported_q->bandwidth = SATU32(q->bandwidth);
 		ASSIGN(priority);
 		ASSIGN(local_flags);
 
 		ASSIGN(qlimit);
 		ASSIGN(flags);
 
 		/*
 		 * HFSC options are exported member by member so that the m1/m2
 		 * values can be saturated; any other scheduler's options are
 		 * copied as raw bytes.
 		 */
 		if (q->scheduler == ALTQT_HFSC) {
 #define ASSIGN_OPT(x) exported_q->pq_u.hfsc_opts.x = q->pq_u.hfsc_opts.x
 #define ASSIGN_OPT_SATU32(x) exported_q->pq_u.hfsc_opts.x = \
 			    SATU32(q->pq_u.hfsc_opts.x)
 			
 			ASSIGN_OPT_SATU32(rtsc_m1);
 			ASSIGN_OPT(rtsc_d);
 			ASSIGN_OPT_SATU32(rtsc_m2);
 
 			ASSIGN_OPT_SATU32(lssc_m1);
 			ASSIGN_OPT(lssc_d);
 			ASSIGN_OPT_SATU32(lssc_m2);
 
 			ASSIGN_OPT_SATU32(ulsc_m1);
 			ASSIGN_OPT(ulsc_d);
 			ASSIGN_OPT_SATU32(ulsc_m2);
 
 			ASSIGN_OPT(flags);
 			
 #undef ASSIGN_OPT
 #undef ASSIGN_OPT_SATU32
 		} else
 			COPY(pq_u);
 
 		ASSIGN(qid);
 		break;
 	}
 	case 1:	{
 		/* v1: every member is assigned or copied without saturation. */
 		struct pf_altq_v1 *exported_q =
 		    &((struct pfioc_altq_v1 *)pa)->altq;
 
 		COPY(ifname);
 
 		ASSIGN(scheduler);
 		ASSIGN(tbrsize);
 		ASSIGN(ifbandwidth);
 
 		COPY(qname);
 		COPY(parent);
 		ASSIGN(parent_qid);
 		ASSIGN(bandwidth);
 		ASSIGN(priority);
 		ASSIGN(local_flags);
 
 		ASSIGN(qlimit);
 		ASSIGN(flags);
 		COPY(pq_u);
 
 		ASSIGN(qid);
 		break;
 	}
 	default:
 		/*
 		 * Reached only for a version that passed the
 		 * PFIOC_ALTQ_VERSION check above but has no case here.
 		 */
 		panic("%s: unhandled struct pfioc_altq version", __func__);
 		break;
 	}
 
 #undef ASSIGN
 #undef COPY
 #undef SATU16
 #undef SATU32
 
 	return (0);
 }
 
 /*
  * Handle import to struct pf_kaltq of struct pf_altq from user binaries
  * that may be using any version of it.
  *
  * The layout version is 0 when ioc_size equals
  * sizeof(struct pfioc_altq_v0); otherwise it is read from pa->version.
  * Returns EINVAL if that version is greater than PFIOC_ALTQ_VERSION and
  * 0 after the altq member of 'pa' has been written into 'q'.
  */
 static int
 pf_import_kaltq(struct pfioc_altq_v1 *pa, struct pf_altq *q, size_t ioc_size)
 {
 	u_int32_t version;
 
 	/* v0 callers are recognised purely by the size of their ioctl struct. */
 	if (ioc_size == sizeof(struct pfioc_altq_v0))
 		version = 0;
 	else
 		version = pa->version;
 
 	if (version > PFIOC_ALTQ_VERSION)
 		return (EINVAL);
 
 /*
  * Local helpers; both expect a variable named imported_q in scope.
  * ASSIGN copies a scalar, COPY copies the smaller of the two field sizes.
  */
 #define ASSIGN(x) q->x = imported_q->x
 #define COPY(x) \
 	bcopy(&imported_q->x, &q->x, min(sizeof(imported_q->x), sizeof(q->x)))
 
 	switch (version) {
 	case 0: {
 		struct pf_altq_v0 *imported_q =
 		    &((struct pfioc_altq_v0 *)pa)->altq;
 
 		COPY(ifname);
 
 		ASSIGN(scheduler);
 		ASSIGN(tbrsize); /* 16-bit -> 32-bit */
 		ASSIGN(ifbandwidth); /* 32-bit -> 64-bit */
 
 		COPY(qname);
 		COPY(parent);
 		ASSIGN(parent_qid);
 		ASSIGN(bandwidth); /* 32-bit -> 64-bit */
 		ASSIGN(priority);
 		ASSIGN(local_flags);
 
 		ASSIGN(qlimit);
 		ASSIGN(flags);
 
 		/*
 		 * HFSC options are imported member by member because of the
 		 * width change noted below; any other scheduler's options are
 		 * copied as raw bytes.
 		 */
 		if (imported_q->scheduler == ALTQT_HFSC) {
 #define ASSIGN_OPT(x) q->pq_u.hfsc_opts.x = imported_q->pq_u.hfsc_opts.x
 
 			/*
 			 * The m1 and m2 parameters are being copied from
 			 * 32-bit to 64-bit.
 			 */
 			ASSIGN_OPT(rtsc_m1);
 			ASSIGN_OPT(rtsc_d);
 			ASSIGN_OPT(rtsc_m2);
 
 			ASSIGN_OPT(lssc_m1);
 			ASSIGN_OPT(lssc_d);
 			ASSIGN_OPT(lssc_m2);
 
 			ASSIGN_OPT(ulsc_m1);
 			ASSIGN_OPT(ulsc_d);
 			ASSIGN_OPT(ulsc_m2);
 
 			ASSIGN_OPT(flags);
 			
 #undef ASSIGN_OPT
 		} else
 			COPY(pq_u);
 
 		ASSIGN(qid);
 		break;
 	}
 	case 1: {
 		/* v1: every member is assigned or copied directly. */
 		struct pf_altq_v1 *imported_q =
 		    &((struct pfioc_altq_v1 *)pa)->altq;
 
 		COPY(ifname);
 
 		ASSIGN(scheduler);
 		ASSIGN(tbrsize);
 		ASSIGN(ifbandwidth);
 
 		COPY(qname);
 		COPY(parent);
 		ASSIGN(parent_qid);
 		ASSIGN(bandwidth);
 		ASSIGN(priority);
 		ASSIGN(local_flags);
 
 		ASSIGN(qlimit);
 		ASSIGN(flags);
 		COPY(pq_u);
 
 		ASSIGN(qid);
 		break;
 	}
 	default:	
 		/*
 		 * Reached only for a version that passed the
 		 * PFIOC_ALTQ_VERSION check above but has no case here.
 		 */
 		panic("%s: unhandled struct pfioc_altq version", __func__);
 		break;
 	}
 
 #undef ASSIGN
 #undef COPY
 
 	return (0);
 }
 
 /*
  * Return the n-th (zero-based) entry of the active ALTQ configuration,
  * counting the entries of V_pf_altq_ifs_active first and continuing with
  * those of V_pf_altqs_active.  Returns NULL when fewer than n + 1 entries
  * exist.
  */
 static struct pf_altq *
 pf_altq_get_nth_active(u_int32_t n)
 {
 	struct pf_altq		*q;
 	u_int32_t		 remaining = n;
 
 	TAILQ_FOREACH(q, V_pf_altq_ifs_active, entries) {
 		if (remaining == 0)
 			return (q);
 		remaining--;
 	}
 
 	TAILQ_FOREACH(q, V_pf_altqs_active, entries) {
 		if (remaining == 0)
 			return (q);
 		remaining--;
 	}
 
 	return (NULL);
 }
 #endif /* ALTQ */
 
 /*
  * Allocate a zeroed kernel rule, initialise its pool mutex and allocate
  * its zeroed per-CPU timestamp.  Both allocations use M_WAITOK.
  */
 struct pf_krule *
 pf_krule_alloc(void)
 {
 	struct pf_krule *krule;
 
 	krule = malloc(sizeof(*krule), M_PFRULE, M_WAITOK | M_ZERO);
 	mtx_init(&krule->rpool.mtx, "pf_krule_pool", NULL, MTX_DEF);
 	krule->timestamp = uma_zalloc_pcpu(pf_timestamp_pcpu_zone,
 	    M_WAITOK | M_ZERO);
 
 	return (krule);
 }
 
 /*
  * Release a kernel rule and everything it owns: its counters, per-CPU
  * timestamp, pool mutex and the rule memory itself.  A NULL rule is
  * ignored.
  *
  * With PF_WANT_32_TO_64_COUNTER the rule is first taken off the all-rules
  * list if it was linked there; the rules write lock is acquired for that
  * only when the caller does not already own it.
  */
 void
 pf_krule_free(struct pf_krule *rule)
 {
 #ifdef PF_WANT_32_TO_64_COUNTER
 	bool take_lock;
 #endif
 
 	if (rule == NULL)
 		return;
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 	if (rule->allrulelinked) {
 		take_lock = !PF_RULES_WOWNED();
 		if (take_lock)
 			PF_RULES_WLOCK();
 		LIST_REMOVE(rule, allrulelist);
 		V_pf_allrulecount--;
 		if (take_lock)
 			PF_RULES_WUNLOCK();
 	}
 #endif
 
 	pf_counter_u64_deinit(&rule->evaluations);
 	for (int idx = 0; idx < 2; idx++) {
 		pf_counter_u64_deinit(&rule->packets[idx]);
 		pf_counter_u64_deinit(&rule->bytes[idx]);
 	}
 
 	counter_u64_free(rule->states_cur);
 	counter_u64_free(rule->states_tot);
 	counter_u64_free(rule->src_nodes);
 
 	uma_zfree_pcpu(pf_timestamp_pcpu_zone, rule->timestamp);
 	mtx_destroy(&rule->rpool.mtx);
 
 	free(rule, M_PFRULE);
 }
 
 /*
  * Fill a userspace pool address from its kernel counterpart.  The output
  * is zeroed first; only the address wrapper and the interface name are
  * exported.
  */
 static void
 pf_kpooladdr_to_pooladdr(const struct pf_kpooladdr *kpool,
     struct pf_pooladdr *pool)
 {
 
 	bzero(pool, sizeof(struct pf_pooladdr));
 
 	strlcpy(pool->ifname, kpool->ifname, sizeof(pool->ifname));
 	bcopy(&kpool->addr, &pool->addr, sizeof(pool->addr));
 }
 
 /*
  * Fill a kernel pool address from a userspace one.  The output is zeroed
  * first, then the address wrapper and the interface name are copied.
  * Returns whatever pf_user_strcpy() reports for the interface name.
  */
 static int
 pf_pooladdr_to_kpooladdr(const struct pf_pooladdr *pool,
     struct pf_kpooladdr *kpool)
 {
 
 	bzero(kpool, sizeof(struct pf_kpooladdr));
 	bcopy(&pool->addr, &kpool->addr, sizeof(kpool->addr));
 
 	return (pf_user_strcpy(kpool->ifname, pool->ifname,
 	    sizeof(kpool->ifname)));
 }
 
 /*
  * Export a kernel address pool to the userspace layout.  The output is
  * zeroed first; key, counter, table index, proxy port range and options
  * are the only members carried over.
  */
 static void
 pf_kpool_to_pool(const struct pf_kpool *kpool, struct pf_pool *pool)
 {
 
 	bzero(pool, sizeof(struct pf_pool));
 
 	pool->opts = kpool->opts;
 	pool->tblidx = kpool->tblidx;
 	for (int idx = 0; idx < 2; idx++)
 		pool->proxy_port[idx] = kpool->proxy_port[idx];
 
 	bcopy(&kpool->counter, &pool->counter, sizeof(pool->counter));
 	bcopy(&kpool->key, &pool->key, sizeof(pool->key));
 }
 
 /*
  * Import a userspace address pool into the kernel layout.  Key, counter,
  * table index, proxy port range and options are copied; the remaining
  * members of kpool are not touched.  The static assertions guarantee that
  * the byte copies below cover identically sized members.
  */
 static void
 pf_pool_to_kpool(const struct pf_pool *pool, struct pf_kpool *kpool)
 {
 	_Static_assert(sizeof(pool->key) == sizeof(kpool->key), "");
 	_Static_assert(sizeof(pool->counter) == sizeof(kpool->counter), "");
 
 	kpool->opts = pool->opts;
 	kpool->tblidx = pool->tblidx;
 	for (int idx = 0; idx < 2; idx++)
 		kpool->proxy_port[idx] = pool->proxy_port[idx];
 
 	bcopy(&pool->counter, &kpool->counter, sizeof(kpool->counter));
 	bcopy(&pool->key, &kpool->key, sizeof(kpool->key));
 }
 
 static void
 pf_krule_to_rule(const struct pf_krule *krule, struct pf_rule *rule)
 {
 
 	bzero(rule, sizeof(*rule));
 
 	bcopy(&krule->src, &rule->src, sizeof(rule->src));
 	bcopy(&krule->dst, &rule->dst, sizeof(rule->dst));
 
 	for (int i = 0; i < PF_SKIP_COUNT; ++i) {
 		if (rule->skip[i].ptr == NULL)
 			rule->skip[i].nr = -1;
 		else
 			rule->skip[i].nr = krule->skip[i].ptr->nr;
 	}
 
 	strlcpy(rule->label, krule->label[0], sizeof(rule->label));
 	strlcpy(rule->ifname, krule->ifname, sizeof(rule->ifname));
 	strlcpy(rule->qname, krule->qname, sizeof(rule->qname));
 	strlcpy(rule->pqname, krule->pqname, sizeof(rule->pqname));
 	strlcpy(rule->tagname, krule->tagname, sizeof(rule->tagname));
 	strlcpy(rule->match_tagname, krule->match_tagname,
 	    sizeof(rule->match_tagname));
 	strlcpy(rule->overload_tblname, krule->overload_tblname,
 	    sizeof(rule->overload_tblname));
 
 	pf_kpool_to_pool(&krule->rpool, &rule->rpool);
 
 	rule->evaluations = pf_counter_u64_fetch(&krule->evaluations);
 	for (int i = 0; i < 2; i++) {
 		rule->packets[i] = pf_counter_u64_fetch(&krule->packets[i]);
 		rule->bytes[i] = pf_counter_u64_fetch(&krule->bytes[i]);
 	}
 
 	/* kif, anchor, overload_tbl are not copied over. */
 
 	rule->os_fingerprint = krule->os_fingerprint;
 
 	rule->rtableid = krule->rtableid;
 	bcopy(krule->timeout, rule->timeout, sizeof(krule->timeout));
 	rule->max_states = krule->max_states;
 	rule->max_src_nodes = krule->max_src_nodes;
 	rule->max_src_states = krule->max_src_states;
 	rule->max_src_conn = krule->max_src_conn;
 	rule->max_src_conn_rate.limit = krule->max_src_conn_rate.limit;
 	rule->max_src_conn_rate.seconds = krule->max_src_conn_rate.seconds;
 	rule->qid = krule->qid;
 	rule->pqid = krule->pqid;
 	rule->nr = krule->nr;
 	rule->prob = krule->prob;
 	rule->cuid = krule->cuid;
 	rule->cpid = krule->cpid;
 
 	rule->return_icmp = krule->return_icmp;
 	rule->return_icmp6 = krule->return_icmp6;
 	rule->max_mss = krule->max_mss;
 	rule->tag = krule->tag;
 	rule->match_tag = krule->match_tag;
 	rule->scrub_flags = krule->scrub_flags;
 
 	bcopy(&krule->uid, &rule->uid, sizeof(krule->uid));
 	bcopy(&krule->gid, &rule->gid, sizeof(krule->gid));
 
 	rule->rule_flag = krule->rule_flag;
 	rule->action = krule->action;
 	rule->direction = krule->direction;
 	rule->log = krule->log;
 	rule->logif = krule->logif;
 	rule->quick = krule->quick;
 	rule->ifnot = krule->ifnot;
 	rule->match_tag_not = krule->match_tag_not;
 	rule->natpass = krule->natpass;
 
 	rule->keep_state = krule->keep_state;
 	rule->af = krule->af;
 	rule->proto = krule->proto;
 	rule->type = krule->type;
 	rule->code = krule->code;
 	rule->flags = krule->flags;
 	rule->flagset = krule->flagset;
 	rule->min_ttl = krule->min_ttl;
 	rule->allow_opts = krule->allow_opts;
 	rule->rt = krule->rt;
 	rule->return_ttl = krule->return_ttl;
 	rule->tos = krule->tos;
 	rule->set_tos = krule->set_tos;
 	rule->anchor_relative = krule->anchor_relative;
 	rule->anchor_wildcard = krule->anchor_wildcard;
 
 	rule->flush = krule->flush;
 	rule->prio = krule->prio;
 	rule->set_prio[0] = krule->set_prio[0];
 	rule->set_prio[1] = krule->set_prio[1];
 
 	bcopy(&krule->divert, &rule->divert, sizeof(krule->divert));
 
 	rule->u_states_cur = counter_u64_fetch(krule->states_cur);
 	rule->u_states_tot = counter_u64_fetch(krule->states_tot);
 	rule->u_src_nodes = counter_u64_fetch(krule->src_nodes);
 }
 
 /*
  * Import a userspace struct pf_rule into a kernel rule.
  *
  * Rules for an address family the kernel was built without are refused
  * with EAFNOSUPPORT, and both address specifications must pass
  * pf_check_rule_addr().  String members go through pf_user_strcpy(); the
  * first failure is returned immediately and leaves krule partially filled.
  * Returns 0 on success.
  */
 static int
 pf_rule_to_krule(const struct pf_rule *rule, struct pf_krule *krule)
 {
 	int ret;
 
 /* Copy one string member, bailing out of the function on failure. */
 #define	KRULE_STR(kfield, ufield)					\
 	do {								\
 		ret = pf_user_strcpy(krule->kfield, rule->ufield,	\
 		    sizeof(rule->ufield));				\
 		if (ret != 0)						\
 			return (ret);					\
 	} while (0)
 /* Copy one scalar member that has the same name on both sides. */
 #define	KRULE_SET(field)	krule->field = rule->field
 
 #ifndef INET
 	if (rule->af == AF_INET)
 		return (EAFNOSUPPORT);
 #endif /* INET */
 #ifndef INET6
 	if (rule->af == AF_INET6)
 		return (EAFNOSUPPORT);
 #endif /* INET6 */
 
 	if ((ret = pf_check_rule_addr(&rule->src)) != 0)
 		return (ret);
 	if ((ret = pf_check_rule_addr(&rule->dst)) != 0)
 		return (ret);
 
 	bcopy(&rule->src, &krule->src, sizeof(rule->src));
 	bcopy(&rule->dst, &krule->dst, sizeof(rule->dst));
 
 	/* The single userspace label lands in the first kernel label slot. */
 	KRULE_STR(label[0], label);
 	KRULE_STR(ifname, ifname);
 	KRULE_STR(qname, qname);
 	KRULE_STR(pqname, pqname);
 	KRULE_STR(tagname, tagname);
 	KRULE_STR(match_tagname, match_tagname);
 	KRULE_STR(overload_tblname, overload_tblname);
 
 	pf_pool_to_kpool(&rule->rpool, &krule->rpool);
 
 	/* Don't allow userspace to set evaluations, packets or bytes. */
 	/* kif, anchor, overload_tbl are not copied over. */
 
 	KRULE_SET(os_fingerprint);
 
 	KRULE_SET(rtableid);
 	bcopy(rule->timeout, krule->timeout, sizeof(krule->timeout));
 	KRULE_SET(max_states);
 	KRULE_SET(max_src_nodes);
 	KRULE_SET(max_src_states);
 	KRULE_SET(max_src_conn);
 	KRULE_SET(max_src_conn_rate.limit);
 	KRULE_SET(max_src_conn_rate.seconds);
 	KRULE_SET(qid);
 	KRULE_SET(pqid);
 	KRULE_SET(nr);
 	KRULE_SET(prob);
 	KRULE_SET(cuid);
 	KRULE_SET(cpid);
 
 	KRULE_SET(return_icmp);
 	KRULE_SET(return_icmp6);
 	KRULE_SET(max_mss);
 	KRULE_SET(tag);
 	KRULE_SET(match_tag);
 	KRULE_SET(scrub_flags);
 
 	bcopy(&rule->uid, &krule->uid, sizeof(krule->uid));
 	bcopy(&rule->gid, &krule->gid, sizeof(krule->gid));
 
 	KRULE_SET(rule_flag);
 	KRULE_SET(action);
 	KRULE_SET(direction);
 	KRULE_SET(log);
 	KRULE_SET(logif);
 	KRULE_SET(quick);
 	KRULE_SET(ifnot);
 	KRULE_SET(match_tag_not);
 	KRULE_SET(natpass);
 
 	KRULE_SET(keep_state);
 	KRULE_SET(af);
 	KRULE_SET(proto);
 	KRULE_SET(type);
 	KRULE_SET(code);
 	KRULE_SET(flags);
 	KRULE_SET(flagset);
 	KRULE_SET(min_ttl);
 	KRULE_SET(allow_opts);
 	KRULE_SET(rt);
 	KRULE_SET(return_ttl);
 	KRULE_SET(tos);
 	KRULE_SET(set_tos);
 
 	KRULE_SET(flush);
 	KRULE_SET(prio);
 	KRULE_SET(set_prio[0]);
 	KRULE_SET(set_prio[1]);
 
 	bcopy(&rule->divert, &krule->divert, sizeof(krule->divert));
 
 #undef KRULE_SET
 #undef KRULE_STR
 
 	return (0);
 }
 
 static int
 pf_state_kill_to_kstate_kill(const struct pfioc_state_kill *psk,
     struct pf_kstate_kill *kill)
 {
 	int ret;
 
 	bzero(kill, sizeof(*kill));
 
 	bcopy(&psk->psk_pfcmp, &kill->psk_pfcmp, sizeof(kill->psk_pfcmp));
 	kill->psk_af = psk->psk_af;
 	kill->psk_proto = psk->psk_proto;
 	bcopy(&psk->psk_src, &kill->psk_src, sizeof(kill->psk_src));
 	bcopy(&psk->psk_dst, &kill->psk_dst, sizeof(kill->psk_dst));
 	ret = pf_user_strcpy(kill->psk_ifname, psk->psk_ifname,
 	    sizeof(kill->psk_ifname));
 	if (ret != 0)
 		return (ret);
 	ret = pf_user_strcpy(kill->psk_label, psk->psk_label,
 	    sizeof(kill->psk_label));
 	if (ret != 0)
 		return (ret);
 
 	return (0);
 }
 
 /*
  * Append 'rule' to the inactive ruleset selected by 'anchor' and the rule's
  * action.
  *
  * 'ticket' must match the inactive ticket of that ruleset and 'pool_ticket'
  * must match V_ticket_pabuf; otherwise EBUSY is returned.  The credentials
  * recorded in the rule (cuid/cpid) are taken from 'td'.
  *
  * The rule is consumed in every case: on success it is linked into the
  * ruleset, on failure it is released here through pf_free_rule() or
  * pf_krule_free().  Returns 0 on success or an errno value.
  */
 static int
 pf_ioctl_addrule(struct pf_krule *rule, uint32_t ticket,
     uint32_t pool_ticket, const char *anchor, const char *anchor_call,
     struct thread *td)
 {
 	struct pf_kruleset	*ruleset;
 	struct pf_krule		*tail;
 	struct pf_kpooladdr	*pa;
 	struct pfi_kkif		*kif = NULL;
 	int			 rs_num;
 	int			 error = 0;
 
 	/* Reject an out-of-range ICMP type before anything is allocated. */
 	if ((rule->return_icmp >> 8) > ICMP_MAXTYPE) {
 		error = EINVAL;
 		goto errout_unlocked;
 	}
 
 #define	ERROUT(x)	ERROUT_FUNCTION(errout, x)
 
 	/*
 	 * Everything that needs M_WAITOK is allocated up front, before the
 	 * locks below are taken.
 	 */
 	if (rule->ifname[0])
 		kif = pf_kkif_create(M_WAITOK);
 	pf_counter_u64_init(&rule->evaluations, M_WAITOK);
 	for (int i = 0; i < 2; i++) {
 		pf_counter_u64_init(&rule->packets[i], M_WAITOK);
 		pf_counter_u64_init(&rule->bytes[i], M_WAITOK);
 	}
 	rule->states_cur = counter_u64_alloc(M_WAITOK);
 	rule->states_tot = counter_u64_alloc(M_WAITOK);
 	rule->src_nodes = counter_u64_alloc(M_WAITOK);
 	rule->cuid = td->td_ucred->cr_ruid;
 	rule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
 	TAILQ_INIT(&rule->rpool.list);
 
 	/* Lock order: config lock first, then the rules write lock. */
 	PF_CONFIG_LOCK();
 	PF_RULES_WLOCK();
 #ifdef PF_WANT_32_TO_64_COUNTER
 	LIST_INSERT_HEAD(&V_pf_allrulelist, rule, allrulelist);
 	MPASS(!rule->allrulelinked);
 	rule->allrulelinked = true;
 	V_pf_allrulecount++;
 #endif
 	ruleset = pf_find_kruleset(anchor);
 	if (ruleset == NULL)
 		ERROUT(EINVAL);
 	rs_num = pf_get_ruleset_number(rule->action);
 	if (rs_num >= PF_RULESET_MAX)
 		ERROUT(EINVAL);
 	if (ticket != ruleset->rules[rs_num].inactive.ticket) {
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("ticket: %d != [%d]%d\n", ticket, rs_num,
 		    ruleset->rules[rs_num].inactive.ticket));
 		ERROUT(EBUSY);
 	}
 	if (pool_ticket != V_ticket_pabuf) {
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pool_ticket: %d != %d\n", pool_ticket,
 		    V_ticket_pabuf));
 		ERROUT(EBUSY);
 	}
 	/*
 	 * XXXMJG hack: there is no mechanism to ensure they started the
 	 * transaction. Ticket checked above may happen to match by accident,
 	 * even if nobody called DIOCXBEGIN, let alone this process.
 	 * Partially work around it by checking if the RB tree got allocated,
 	 * see pf_begin_rules.
 	 */
 	if (ruleset->rules[rs_num].inactive.tree == NULL) {
 		ERROUT(EINVAL);
 	}
 
 	/* The new rule is numbered one past the current last inactive rule. */
 	tail = TAILQ_LAST(ruleset->rules[rs_num].inactive.ptr,
 	    pf_krulequeue);
 	if (tail)
 		rule->nr = tail->nr + 1;
 	else
 		rule->nr = 0;
 	if (rule->ifname[0]) {
 		rule->kif = pfi_kkif_attach(kif, rule->ifname);
 		/* kif is cleared so the error path does not free it again. */
 		kif = NULL;
 		pfi_kkif_ref(rule->kif);
 	} else
 		rule->kif = NULL;
 
 	/*
 	 * From here on validation failures only record an error code; all
 	 * setup steps still run and the rule is torn down in one place below.
 	 */
 	if (rule->rtableid > 0 && rule->rtableid >= rt_numfibs)
 		error = EBUSY;
 
 #ifdef ALTQ
 	/* set queue IDs */
 	if (rule->qname[0] != 0) {
 		if ((rule->qid = pf_qname2qid(rule->qname)) == 0)
 			error = EBUSY;
 		else if (rule->pqname[0] != 0) {
 			if ((rule->pqid =
 			    pf_qname2qid(rule->pqname)) == 0)
 				error = EBUSY;
 		} else
 			rule->pqid = rule->qid;
 	}
 #endif
 	if (rule->tagname[0])
 		if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0)
 			error = EBUSY;
 	if (rule->match_tagname[0])
 		if ((rule->match_tag =
 		    pf_tagname2tag(rule->match_tagname)) == 0)
 			error = EBUSY;
 	if (rule->rt && !rule->direction)
 		error = EINVAL;
 	if (!rule->log)
 		rule->logif = 0;
 	if (rule->logif >= PFLOGIFS_MAX)
 		error = EINVAL;
 	/* Any pf_addr_setup() failure is reported as ENOMEM. */
 	if (pf_addr_setup(ruleset, &rule->src.addr, rule->af))
 		error = ENOMEM;
 	if (pf_addr_setup(ruleset, &rule->dst.addr, rule->af))
 		error = ENOMEM;
 	if (pf_kanchor_setup(rule, ruleset, anchor_call))
 		error = EINVAL;
 	if (rule->scrub_flags & PFSTATE_SETPRIO &&
 	    (rule->set_prio[0] > PF_PRIO_MAX ||
 	    rule->set_prio[1] > PF_PRIO_MAX))
 		error = EINVAL;
 	/* Attach the tables referenced by the pending pool addresses. */
 	TAILQ_FOREACH(pa, &V_pf_pabuf, entries)
 		if (pa->addr.type == PF_ADDR_TABLE) {
 			pa->addr.p.tbl = pfr_attach_table(ruleset,
 			    pa->addr.v.tblname);
 			if (pa->addr.p.tbl == NULL)
 				error = ENOMEM;
 		}
 
 	rule->overload_tbl = NULL;
 	if (rule->overload_tblname[0]) {
 		if ((rule->overload_tbl = pfr_attach_table(ruleset,
 		    rule->overload_tblname)) == NULL)
 			error = EINVAL;
 		else
 			rule->overload_tbl->pfrkt_flags |=
 			    PFR_TFLAG_ACTIVE;
 	}
 
 	/*
 	 * Move the pending pool addresses into the rule.  NAT/RDR/BINAT rules
 	 * without an anchor, and rules with rt > PF_NOPFROUTE, must end up
 	 * with a non-empty pool.
 	 */
 	pf_mv_kpool(&V_pf_pabuf, &rule->rpool.list);
 	if (((((rule->action == PF_NAT) || (rule->action == PF_RDR) ||
 	    (rule->action == PF_BINAT)) && rule->anchor == NULL) ||
 	    (rule->rt > PF_NOPFROUTE)) &&
 	    (TAILQ_FIRST(&rule->rpool.list) == NULL))
 		error = EINVAL;
 
 	if (error) {
 		/*
 		 * The rule is released here, so it is cleared to keep the
 		 * pf_krule_free() call at errout_unlocked from touching it.
 		 */
 		pf_free_rule(rule);
 		rule = NULL;
 		ERROUT(error);
 	}
 
 	rule->rpool.cur = TAILQ_FIRST(&rule->rpool.list);
 	TAILQ_INSERT_TAIL(ruleset->rules[rs_num].inactive.ptr,
 	    rule, entries);
 	ruleset->rules[rs_num].inactive.rcount++;
 
 	/*
 	 * The rules lock is dropped for hashing and the tree insert; the
 	 * config lock is still held at this point.
 	 */
 	PF_RULES_WUNLOCK();
 	pf_hash_rule(rule);
 	if (RB_INSERT(pf_krule_global, ruleset->rules[rs_num].inactive.tree, rule) != NULL) {
 		/*
 		 * RB_INSERT reported an existing entry: undo the list
 		 * insertion under the rules lock, which errout then releases.
 		 */
 		PF_RULES_WLOCK();
 		TAILQ_REMOVE(ruleset->rules[rs_num].inactive.ptr, rule, entries);
 		ruleset->rules[rs_num].inactive.rcount--;
 		pf_free_rule(rule);
 		rule = NULL;
 		ERROUT(EEXIST);
 	}
 	PF_CONFIG_UNLOCK();
 
 	return (0);
 
 #undef ERROUT
 	/* errout expects both the rules write lock and the config lock held. */
 errout:
 	PF_RULES_WUNLOCK();
 	PF_CONFIG_UNLOCK();
 errout_unlocked:
 	/* kif and rule may each be NULL here. */
 	pf_kkif_free(kif);
 	pf_krule_free(rule);
 	return (error);
 }
 
 /*
  * Report whether 'label' equals one of the labels set on 'rule'.
  *
  * Labels are scanned in order up to the first empty one.  The scan is also
  * bounded by the size of the label array, so a rule that uses every label
  * slot does not cause a read past the end of rule->label.
  */
 static bool
 pf_label_match(const struct pf_krule *rule, const char *label)
 {
 	const size_t nlabels = sizeof(rule->label) / sizeof(rule->label[0]);
 
 	for (size_t i = 0; i < nlabels && *rule->label[i] != '\0'; i++) {
 		if (strcmp(rule->label[i], label) == 0)
 			return (true);
 	}
 
 	return (false);
 }
 
 /*
  * Look up the state for 'key' in direction 'dir' and unlink it, but only
  * when pf_find_state_all() leaves its 'more' output at zero.
  *
  * Returns the number of states removed (0 or 1).  When the lookup finds a
  * state but sets 'more', that state is only unlocked and left in place.
  */
 static unsigned int
 pf_kill_matching_state(struct pf_state_key_cmp *key, int dir)
 {
 	struct pf_kstate *match;
 	int ambiguous = 0;
 
 	match = pf_find_state_all(key, dir, &ambiguous);
 	if (match != NULL && ambiguous == 0) {
 		pf_unlink_state(match);
 		return (1);
 	}
 
 	if (match != NULL)
 		PF_STATE_UNLOCK(match);
 
 	return (0);
 }
 
 /*
  * Unlink every state in the id-hash row 'ih' that matches the kill request
  * 'psk' and return the number of states removed.
  *
  * A state matches when all filters set in 'psk' agree with it: address
  * family, protocol, source/destination address and port, route-to address,
  * rule label and interface name.  With psk_kill_match set, the state found
  * for the reversed key in the opposite direction is removed too (see
  * pf_kill_matching_state()).
  */
 static int
 pf_killstates_row(struct pf_kstate_kill *psk, struct pf_idhash *ih)
 {
 	struct pf_kstate	*s;
 	struct pf_state_key	*sk;
 	struct pf_addr		*srcaddr, *dstaddr;
 	struct pf_state_key_cmp	 match_key;
 	int			 idx, killed = 0;
 	unsigned int		 dir;
 	u_int16_t		 srcport, dstport;
 	struct pfi_kkif		*kif;
 
 	/*
 	 * The scan restarts from the head of the row after every kill.
 	 * NOTE(review): presumably pf_unlink_state() drops the row lock taken
 	 * here, which is why the lock is re-acquired on each pass - confirm.
 	 */
 relock_DIOCKILLSTATES:
 	PF_HASHROW_LOCK(ih);
 	LIST_FOREACH(s, &ih->states, entry) {
 		/* For floating states look at the original kif. */
 		kif = s->kif == V_pfi_all ? s->orig_kif : s->kif;
 
 		/*
 		 * Source and destination are read from the wire key; for
 		 * outbound states the two key slots are swapped.
 		 */
 		sk = s->key[PF_SK_WIRE];
 		if (s->direction == PF_OUT) {
 			srcaddr = &sk->addr[1];
 			dstaddr = &sk->addr[0];
 			srcport = sk->port[1];
 			dstport = sk->port[0];
 		} else {
 			srcaddr = &sk->addr[0];
 			dstaddr = &sk->addr[1];
 			srcport = sk->port[0];
 			dstport = sk->port[1];
 		}
 
 		/* A zero af/proto in the request means "any". */
 		if (psk->psk_af && sk->af != psk->psk_af)
 			continue;
 
 		if (psk->psk_proto && psk->psk_proto != sk->proto)
 			continue;
 
 		if (! PF_MATCHA(psk->psk_src.neg, &psk->psk_src.addr.v.a.addr,
 		    &psk->psk_src.addr.v.a.mask, srcaddr, sk->af))
 			continue;
 
 		if (! PF_MATCHA(psk->psk_dst.neg, &psk->psk_dst.addr.v.a.addr,
 		    &psk->psk_dst.addr.v.a.mask, dstaddr, sk->af))
 			continue;
 
 		if (!  PF_MATCHA(psk->psk_rt_addr.neg,
 		    &psk->psk_rt_addr.addr.v.a.addr,
 		    &psk->psk_rt_addr.addr.v.a.mask,
 		    &s->rt_addr, sk->af))
 			continue;
 
 		/* Port filters apply only when a port operator is set. */
 		if (psk->psk_src.port_op != 0 &&
 		    ! pf_match_port(psk->psk_src.port_op,
 		    psk->psk_src.port[0], psk->psk_src.port[1], srcport))
 			continue;
 
 		if (psk->psk_dst.port_op != 0 &&
 		    ! pf_match_port(psk->psk_dst.port_op,
 		    psk->psk_dst.port[0], psk->psk_dst.port[1], dstport))
 			continue;
 
 		/* Empty label / interface name filters match everything. */
 		if (psk->psk_label[0] &&
 		    ! pf_label_match(s->rule.ptr, psk->psk_label))
 			continue;
 
 		if (psk->psk_ifname[0] && strcmp(psk->psk_ifname,
 		    kif->pfik_name))
 			continue;
 
 		if (psk->psk_kill_match) {
 			/* Create the key to find matching states, with lock
 			 * held. */
 
 			bzero(&match_key, sizeof(match_key));
 
 			/*
 			 * The counterpart is searched in the opposite
 			 * direction, using the stack key for outbound states
 			 * and the wire key otherwise.
 			 */
 			if (s->direction == PF_OUT) {
 				dir = PF_IN;
 				idx = PF_SK_STACK;
 			} else {
 				dir = PF_OUT;
 				idx = PF_SK_WIRE;
 			}
 
 			/* Addresses and ports are swapped in the lookup key. */
 			match_key.af = s->key[idx]->af;
 			match_key.proto = s->key[idx]->proto;
 			PF_ACPY(&match_key.addr[0],
 			    &s->key[idx]->addr[1], match_key.af);
 			match_key.port[0] = s->key[idx]->port[1];
 			PF_ACPY(&match_key.addr[1],
 			    &s->key[idx]->addr[0], match_key.af);
 			match_key.port[1] = s->key[idx]->port[0];
 		}
 
 		pf_unlink_state(s);
 		killed++;
 
 		/* match_key and dir are only set when psk_kill_match is set. */
 		if (psk->psk_kill_match)
 			killed += pf_kill_matching_state(&match_key, dir);
 
 		goto relock_DIOCKILLSTATES;
 	}
 	/* Reached only when a full pass over the row killed nothing. */
 	PF_HASHROW_UNLOCK(ih);
 
 	return (killed);
 }
 
 static int
 pfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 {
 	int			 error = 0;
 	PF_RULES_RLOCK_TRACKER;
 
 #define	ERROUT_IOCTL(target, x)					\
     do {								\
 	    error = (x);						\
 	    SDT_PROBE3(pf, ioctl, ioctl, error, cmd, error, __LINE__);	\
 	    goto target;						\
     } while (0)
 
 
 	/* XXX keep in sync with switch() below */
 	if (securelevel_gt(td->td_ucred, 2))
 		switch (cmd) {
 		case DIOCGETRULES:
 		case DIOCGETRULE:
 		case DIOCGETRULENV:
 		case DIOCGETADDRS:
 		case DIOCGETADDR:
 		case DIOCGETSTATE:
 		case DIOCGETSTATENV:
 		case DIOCSETSTATUSIF:
 		case DIOCGETSTATUS:
 		case DIOCGETSTATUSNV:
 		case DIOCCLRSTATUS:
 		case DIOCNATLOOK:
 		case DIOCSETDEBUG:
 		case DIOCGETSTATES:
 		case DIOCGETSTATESV2:
 		case DIOCGETTIMEOUT:
 		case DIOCCLRRULECTRS:
 		case DIOCGETLIMIT:
 		case DIOCGETALTQSV0:
 		case DIOCGETALTQSV1:
 		case DIOCGETALTQV0:
 		case DIOCGETALTQV1:
 		case DIOCGETQSTATSV0:
 		case DIOCGETQSTATSV1:
 		case DIOCGETRULESETS:
 		case DIOCGETRULESET:
 		case DIOCRGETTABLES:
 		case DIOCRGETTSTATS:
 		case DIOCRCLRTSTATS:
 		case DIOCRCLRADDRS:
 		case DIOCRADDADDRS:
 		case DIOCRDELADDRS:
 		case DIOCRSETADDRS:
 		case DIOCRGETADDRS:
 		case DIOCRGETASTATS:
 		case DIOCRCLRASTATS:
 		case DIOCRTSTADDRS:
 		case DIOCOSFPGET:
 		case DIOCGETSRCNODES:
 		case DIOCCLRSRCNODES:
 		case DIOCGETSYNCOOKIES:
 		case DIOCIGETIFACES:
 		case DIOCGIFSPEEDV0:
 		case DIOCGIFSPEEDV1:
 		case DIOCSETIFFLAG:
 		case DIOCCLRIFFLAG:
 		case DIOCGETETHRULES:
 		case DIOCGETETHRULE:
 		case DIOCGETETHRULESETS:
 		case DIOCGETETHRULESET:
 			break;
 		case DIOCRCLRTABLES:
 		case DIOCRADDTABLES:
 		case DIOCRDELTABLES:
 		case DIOCRSETTFLAGS:
 			if (((struct pfioc_table *)addr)->pfrio_flags &
 			    PFR_FLAG_DUMMY)
 				break; /* dummy operation ok */
 			return (EPERM);
 		default:
 			return (EPERM);
 		}
 
 	if (!(flags & FWRITE))
 		switch (cmd) {
 		case DIOCGETRULES:
 		case DIOCGETADDRS:
 		case DIOCGETADDR:
 		case DIOCGETSTATE:
 		case DIOCGETSTATENV:
 		case DIOCGETSTATUS:
 		case DIOCGETSTATUSNV:
 		case DIOCGETSTATES:
 		case DIOCGETSTATESV2:
 		case DIOCGETTIMEOUT:
 		case DIOCGETLIMIT:
 		case DIOCGETALTQSV0:
 		case DIOCGETALTQSV1:
 		case DIOCGETALTQV0:
 		case DIOCGETALTQV1:
 		case DIOCGETQSTATSV0:
 		case DIOCGETQSTATSV1:
 		case DIOCGETRULESETS:
 		case DIOCGETRULESET:
 		case DIOCNATLOOK:
 		case DIOCRGETTABLES:
 		case DIOCRGETTSTATS:
 		case DIOCRGETADDRS:
 		case DIOCRGETASTATS:
 		case DIOCRTSTADDRS:
 		case DIOCOSFPGET:
 		case DIOCGETSRCNODES:
 		case DIOCGETSYNCOOKIES:
 		case DIOCIGETIFACES:
 		case DIOCGIFSPEEDV1:
 		case DIOCGIFSPEEDV0:
 		case DIOCGETRULENV:
 		case DIOCGETETHRULES:
 		case DIOCGETETHRULE:
 		case DIOCGETETHRULESETS:
 		case DIOCGETETHRULESET:
 			break;
 		case DIOCRCLRTABLES:
 		case DIOCRADDTABLES:
 		case DIOCRDELTABLES:
 		case DIOCRCLRTSTATS:
 		case DIOCRCLRADDRS:
 		case DIOCRADDADDRS:
 		case DIOCRDELADDRS:
 		case DIOCRSETADDRS:
 		case DIOCRSETTFLAGS:
 			if (((struct pfioc_table *)addr)->pfrio_flags &
 			    PFR_FLAG_DUMMY) {
 				flags |= FWRITE; /* need write lock for dummy */
 				break; /* dummy operation ok */
 			}
 			return (EACCES);
 		case DIOCGETRULE:
 			if (((struct pfioc_rule *)addr)->action ==
 			    PF_GET_CLR_CNTR)
 				return (EACCES);
 			break;
 		default:
 			return (EACCES);
 		}
 
 	CURVNET_SET(TD_TO_VNET(td));
 
 	switch (cmd) {
 	case DIOCSTART:
 		sx_xlock(&pf_ioctl_lock);
 		if (V_pf_status.running)
 			error = EEXIST;
 		else {
 			hook_pf();
 			if (! TAILQ_EMPTY(V_pf_keth->active.rules))
 				hook_pf_eth();
 			V_pf_status.running = 1;
 			V_pf_status.since = time_second;
 			new_unrhdr64(&V_pf_stateid, time_second);
 
 			DPFPRINTF(PF_DEBUG_MISC, ("pf: started\n"));
 		}
 		break;
 
 	case DIOCSTOP:
 		sx_xlock(&pf_ioctl_lock);
 		if (!V_pf_status.running)
 			error = ENOENT;
 		else {
 			V_pf_status.running = 0;
 			dehook_pf();
 			dehook_pf_eth();
 			V_pf_status.since = time_second;
 			DPFPRINTF(PF_DEBUG_MISC, ("pf: stopped\n"));
 		}
 		break;
 
 	case DIOCGETETHRULES: {
 		struct pfioc_nv		*nv = (struct pfioc_nv *)addr;
 		nvlist_t		*nvl;
 		void			*packed;
 		struct pf_keth_rule	*tail;
 		struct pf_keth_ruleset	*rs;
 		u_int32_t		 ticket, nr;
 		const char		*anchor = "";
 
 		nvl = NULL;
 		packed = NULL;
 
 #define	ERROUT(x)	ERROUT_IOCTL(DIOCGETETHRULES_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		/* Copy the request in */
 		packed = malloc(nv->len, M_NVLIST, M_WAITOK);
 		if (packed == NULL)
 			ERROUT(ENOMEM);
 
 		error = copyin(nv->data, packed, nv->len);
 		if (error)
 			ERROUT(error);
 
 		nvl = nvlist_unpack(packed, nv->len, 0);
 		if (nvl == NULL)
 			ERROUT(EBADMSG);
 
 		if (! nvlist_exists_string(nvl, "anchor"))
 			ERROUT(EBADMSG);
 
 		anchor = nvlist_get_string(nvl, "anchor");
 
 		rs = pf_find_keth_ruleset(anchor);
 
 		nvlist_destroy(nvl);
 		nvl = NULL;
 		free(packed, M_NVLIST);
 		packed = NULL;
 
 		if (rs == NULL)
 			ERROUT(ENOENT);
 
 		/* Reply */
 		nvl = nvlist_create(0);
 		if (nvl == NULL)
 			ERROUT(ENOMEM);
 
 		PF_RULES_RLOCK();
 
 		ticket = rs->active.ticket;
 		tail = TAILQ_LAST(rs->active.rules, pf_keth_ruleq);
 		if (tail)
 			nr = tail->nr + 1;
 		else
 			nr = 0;
 
 		PF_RULES_RUNLOCK();
 
 		nvlist_add_number(nvl, "ticket", ticket);
 		nvlist_add_number(nvl, "nr", nr);
 
 		packed = nvlist_pack(nvl, &nv->len);
 		if (packed == NULL)
 			ERROUT(ENOMEM);
 
 		if (nv->size == 0)
 			ERROUT(0);
 		else if (nv->size < nv->len)
 			ERROUT(ENOSPC);
 
 		error = copyout(packed, nv->data, nv->len);
 
 #undef ERROUT
 DIOCGETETHRULES_error:
 		free(packed, M_NVLIST);
 		nvlist_destroy(nvl);
 		break;
 	}
 
 	case DIOCGETETHRULE: {
 		struct epoch_tracker	 et;
 		struct pfioc_nv		*nv = (struct pfioc_nv *)addr;
 		nvlist_t		*nvl = NULL;
 		void			*nvlpacked = NULL;
 		struct pf_keth_rule	*rule = NULL;
 		struct pf_keth_ruleset	*rs;
 		u_int32_t		 ticket, nr;
 		bool			 clear = false;
 		const char		*anchor;
 
 #define ERROUT(x)	ERROUT_IOCTL(DIOCGETETHRULE_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 		if (nvlpacked == NULL)
 			ERROUT(ENOMEM);
 
 		error = copyin(nv->data, nvlpacked, nv->len);
 		if (error)
 			ERROUT(error);
 
 		nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 		if (nvl == NULL)
 			ERROUT(EBADMSG);
 		if (! nvlist_exists_number(nvl, "ticket"))
 			ERROUT(EBADMSG);
 		ticket = nvlist_get_number(nvl, "ticket");
 		if (! nvlist_exists_string(nvl, "anchor"))
 			ERROUT(EBADMSG);
 		anchor = nvlist_get_string(nvl, "anchor");
 
 		if (nvlist_exists_bool(nvl, "clear"))
 			clear = nvlist_get_bool(nvl, "clear");
 
 		if (clear && !(flags & FWRITE))
 			ERROUT(EACCES);
 
 		if (! nvlist_exists_number(nvl, "nr"))
 			ERROUT(EBADMSG);
 		nr = nvlist_get_number(nvl, "nr");
 
 		PF_RULES_RLOCK();
 		rs = pf_find_keth_ruleset(anchor);
 		if (rs == NULL) {
 			PF_RULES_RUNLOCK();
 			ERROUT(ENOENT);
 		}
 		if (ticket != rs->active.ticket) {
 			PF_RULES_RUNLOCK();
 			ERROUT(EBUSY);
 		}
 
 		nvlist_destroy(nvl);
 		nvl = NULL;
 		free(nvlpacked, M_NVLIST);
 		nvlpacked = NULL;
 
 		rule = TAILQ_FIRST(rs->active.rules);
 		while ((rule != NULL) && (rule->nr != nr))
 			rule = TAILQ_NEXT(rule, entries);
 		if (rule == NULL) {
 			PF_RULES_RUNLOCK();
 			ERROUT(ENOENT);
 		}
 		/* Make sure rule can't go away. */
 		NET_EPOCH_ENTER(et);
 		PF_RULES_RUNLOCK();
 		nvl = pf_keth_rule_to_nveth_rule(rule);
 		if (pf_keth_anchor_nvcopyout(rs, rule, nvl))
 			ERROUT(EBUSY);
 		NET_EPOCH_EXIT(et);
 		if (nvl == NULL)
 			ERROUT(ENOMEM);
 
 		nvlpacked = nvlist_pack(nvl, &nv->len);
 		if (nvlpacked == NULL)
 			ERROUT(ENOMEM);
 
 		if (nv->size == 0)
 			ERROUT(0);
 		else if (nv->size < nv->len)
 			ERROUT(ENOSPC);
 
 		error = copyout(nvlpacked, nv->data, nv->len);
 		if (error == 0 && clear) {
 			counter_u64_zero(rule->evaluations);
 			for (int i = 0; i < 2; i++) {
 				counter_u64_zero(rule->packets[i]);
 				counter_u64_zero(rule->bytes[i]);
 			}
 		}
 
 #undef ERROUT
 DIOCGETETHRULE_error:
 		free(nvlpacked, M_NVLIST);
 		nvlist_destroy(nvl);
 		break;
 	}
 
 	/*
 	 * DIOCADDETHRULE: append one Ethernet rule to the inactive ruleset of
 	 * an anchor.
 	 *
 	 * The request is a packed nvlist (struct pfioc_nv).  It must carry a
 	 * numeric "ticket" equal to the ruleset's inactive ticket; the
 	 * "anchor" and "anchor_call" strings are optional and default to "".
 	 *
 	 * Errors set here: ENOMEM (request larger than pf_ioctl_maxcount),
 	 * EBADMSG (cannot unpack, or no ticket), EINVAL (unknown anchor or
 	 * anchor setup failure), EBUSY (stale ticket, or a queue/tag lookup
 	 * returned 0), plus whatever copyin(), pf_nveth_rule_to_keth_rule()
 	 * or pf_eth_addr_setup() report.
 	 */
 	case DIOCADDETHRULE: {
 		struct pfioc_nv		*nv = (struct pfioc_nv *)addr;
 		nvlist_t		*nvl = NULL;
 		void			*nvlpacked = NULL;
 		struct pf_keth_rule	*rule = NULL, *tail = NULL;
 		struct pf_keth_ruleset	*ruleset = NULL;
 		struct pfi_kkif		*kif = NULL, *bridge_to_kif = NULL;
 		const char		*anchor = "", *anchor_call = "";
 
 #define ERROUT(x)	ERROUT_IOCTL(DIOCADDETHRULE_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		/* M_WAITOK allocations sleep instead of returning NULL. */
 		nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 
 		error = copyin(nv->data, nvlpacked, nv->len);
 		if (error)
 			ERROUT(error);
 
 		nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 		if (nvl == NULL)
 			ERROUT(EBADMSG);
 
 		if (! nvlist_exists_number(nvl, "ticket"))
 			ERROUT(EBADMSG);
 
 		/* These point into nvl, which stays alive until the exit label. */
 		if (nvlist_exists_string(nvl, "anchor"))
 			anchor = nvlist_get_string(nvl, "anchor");
 		if (nvlist_exists_string(nvl, "anchor_call"))
 			anchor_call = nvlist_get_string(nvl, "anchor_call");
 
 		ruleset = pf_find_keth_ruleset(anchor);
 		if (ruleset == NULL)
 			ERROUT(EINVAL);
 
 		if (nvlist_get_number(nvl, "ticket") !=
 		    ruleset->inactive.ticket) {
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("ticket: %d != %d\n",
 			    (u_int32_t)nvlist_get_number(nvl, "ticket"),
 			    ruleset->inactive.ticket));
 			ERROUT(EBUSY);
 		}
 
 		rule = malloc(sizeof(*rule), M_PFRULE, M_WAITOK);
 		rule->timestamp = NULL;
 
 		error = pf_nveth_rule_to_keth_rule(nvl, rule);
 		if (error != 0) {
 			/*
 			 * The exit label does not release the rule, so free
 			 * it here.  Counters, kifs and tags are only set up
 			 * further down, so a plain free() is enough.
 			 * NOTE(review): assumes the conversion helper leaves
 			 * no allocations of its own behind on failure.
 			 */
 			free(rule, M_PFRULE);
 			ERROUT(error);
 		}
 
 		/* Do every sleeping allocation before taking the rules lock. */
 		if (rule->ifname[0])
 			kif = pf_kkif_create(M_WAITOK);
 		if (rule->bridge_to_name[0])
 			bridge_to_kif = pf_kkif_create(M_WAITOK);
 		rule->evaluations = counter_u64_alloc(M_WAITOK);
 		for (int i = 0; i < 2; i++) {
 			rule->packets[i] = counter_u64_alloc(M_WAITOK);
 			rule->bytes[i] = counter_u64_alloc(M_WAITOK);
 		}
 		rule->timestamp = uma_zalloc_pcpu(pf_timestamp_pcpu_zone,
 		    M_WAITOK | M_ZERO);
 
 		PF_RULES_WLOCK();
 
 		if (rule->ifname[0]) {
 			rule->kif = pfi_kkif_attach(kif, rule->ifname);
 			pfi_kkif_ref(rule->kif);
 		} else
 			rule->kif = NULL;
 		if (rule->bridge_to_name[0]) {
 			rule->bridge_to = pfi_kkif_attach(bridge_to_kif,
 			    rule->bridge_to_name);
 			pfi_kkif_ref(rule->bridge_to);
 		} else
 			rule->bridge_to = NULL;
 
 #ifdef ALTQ
 		/* set queue IDs */
 		if (rule->qname[0] != 0) {
 			if ((rule->qid = pf_qname2qid(rule->qname)) == 0)
 				error = EBUSY;
 		}
 #endif
 		if (rule->tagname[0])
 			if ((rule->tag = pf_tagname2tag(rule->tagname)) == 0)
 				error = EBUSY;
 		if (rule->match_tagname[0])
 			if ((rule->match_tag = pf_tagname2tag(
 			    rule->match_tagname)) == 0)
 				error = EBUSY;
 
 		if (error == 0 && rule->ipdst.addr.type == PF_ADDR_TABLE)
 			error = pf_eth_addr_setup(ruleset, &rule->ipdst.addr);
 		if (error == 0 && rule->ipsrc.addr.type == PF_ADDR_TABLE)
 			error = pf_eth_addr_setup(ruleset, &rule->ipsrc.addr);
 
 		if (error) {
 			pf_free_eth_rule(rule);
 			PF_RULES_WUNLOCK();
 			ERROUT(error);
 		}
 
 		if (pf_keth_anchor_setup(rule, ruleset, anchor_call)) {
 			pf_free_eth_rule(rule);
 			PF_RULES_WUNLOCK();
 			ERROUT(EINVAL);
 		}
 
 		/* Number the new rule one past the current last rule. */
 		tail = TAILQ_LAST(ruleset->inactive.rules, pf_keth_ruleq);
 		if (tail)
 			rule->nr = tail->nr + 1;
 		else
 			rule->nr = 0;
 
 		TAILQ_INSERT_TAIL(ruleset->inactive.rules, rule, entries);
 
 		PF_RULES_WUNLOCK();
 
 #undef ERROUT
 DIOCADDETHRULE_error:
 		nvlist_destroy(nvl);
 		free(nvlpacked, M_NVLIST);
 		break;
 	}
 
 	/*
 	 * DIOCGETETHRULESETS: count the child anchors of the Ethernet ruleset
 	 * named by the request's "path" string.  The reply nvlist carries the
 	 * count as the number "nr".  For the ruleset without an anchor only
 	 * parentless entries of the global anchor tree are counted.
 	 */
 	case DIOCGETETHRULESETS: {
 		struct epoch_tracker	 et;
 		struct pfioc_nv		*nv = (struct pfioc_nv *)addr;
 		nvlist_t		*nvl = NULL;
 		void			*nvlpacked = NULL;
 		struct pf_keth_ruleset	*rs;
 		struct pf_keth_anchor	*child;
 		int			 nchildren = 0;
 
 #define ERROUT(x)	ERROUT_IOCTL(DIOCGETETHRULESETS_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		if ((nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK)) == NULL)
 			ERROUT(ENOMEM);
 
 		if ((error = copyin(nv->data, nvlpacked, nv->len)) != 0)
 			ERROUT(error);
 
 		if ((nvl = nvlist_unpack(nvlpacked, nv->len, 0)) == NULL)
 			ERROUT(EBADMSG);
 		if (!nvlist_exists_string(nvl, "path"))
 			ERROUT(EBADMSG);
 
 		NET_EPOCH_ENTER(et);
 
 		rs = pf_find_keth_ruleset(nvlist_get_string(nvl, "path"));
 		if (rs == NULL) {
 			NET_EPOCH_EXIT(et);
 			ERROUT(ENOENT);
 		}
 
 		if (rs->anchor != NULL) {
 			RB_FOREACH(child, pf_keth_anchor_node,
 			    &rs->anchor->children)
 				nchildren++;
 		} else {
 			RB_FOREACH(child, pf_keth_anchor_global,
 			    &V_pf_keth_anchors) {
 				if (child->parent == NULL)
 					nchildren++;
 			}
 		}
 
 		NET_EPOCH_EXIT(et);
 
 		/* Done with the request; reuse nvl/nvlpacked for the reply. */
 		free(nvlpacked, M_NVLIST);
 		nvlpacked = NULL;
 		nvlist_destroy(nvl);
 		nvl = NULL;
 
 		if ((nvl = nvlist_create(0)) == NULL)
 			ERROUT(ENOMEM);
 
 		nvlist_add_number(nvl, "nr", nchildren);
 
 		if ((nvlpacked = nvlist_pack(nvl, &nv->len)) == NULL)
 			ERROUT(ENOMEM);
 
 		/* A zero-sized buffer only asks for the required length. */
 		if (nv->size == 0)
 			ERROUT(0);
 		if (nv->size < nv->len)
 			ERROUT(ENOSPC);
 
 		error = copyout(nvlpacked, nv->data, nv->len);
 
 #undef ERROUT
 DIOCGETETHRULESETS_error:
 		free(nvlpacked, M_NVLIST);
 		nvlist_destroy(nvl);
 		break;
 	}
 
 	/*
 	 * DIOCGETETHRULESET: look up one child anchor of the Ethernet ruleset
 	 * named by "path", selected by its position "nr" in iteration order.
 	 * The reply nvlist carries "nr", the child's "name" and the parent's
 	 * "path" ("" when the ruleset has no anchor).
 	 *
 	 * Errors set here: ENOMEM, EBADMSG (cannot unpack, or "path"/"nr"
 	 * missing), ENOENT (no such ruleset), EBUSY (no child at that
 	 * position), ENOSPC (reply does not fit), plus copyin()/copyout()
 	 * errors.
 	 */
 	case DIOCGETETHRULESET: {
 		struct epoch_tracker	 et;
 		struct pfioc_nv		*nv = (struct pfioc_nv *)addr;
 		nvlist_t		*nvl = NULL;
 		void			*nvlpacked = NULL;
 		struct pf_keth_ruleset	*ruleset;
 		struct pf_keth_anchor	*anchor;
 		int			 nr = 0, req_nr = 0;
 		bool			 found = false;
 
 #define ERROUT(x)	ERROUT_IOCTL(DIOCGETETHRULESET_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 		if (nvlpacked == NULL)
 			ERROUT(ENOMEM);
 
 		error = copyin(nv->data, nvlpacked, nv->len);
 		if (error)
 			ERROUT(error);
 
 		nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 		if (nvl == NULL)
 			ERROUT(EBADMSG);
 		if (! nvlist_exists_string(nvl, "path"))
 			ERROUT(EBADMSG);
 		if (! nvlist_exists_number(nvl, "nr"))
 			ERROUT(EBADMSG);
 
 		req_nr = nvlist_get_number(nvl, "nr");
 
 		NET_EPOCH_ENTER(et);
 
 		if ((ruleset = pf_find_keth_ruleset(
 		    nvlist_get_string(nvl, "path"))) == NULL) {
 			NET_EPOCH_EXIT(et);
 			ERROUT(ENOENT);
 		}
 
 		/* The request is no longer needed; nvl is reused for the reply. */
 		nvlist_destroy(nvl);
 		nvl = NULL;
 		free(nvlpacked, M_NVLIST);
 		nvlpacked = NULL;
 
 		nvl = nvlist_create(0);
 		if (nvl == NULL) {
 			NET_EPOCH_EXIT(et);
 			ERROUT(ENOMEM);
 		}
 
 		/*
 		 * Walk the children until the req_nr-th one.  Because of the
 		 * post-increment, nr equals req_nr + 1 when a match is found.
 		 * At top level only parentless anchors are counted.
 		 */
 		if (ruleset->anchor == NULL) {
 			RB_FOREACH(anchor, pf_keth_anchor_global,
 			    &V_pf_keth_anchors) {
 				if (anchor->parent == NULL && nr++ == req_nr) {
 					found = true;
 					break;
 				}
 			}
 		} else {
 			RB_FOREACH(anchor, pf_keth_anchor_node,
 			     &ruleset->anchor->children) {
 				if (nr++ == req_nr) {
 					found = true;
 					break;
 				}
 			}
 		}
 
 		NET_EPOCH_EXIT(et);
 		/*
 		 * NOTE(review): anchor and ruleset are dereferenced below after
 		 * the epoch section has ended -- confirm something else keeps
 		 * them alive here.
 		 */
 		if (found) {
 			nvlist_add_number(nvl, "nr", nr);
 			nvlist_add_string(nvl, "name", anchor->name);
 			if (ruleset->anchor)
 				nvlist_add_string(nvl, "path",
 				    ruleset->anchor->path);
 			else
 				nvlist_add_string(nvl, "path", "");
 		} else {
 			ERROUT(EBUSY);
 		}
 
 		nvlpacked = nvlist_pack(nvl, &nv->len);
 		if (nvlpacked == NULL)
 			ERROUT(ENOMEM);
 
 		/* size == 0: only nv->len, set by nvlist_pack(), is reported. */
 		if (nv->size == 0)
 			ERROUT(0);
 		else if (nv->size < nv->len)
 			ERROUT(ENOSPC);
 
 		error = copyout(nvlpacked, nv->data, nv->len);
 
 #undef ERROUT
 DIOCGETETHRULESET_error:
 		free(nvlpacked, M_NVLIST);
 		nvlist_destroy(nvl);
 		break;
 	}
 
 	/*
 	 * DIOCADDRULENV: nvlist flavour of DIOCADDRULE.  The request must
 	 * hold the numbers "ticket" and "pool_ticket" and the nested nvlist
 	 * "rule"; "anchor" and "anchor_call" are optional strings.  The
 	 * converted rule is handed to pf_ioctl_addrule().
 	 */
 	case DIOCADDRULENV: {
 		struct pfioc_nv	*nv = (struct pfioc_nv *)addr;
 		nvlist_t	*req = NULL;
 		void		*packed = NULL;
 		struct pf_krule	*krule = NULL;
 		const char	*anchor = "", *anchor_call = "";
 		uint32_t	 ticket = 0, pool_ticket = 0;
 
 #define	ERROUT(x)	ERROUT_IOCTL(DIOCADDRULENV_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		packed = malloc(nv->len, M_NVLIST, M_WAITOK);
 		if ((error = copyin(nv->data, packed, nv->len)) != 0)
 			ERROUT(error);
 
 		if ((req = nvlist_unpack(packed, nv->len, 0)) == NULL)
 			ERROUT(EBADMSG);
 
 		/* Mandatory members. */
 		if (!nvlist_exists_number(req, "ticket"))
 			ERROUT(EINVAL);
 		ticket = nvlist_get_number(req, "ticket");
 
 		if (!nvlist_exists_number(req, "pool_ticket"))
 			ERROUT(EINVAL);
 		pool_ticket = nvlist_get_number(req, "pool_ticket");
 
 		if (!nvlist_exists_nvlist(req, "rule"))
 			ERROUT(EINVAL);
 
 		krule = pf_krule_alloc();
 		error = pf_nvrule_to_krule(nvlist_get_nvlist(req, "rule"),
 		    krule);
 		if (error != 0)
 			ERROUT(error);
 
 		/* Optional members; they keep pointing into req. */
 		if (nvlist_exists_string(req, "anchor"))
 			anchor = nvlist_get_string(req, "anchor");
 		if (nvlist_exists_string(req, "anchor_call"))
 			anchor_call = nvlist_get_string(req, "anchor_call");
 
 		if ((error = nvlist_error(req)) != 0)
 			ERROUT(error);
 
 		/* Frees rule on error */
 		error = pf_ioctl_addrule(krule, ticket, pool_ticket, anchor,
 		    anchor_call, td);
 		/*
 		 * The rule now belongs to pf_ioctl_addrule(); drop our pointer
 		 * so the shared cleanup below does not free it.  The cleanup
 		 * already runs with a NULL rule on the early error paths.
 		 */
 		krule = NULL;
 
 #undef ERROUT
 DIOCADDRULENV_error:
 		pf_krule_free(krule);
 		nvlist_destroy(req);
 		free(packed, M_NVLIST);
 
 		break;
 	}
 	/*
 	 * DIOCADDRULE: convert the userland struct pf_rule in the request to
 	 * a kernel rule and pass it to pf_ioctl_addrule() together with the
 	 * request's tickets and anchor strings.
 	 */
 	case DIOCADDRULE: {
 		struct pfioc_rule	*pr = (struct pfioc_rule *)addr;
 		struct pf_krule		*rule;
 
 		rule = pf_krule_alloc();
 		error = pf_rule_to_krule(&pr->rule, rule);
 		if (error != 0) {
 			pf_krule_free(rule);
 			break;
 		}
 
 		/*
 		 * Both strings come straight from userland and are used as C
 		 * strings below, so force NUL termination on each of them.
 		 */
 		pr->anchor[sizeof(pr->anchor) - 1] = 0;
 		pr->anchor_call[sizeof(pr->anchor_call) - 1] = 0;
 
 		/* Frees rule on error */
 		error = pf_ioctl_addrule(rule, pr->ticket, pr->pool_ticket,
 		    pr->anchor, pr->anchor_call, td);
 		break;
 	}
 
 	/*
 	 * DIOCGETRULES: for the ruleset selected by pr->anchor and
 	 * pr->rule.action, return the active ticket and the number one past
 	 * the last active rule (0 for an empty list).  EINVAL if the anchor
 	 * or the ruleset number is not valid.
 	 */
 	case DIOCGETRULES: {
 		struct pfioc_rule	*pr = (struct pfioc_rule *)addr;
 		struct pf_kruleset	*rs;
 		struct pf_krule		*last;
 		int			 idx;
 
 		pr->anchor[sizeof(pr->anchor) - 1] = 0;
 
 		PF_RULES_WLOCK();
 		rs = pf_find_kruleset(pr->anchor);
 		/* Only resolve the ruleset number once the anchor is known. */
 		idx = (rs != NULL) ?
 		    pf_get_ruleset_number(pr->rule.action) : PF_RULESET_MAX;
 		if (idx >= PF_RULESET_MAX) {
 			PF_RULES_WUNLOCK();
 			error = EINVAL;
 			break;
 		}
 
 		last = TAILQ_LAST(rs->rules[idx].active.ptr, pf_krulequeue);
 		pr->nr = (last != NULL) ? last->nr + 1 : 0;
 		pr->ticket = rs->rules[idx].active.ticket;
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCGETRULE: copy active rule number pr->nr of the selected ruleset
 	 * back to the caller.  The caller's ticket must match the active
 	 * ticket.  With pr->action == PF_GET_CLR_CNTR the rule's counters are
 	 * zeroed after they have been exported.
 	 */
 	case DIOCGETRULE: {
 		struct pfioc_rule	*pr = (struct pfioc_rule *)addr;
 		struct pf_kruleset	*rs;
 		struct pf_krule		*r;
 		int			 idx;
 
 		pr->anchor[sizeof(pr->anchor) - 1] = 0;
 
 		PF_RULES_WLOCK();
 		rs = pf_find_kruleset(pr->anchor);
 		idx = (rs != NULL) ?
 		    pf_get_ruleset_number(pr->rule.action) : PF_RULESET_MAX;
 		if (idx >= PF_RULESET_MAX) {
 			PF_RULES_WUNLOCK();
 			error = EINVAL;
 			break;
 		}
 		if (rs->rules[idx].active.ticket != pr->ticket) {
 			PF_RULES_WUNLOCK();
 			error = EBUSY;
 			break;
 		}
 
 		/* Linear search; r ends up NULL when no rule has that number. */
 		TAILQ_FOREACH(r, rs->rules[idx].active.ptr, entries) {
 			if (r->nr == pr->nr)
 				break;
 		}
 		if (r == NULL) {
 			PF_RULES_WUNLOCK();
 			error = EBUSY;
 			break;
 		}
 
 		pf_krule_to_rule(r, &pr->rule);
 
 		if (pf_kanchor_copyout(rs, r, pr) != 0) {
 			PF_RULES_WUNLOCK();
 			error = EBUSY;
 			break;
 		}
 		pf_addr_copyout(&pr->rule.src.addr);
 		pf_addr_copyout(&pr->rule.dst.addr);
 
 		if (pr->action == PF_GET_CLR_CNTR) {
 			int dir;
 
 			pf_counter_u64_zero(&r->evaluations);
 			for (dir = 0; dir < 2; dir++) {
 				pf_counter_u64_zero(&r->packets[dir]);
 				pf_counter_u64_zero(&r->bytes[dir]);
 			}
 			counter_u64_zero(r->states_tot);
 		}
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCGETRULENV: nvlist flavour of DIOCGETRULE.  The request must
 	 * hold the string "anchor" and the numbers "ruleset", "ticket" and
 	 * "nr"; the optional bool "clear_counter" additionally zeroes the
 	 * rule's counters and requires the descriptor to be open for writing.
 	 * The reply nvlist carries "nr" and the nested nvlist "rule", plus
 	 * whatever pf_kanchor_nvcopyout() adds.
 	 *
 	 * Every exit taken between PF_RULES_WLOCK() and the final
 	 * PF_RULES_WUNLOCK() has to drop the lock itself before ERROUT().
 	 */
 	case DIOCGETRULENV: {
 		struct pfioc_nv		*nv = (struct pfioc_nv *)addr;
 		nvlist_t		*nvrule = NULL;
 		nvlist_t		*nvl = NULL;
 		struct pf_kruleset	*ruleset;
 		struct pf_krule		*rule;
 		void			*nvlpacked = NULL;
 		int			 rs_num, nr;
 		bool			 clear_counter = false;
 
 #define	ERROUT(x)	ERROUT_IOCTL(DIOCGETRULENV_error, x)
 
 		if (nv->len > pf_ioctl_maxcount)
 			ERROUT(ENOMEM);
 
 		/* Copy the request in */
 		nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 		if (nvlpacked == NULL)
 			ERROUT(ENOMEM);
 
 		error = copyin(nv->data, nvlpacked, nv->len);
 		if (error)
 			ERROUT(error);
 
 		nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 		if (nvl == NULL)
 			ERROUT(EBADMSG);
 
 		if (! nvlist_exists_string(nvl, "anchor"))
 			ERROUT(EBADMSG);
 		if (! nvlist_exists_number(nvl, "ruleset"))
 			ERROUT(EBADMSG);
 		if (! nvlist_exists_number(nvl, "ticket"))
 			ERROUT(EBADMSG);
 		if (! nvlist_exists_number(nvl, "nr"))
 			ERROUT(EBADMSG);
 
 		if (nvlist_exists_bool(nvl, "clear_counter"))
 			clear_counter = nvlist_get_bool(nvl, "clear_counter");
 
 		/* Clearing counters modifies state, so demand write access. */
 		if (clear_counter && !(flags & FWRITE))
 			ERROUT(EACCES);
 
 		nr = nvlist_get_number(nvl, "nr");
 
 		PF_RULES_WLOCK();
 		ruleset = pf_find_kruleset(nvlist_get_string(nvl, "anchor"));
 		if (ruleset == NULL) {
 			PF_RULES_WUNLOCK();
 			ERROUT(ENOENT);
 		}
 
 		rs_num = pf_get_ruleset_number(nvlist_get_number(nvl, "ruleset"));
 		if (rs_num >= PF_RULESET_MAX) {
 			PF_RULES_WUNLOCK();
 			ERROUT(EINVAL);
 		}
 
 		if (nvlist_get_number(nvl, "ticket") !=
 		    ruleset->rules[rs_num].active.ticket) {
 			PF_RULES_WUNLOCK();
 			ERROUT(EBUSY);
 		}
 
 		if ((error = nvlist_error(nvl))) {
 			PF_RULES_WUNLOCK();
 			ERROUT(error);
 		}
 
 		/* Linear search for the rule numbered nr. */
 		rule = TAILQ_FIRST(ruleset->rules[rs_num].active.ptr);
 		while ((rule != NULL) && (rule->nr != nr))
 			rule = TAILQ_NEXT(rule, entries);
 		if (rule == NULL) {
 			PF_RULES_WUNLOCK();
 			ERROUT(EBUSY);
 		}
 
 		nvrule = pf_krule_to_nvrule(rule);
 
 		/* From here on nvl is the reply rather than the request. */
 		nvlist_destroy(nvl);
 		nvl = nvlist_create(0);
 		if (nvl == NULL) {
 			PF_RULES_WUNLOCK();
 			ERROUT(ENOMEM);
 		}
 		nvlist_add_number(nvl, "nr", nr);
 		nvlist_add_nvlist(nvl, "rule", nvrule);
 		/* nvrule is reset so the exit label does not destroy it twice. */
 		nvlist_destroy(nvrule);
 		nvrule = NULL;
 		if (pf_kanchor_nvcopyout(ruleset, rule, nvl)) {
 			PF_RULES_WUNLOCK();
 			ERROUT(EBUSY);
 		}
 
 		/* Replace the request buffer with the packed reply. */
 		free(nvlpacked, M_NVLIST);
 		nvlpacked = nvlist_pack(nvl, &nv->len);
 		if (nvlpacked == NULL) {
 			PF_RULES_WUNLOCK();
 			ERROUT(ENOMEM);
 		}
 
 		/* size == 0: only nv->len, set by nvlist_pack(), is reported. */
 		if (nv->size == 0) {
 			PF_RULES_WUNLOCK();
 			ERROUT(0);
 		}
 		else if (nv->size < nv->len) {
 			PF_RULES_WUNLOCK();
 			ERROUT(ENOSPC);
 		}
 
 		/* Note: the counters are cleared before copyout() is attempted. */
 		if (clear_counter) {
 			pf_counter_u64_zero(&rule->evaluations);
 			for (int i = 0; i < 2; i++) {
 				pf_counter_u64_zero(&rule->packets[i]);
 				pf_counter_u64_zero(&rule->bytes[i]);
 			}
 			counter_u64_zero(rule->states_tot);
 		}
 		PF_RULES_WUNLOCK();
 
 		error = copyout(nvlpacked, nv->data, nv->len);
 
 #undef ERROUT
 DIOCGETRULENV_error:
 		free(nvlpacked, M_NVLIST);
 		nvlist_destroy(nvrule);
 		nvlist_destroy(nvl);
 
 		break;
 	}
 
 	/*
 	 * DIOCCHANGERULE: edit the active rule list of one ruleset in place.
 	 *
 	 * pcr->action selects the operation (any value from
 	 * PF_CHANGE_ADD_HEAD through PF_CHANGE_GET_TICKET):
 	 *  - PF_CHANGE_GET_TICKET bumps and returns the active ticket;
 	 *  - PF_CHANGE_REMOVE unlinks rule number pcr->nr;
 	 *  - the remaining actions insert a new rule built from pcr->rule at
 	 *    the head, at the tail, or before/after rule number pcr->nr.
 	 * Afterwards the rules are renumbered, the ticket is bumped and the
 	 * skip steps are recalculated.
 	 *
 	 * Two kinds of exit exist: ERROUT() jumps to the label at the bottom,
 	 * which unlocks and releases newrule/kif; the later failure paths
 	 * release newrule through pf_free_rule() and unlock by hand.
 	 */
 	case DIOCCHANGERULE: {
 		struct pfioc_rule	*pcr = (struct pfioc_rule *)addr;
 		struct pf_kruleset	*ruleset;
 		struct pf_krule		*oldrule = NULL, *newrule = NULL;
 		struct pfi_kkif		*kif = NULL;
 		struct pf_kpooladdr	*pa;
 		u_int32_t		 nr = 0;
 		int			 rs_num;
 
 		/*
 		 * Both strings come straight from userland and are used as C
 		 * strings below, so force NUL termination on each of them.
 		 */
 		pcr->anchor[sizeof(pcr->anchor) - 1] = 0;
 		pcr->anchor_call[sizeof(pcr->anchor_call) - 1] = 0;
 
 		if (pcr->action < PF_CHANGE_ADD_HEAD ||
 		    pcr->action > PF_CHANGE_GET_TICKET) {
 			error = EINVAL;
 			break;
 		}
 		if (pcr->rule.return_icmp >> 8 > ICMP_MAXTYPE) {
 			error = EINVAL;
 			break;
 		}
 
 		/* Build the new rule and do all sleeping allocations unlocked. */
 		if (pcr->action != PF_CHANGE_REMOVE) {
 			newrule = pf_krule_alloc();
 			error = pf_rule_to_krule(&pcr->rule, newrule);
 			if (error != 0) {
 				pf_krule_free(newrule);
 				break;
 			}
 
 			if (newrule->ifname[0])
 				kif = pf_kkif_create(M_WAITOK);
 			pf_counter_u64_init(&newrule->evaluations, M_WAITOK);
 			for (int i = 0; i < 2; i++) {
 				pf_counter_u64_init(&newrule->packets[i], M_WAITOK);
 				pf_counter_u64_init(&newrule->bytes[i], M_WAITOK);
 			}
 			newrule->states_cur = counter_u64_alloc(M_WAITOK);
 			newrule->states_tot = counter_u64_alloc(M_WAITOK);
 			newrule->src_nodes = counter_u64_alloc(M_WAITOK);
 			newrule->cuid = td->td_ucred->cr_ruid;
 			newrule->cpid = td->td_proc ? td->td_proc->p_pid : 0;
 			TAILQ_INIT(&newrule->rpool.list);
 		}
 #define	ERROUT(x)	ERROUT_IOCTL(DIOCCHANGERULE_error, x)
 
 		PF_CONFIG_LOCK();
 		PF_RULES_WLOCK();
 #ifdef PF_WANT_32_TO_64_COUNTER
 		if (newrule != NULL) {
 			LIST_INSERT_HEAD(&V_pf_allrulelist, newrule, allrulelist);
 			newrule->allrulelinked = true;
 			V_pf_allrulecount++;
 		}
 #endif
 
 		/* The pool ticket only matters when a rule is being added. */
 		if (!(pcr->action == PF_CHANGE_REMOVE ||
 		    pcr->action == PF_CHANGE_GET_TICKET) &&
 		    pcr->pool_ticket != V_ticket_pabuf)
 			ERROUT(EBUSY);
 
 		ruleset = pf_find_kruleset(pcr->anchor);
 		if (ruleset == NULL)
 			ERROUT(EINVAL);
 
 		rs_num = pf_get_ruleset_number(pcr->rule.action);
 		if (rs_num >= PF_RULESET_MAX)
 			ERROUT(EINVAL);
 
 		/*
 		 * XXXMJG: there is no guarantee that the ruleset was
 		 * created by the usual route of calling DIOCXBEGIN.
 		 * As a result it is possible the rule tree will not
 		 * be allocated yet. Hack around it by doing it here.
 		 * Note it is fine to let the tree persist in case of
 		 * error as it will be freed down the road on future
 		 * updates (if need be).
 		 */
 		if (ruleset->rules[rs_num].active.tree == NULL) {
 			ruleset->rules[rs_num].active.tree = pf_rule_tree_alloc(M_NOWAIT);
 			if (ruleset->rules[rs_num].active.tree == NULL) {
 				ERROUT(ENOMEM);
 			}
 		}
 
 		if (pcr->action == PF_CHANGE_GET_TICKET) {
 			pcr->ticket = ++ruleset->rules[rs_num].active.ticket;
 			ERROUT(0);
 		} else if (pcr->ticket !=
 			    ruleset->rules[rs_num].active.ticket)
 				ERROUT(EINVAL);
 
 		if (pcr->action != PF_CHANGE_REMOVE) {
 			if (newrule->ifname[0]) {
 				newrule->kif = pfi_kkif_attach(kif,
 				    newrule->ifname);
 				/* kif was handed to pfi_kkif_attach(). */
 				kif = NULL;
 				pfi_kkif_ref(newrule->kif);
 			} else
 				newrule->kif = NULL;
 
 			/*
 			 * The checks below only record a failure in error and
 			 * carry on; the rule is released once, further down.
 			 */
 			if (newrule->rtableid > 0 &&
 			    newrule->rtableid >= rt_numfibs)
 				error = EBUSY;
 
 #ifdef ALTQ
 			/* set queue IDs */
 			if (newrule->qname[0] != 0) {
 				if ((newrule->qid =
 				    pf_qname2qid(newrule->qname)) == 0)
 					error = EBUSY;
 				else if (newrule->pqname[0] != 0) {
 					if ((newrule->pqid =
 					    pf_qname2qid(newrule->pqname)) == 0)
 						error = EBUSY;
 				} else
 					newrule->pqid = newrule->qid;
 			}
 #endif /* ALTQ */
 			if (newrule->tagname[0])
 				if ((newrule->tag =
 				    pf_tagname2tag(newrule->tagname)) == 0)
 					error = EBUSY;
 			if (newrule->match_tagname[0])
 				if ((newrule->match_tag = pf_tagname2tag(
 				    newrule->match_tagname)) == 0)
 					error = EBUSY;
 			if (newrule->rt && !newrule->direction)
 				error = EINVAL;
 			if (!newrule->log)
 				newrule->logif = 0;
 			if (newrule->logif >= PFLOGIFS_MAX)
 				error = EINVAL;
 			if (pf_addr_setup(ruleset, &newrule->src.addr, newrule->af))
 				error = ENOMEM;
 			if (pf_addr_setup(ruleset, &newrule->dst.addr, newrule->af))
 				error = ENOMEM;
 			if (pf_kanchor_setup(newrule, ruleset, pcr->anchor_call))
 				error = EINVAL;
 			TAILQ_FOREACH(pa, &V_pf_pabuf, entries)
 				if (pa->addr.type == PF_ADDR_TABLE) {
 					pa->addr.p.tbl =
 					    pfr_attach_table(ruleset,
 					    pa->addr.v.tblname);
 					if (pa->addr.p.tbl == NULL)
 						error = ENOMEM;
 				}
 
 			newrule->overload_tbl = NULL;
 			if (newrule->overload_tblname[0]) {
 				if ((newrule->overload_tbl = pfr_attach_table(
 				    ruleset, newrule->overload_tblname)) ==
 				    NULL)
 					error = EINVAL;
 				else
 					newrule->overload_tbl->pfrkt_flags |=
 					    PFR_TFLAG_ACTIVE;
 			}
 
 			/* Move the staged pool addresses onto the new rule. */
 			pf_mv_kpool(&V_pf_pabuf, &newrule->rpool.list);
 			/* NAT/RDR/BINAT and route-to rules need a pool address. */
 			if (((((newrule->action == PF_NAT) ||
 			    (newrule->action == PF_RDR) ||
 			    (newrule->action == PF_BINAT) ||
 			    (newrule->rt > PF_NOPFROUTE)) &&
 			    !newrule->anchor)) &&
 			    (TAILQ_FIRST(&newrule->rpool.list) == NULL))
 				error = EINVAL;
 
 			if (error) {
 				pf_free_rule(newrule);
 				PF_RULES_WUNLOCK();
 				PF_CONFIG_UNLOCK();
 				break;
 			}
 
 			newrule->rpool.cur = TAILQ_FIRST(&newrule->rpool.list);
 		}
 		pf_empty_kpool(&V_pf_pabuf);
 
 		/* Find the rule the operation is relative to. */
 		if (pcr->action == PF_CHANGE_ADD_HEAD)
 			oldrule = TAILQ_FIRST(
 			    ruleset->rules[rs_num].active.ptr);
 		else if (pcr->action == PF_CHANGE_ADD_TAIL)
 			oldrule = TAILQ_LAST(
 			    ruleset->rules[rs_num].active.ptr, pf_krulequeue);
 		else {
 			oldrule = TAILQ_FIRST(
 			    ruleset->rules[rs_num].active.ptr);
 			while ((oldrule != NULL) && (oldrule->nr != pcr->nr))
 				oldrule = TAILQ_NEXT(oldrule, entries);
 			if (oldrule == NULL) {
 				if (newrule != NULL)
 					pf_free_rule(newrule);
 				PF_RULES_WUNLOCK();
 				PF_CONFIG_UNLOCK();
 				error = EINVAL;
 				break;
 			}
 		}
 
 		if (pcr->action == PF_CHANGE_REMOVE) {
 			pf_unlink_rule(ruleset->rules[rs_num].active.ptr,
 			    oldrule);
 			RB_REMOVE(pf_krule_global,
 			    ruleset->rules[rs_num].active.tree, oldrule);
 			ruleset->rules[rs_num].active.rcount--;
 		} else {
 			pf_hash_rule(newrule);
 			/* A non-NULL RB_INSERT() result means a duplicate. */
 			if (RB_INSERT(pf_krule_global,
 			    ruleset->rules[rs_num].active.tree, newrule) != NULL) {
 				pf_free_rule(newrule);
 				PF_RULES_WUNLOCK();
 				PF_CONFIG_UNLOCK();
 				error = EEXIST;
 				break;
 			}
 
 			/* oldrule is NULL only when the list is empty. */
 			if (oldrule == NULL)
 				TAILQ_INSERT_TAIL(
 				    ruleset->rules[rs_num].active.ptr,
 				    newrule, entries);
 			else if (pcr->action == PF_CHANGE_ADD_HEAD ||
 			    pcr->action == PF_CHANGE_ADD_BEFORE)
 				TAILQ_INSERT_BEFORE(oldrule, newrule, entries);
 			else
 				TAILQ_INSERT_AFTER(
 				    ruleset->rules[rs_num].active.ptr,
 				    oldrule, newrule, entries);
 			ruleset->rules[rs_num].active.rcount++;
 		}
 
 		/* Renumber every rule to match its new list position. */
 		nr = 0;
 		TAILQ_FOREACH(oldrule,
 		    ruleset->rules[rs_num].active.ptr, entries)
 			oldrule->nr = nr++;
 
 		ruleset->rules[rs_num].active.ticket++;
 
 		pf_calc_skip_steps(ruleset->rules[rs_num].active.ptr);
 		pf_remove_if_empty_kruleset(ruleset);
 
 		PF_RULES_WUNLOCK();
 		PF_CONFIG_UNLOCK();
 		break;
 
 #undef ERROUT
 DIOCCHANGERULE_error:
 		PF_RULES_WUNLOCK();
 		PF_CONFIG_UNLOCK();
 		pf_krule_free(newrule);
 		pf_kkif_free(kif);
 		break;
 	}
 
 	/*
 	 * DIOCCLRSTATES: convert the request to its kernel form and clear
 	 * the matching states; the count is returned in psk_killed.
 	 */
 	case DIOCCLRSTATES: {
 		struct pfioc_state_kill *req = (struct pfioc_state_kill *)addr;
 		struct pf_kstate_kill	 kkill;
 
 		error = pf_state_kill_to_kstate_kill(req, &kkill);
 		if (error == 0)
 			req->psk_killed = pf_clear_states(&kkill);
 		break;
 	}
 
 	/* DIOCCLRSTATESNV: nvlist request, handled by pf_clearstates_nv(). */
 	case DIOCCLRSTATESNV: {
 		struct pfioc_nv	*nv = (struct pfioc_nv *)addr;
 
 		error = pf_clearstates_nv(nv);
 		break;
 	}
 
 	/*
 	 * DIOCKILLSTATES: convert the request to its kernel form and kill the
 	 * matching states; psk_killed is reset and then filled in by
 	 * pf_killstates().
 	 */
 	case DIOCKILLSTATES: {
 		struct pfioc_state_kill	*req = (struct pfioc_state_kill *)addr;
 		struct pf_kstate_kill	 kkill;
 
 		error = pf_state_kill_to_kstate_kill(req, &kkill);
 		if (error == 0) {
 			req->psk_killed = 0;
 			pf_killstates(&kkill, &req->psk_killed);
 		}
 		break;
 	}
 
 	/* DIOCKILLSTATESNV: nvlist request, handled by pf_killstates_nv(). */
 	case DIOCKILLSTATESNV: {
 		struct pfioc_nv	*nv = (struct pfioc_nv *)addr;
 
 		error = pf_killstates_nv(nv);
 		break;
 	}
 
 	/*
 	 * DIOCADDSTATE: import one state through the pfsync import hook.
 	 * EINVAL for an out-of-range timeout, EOPNOTSUPP when no hook is
 	 * installed.
 	 */
 	case DIOCADDSTATE: {
 		struct pfioc_state	*req = (struct pfioc_state *)addr;
 		struct pfsync_state	*st = &req->state;
 
 		if (st->timeout >= PFTM_MAX) {
 			error = EINVAL;
 			break;
 		}
 		if (V_pfsync_state_import_ptr == NULL) {
 			error = EOPNOTSUPP;
 			break;
 		}
 
 		PF_RULES_RLOCK();
 		error = V_pfsync_state_import_ptr(st, PFSYNC_SI_IOCTL);
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCGETSTATE: export the state identified by (id, creatorid) into
 	 * the request, or fail with ENOENT.
 	 */
 	case DIOCGETSTATE: {
 		struct pfioc_state	*req = (struct pfioc_state *)addr;
 		struct pf_kstate	*st;
 
 		st = pf_find_state_byid(req->state.id, req->state.creatorid);
 		if (st != NULL) {
 			pfsync_state_export(&req->state, st);
 			/* The lookup hands the state back locked. */
 			PF_STATE_UNLOCK(st);
 		} else {
 			error = ENOENT;
 		}
 		break;
 	}
 
 	/* DIOCGETSTATENV: nvlist request, handled by pf_getstate(). */
 	case DIOCGETSTATENV: {
 		struct pfioc_nv	*nv = (struct pfioc_nv *)addr;
 
 		error = pf_getstate(nv);
 		break;
 	}
 
 	/*
 	 * DIOCGETSTATES: copy all states out to the user buffer as an array
 	 * of struct pfsync_state.
 	 *
 	 * With ps_len <= 0 only a size estimate (current zone item count
 	 * times the record size) is returned.  Otherwise the id hash is
 	 * walked row by row: each row is exported into the kernel bounce
 	 * buffer pstore while the row lock is held and copied out after the
 	 * lock is dropped.  On return ps_len holds the number of bytes
 	 * accounted for by nr exported states.
 	 */
 	case DIOCGETSTATES: {
 		struct pfioc_states	*ps = (struct pfioc_states *)addr;
 		struct pf_kstate	*s;
 		struct pfsync_state	*pstore, *p;
 		int			 i, nr;
 		size_t			 slice_count = 16, count;
 		void			*out;
 
 		if (ps->ps_len <= 0) {
 			nr = uma_zone_get_cur(V_pf_state_z);
 			ps->ps_len = sizeof(struct pfsync_state) * nr;
 			break;
 		}
 
 		out = ps->ps_states;
 		pstore = mallocarray(slice_count,
 		    sizeof(struct pfsync_state), M_TEMP, M_WAITOK | M_ZERO);
 		nr = 0;
 
 		for (i = 0; i <= pf_hashmask; i++) {
 			struct pf_idhash *ih = &V_pf_idhash[i];
 
 /* Re-entered for the same row after the bounce buffer has been grown. */
 DIOCGETSTATES_retry:
 			p = pstore;
 
 			if (LIST_EMPTY(&ih->states))
 				continue;
 
 			PF_HASHROW_LOCK(ih);
 			/* First pass: count the linked states in this row. */
 			count = 0;
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->timeout == PFTM_UNLINKED)
 					continue;
 				count++;
 			}
 
 			/*
 			 * Bounce buffer too small: drop the row lock (the
 			 * allocation is M_WAITOK), grow to twice the need and
 			 * redo this row.
 			 */
 			if (count > slice_count) {
 				PF_HASHROW_UNLOCK(ih);
 				free(pstore, M_TEMP);
 				slice_count = count * 2;
 				pstore = mallocarray(slice_count,
 				    sizeof(struct pfsync_state), M_TEMP,
 				    M_WAITOK | M_ZERO);
 				goto DIOCGETSTATES_retry;
 			}
 
 			/* Stop once this row would overflow the user buffer. */
 			if ((nr+count) * sizeof(*p) > ps->ps_len) {
 				PF_HASHROW_UNLOCK(ih);
 				goto DIOCGETSTATES_full;
 			}
 
 			/* Second pass: export the row into the bounce buffer. */
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->timeout == PFTM_UNLINKED)
 					continue;
 
 				pfsync_state_export(p, s);
 				p++;
 				nr++;
 			}
 			PF_HASHROW_UNLOCK(ih);
 			error = copyout(pstore, out,
 			    sizeof(struct pfsync_state) * count);
 			if (error)
 				break;
 			out = ps->ps_states + nr;
 		}
 DIOCGETSTATES_full:
 		ps->ps_len = sizeof(struct pfsync_state) * nr;
 		free(pstore, M_TEMP);
 
 		break;
 	}
 
 	/*
 	 * DIOCGETSTATESV2: same walk as DIOCGETSTATES, but the records are
 	 * struct pf_state_export filled in by pf_state_export().  Requests
 	 * for a version newer than PF_STATE_VERSION fail with ENOTSUP.
 	 *
 	 * With ps_len <= 0 only a size estimate is returned.  Otherwise each
 	 * id hash row is exported into the kernel bounce buffer pstore under
 	 * the row lock and copied out once the lock is dropped.
 	 */
 	case DIOCGETSTATESV2: {
 		struct pfioc_states_v2	*ps = (struct pfioc_states_v2 *)addr;
 		struct pf_kstate	*s;
 		struct pf_state_export	*pstore, *p;
 		int i, nr;
 		size_t slice_count = 16, count;
 		void *out;
 
 		if (ps->ps_req_version > PF_STATE_VERSION) {
 			error = ENOTSUP;
 			break;
 		}
 
 		if (ps->ps_len <= 0) {
 			nr = uma_zone_get_cur(V_pf_state_z);
 			ps->ps_len = sizeof(struct pf_state_export) * nr;
 			break;
 		}
 
 		out = ps->ps_states;
 		pstore = mallocarray(slice_count,
 		    sizeof(struct pf_state_export), M_TEMP, M_WAITOK | M_ZERO);
 		nr = 0;
 
 		for (i = 0; i <= pf_hashmask; i++) {
 			struct pf_idhash *ih = &V_pf_idhash[i];
 
 /* Re-entered for the same row after the bounce buffer has been grown. */
 DIOCGETSTATESV2_retry:
 			p = pstore;
 
 			if (LIST_EMPTY(&ih->states))
 				continue;
 
 			PF_HASHROW_LOCK(ih);
 			/* First pass: count the linked states in this row. */
 			count = 0;
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->timeout == PFTM_UNLINKED)
 					continue;
 				count++;
 			}
 
 			/*
 			 * Bounce buffer too small: drop the row lock (the
 			 * allocation is M_WAITOK), grow to twice the need and
 			 * redo this row.
 			 */
 			if (count > slice_count) {
 				PF_HASHROW_UNLOCK(ih);
 				free(pstore, M_TEMP);
 				slice_count = count * 2;
 				pstore = mallocarray(slice_count,
 				    sizeof(struct pf_state_export), M_TEMP,
 				    M_WAITOK | M_ZERO);
 				goto DIOCGETSTATESV2_retry;
 			}
 
 			/* Stop once this row would overflow the user buffer. */
 			if ((nr+count) * sizeof(*p) > ps->ps_len) {
 				PF_HASHROW_UNLOCK(ih);
 				goto DIOCGETSTATESV2_full;
 			}
 
 			/* Second pass: export the row into the bounce buffer. */
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (s->timeout == PFTM_UNLINKED)
 					continue;
 
 				pf_state_export(p, s);
 				p++;
 				nr++;
 			}
 			PF_HASHROW_UNLOCK(ih);
 			error = copyout(pstore, out,
 			    sizeof(struct pf_state_export) * count);
 			if (error)
 				break;
 			out = ps->ps_states + nr;
 		}
 DIOCGETSTATESV2_full:
 		ps->ps_len = nr * sizeof(struct pf_state_export);
 		free(pstore, M_TEMP);
 
 		break;
 	}
 
 	/*
 	 * DIOCGETSTATUS: snapshot V_pf_status into the caller's struct
 	 * pf_status under the rules read lock, then let pfi_update_status()
 	 * fill in the part belonging to the status interface.
 	 */
 	case DIOCGETSTATUS: {
 		struct pf_status *st = (struct pf_status *)addr;
 		int idx;
 
 		PF_RULES_RLOCK();
 
 		/* Scalar fields. */
 		st->running = V_pf_status.running;
 		st->since = V_pf_status.since;
 		st->debug = V_pf_status.debug;
 		st->hostid = V_pf_status.hostid;
 		st->states = V_pf_status.states;
 		st->src_nodes = V_pf_status.src_nodes;
 
 		/* Counter arrays. */
 		for (idx = 0; idx < PFRES_MAX; idx++) {
 			st->counters[idx] =
 			    counter_u64_fetch(V_pf_status.counters[idx]);
 		}
 		for (idx = 0; idx < LCNT_MAX; idx++) {
 			st->lcounters[idx] =
 			    counter_u64_fetch(V_pf_status.lcounters[idx]);
 		}
 		for (idx = 0; idx < FCNT_MAX; idx++) {
 			st->fcounters[idx] =
 			    pf_counter_u64_fetch(&V_pf_status.fcounters[idx]);
 		}
 		for (idx = 0; idx < SCNT_MAX; idx++) {
 			st->scounters[idx] =
 			    counter_u64_fetch(V_pf_status.scounters[idx]);
 		}
 
 		/* Interface name and ruleset checksum. */
 		bcopy(V_pf_status.ifname, st->ifname, IFNAMSIZ);
 		bcopy(V_pf_status.pf_chksum, st->pf_chksum,
 		    PF_MD5_DIGEST_LENGTH);
 
 		pfi_update_status(st->ifname, st);
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	/* DIOCGETSTATUSNV: nvlist request, handled by pf_getstatus(). */
 	case DIOCGETSTATUSNV: {
 		struct pfioc_nv	*nv = (struct pfioc_nv *)addr;
 
 		error = pf_getstatus(nv);
 		break;
 	}
 
 	/*
 	 * DIOCSETSTATUSIF: set the interface whose statistics are reported
 	 * with the pf status; an empty name clears the setting.
 	 *
 	 * Both variants modify V_pf_status.ifname, which DIOCGETSTATUS reads
 	 * under the rules read lock and DIOCCLRSTATUS under the write lock,
 	 * so the clearing path takes the write lock as well.
 	 */
 	case DIOCSETSTATUSIF: {
 		struct pfioc_if	*pi = (struct pfioc_if *)addr;
 
 		PF_RULES_WLOCK();
 		if (pi->ifname[0] == 0)
 			bzero(V_pf_status.ifname, IFNAMSIZ);
 		else
 			error = pf_user_strcpy(V_pf_status.ifname, pi->ifname,
 			    IFNAMSIZ);
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCCLRSTATUS: zero every global status counter, restart the
 	 * "since" timestamp and, when a status interface is configured, have
 	 * pfi_update_status() handle it with a NULL status argument.
 	 */
 	case DIOCCLRSTATUS: {
 		int idx;
 
 		PF_RULES_WLOCK();
 		for (idx = 0; idx < PFRES_MAX; idx++)
 			counter_u64_zero(V_pf_status.counters[idx]);
 		for (idx = 0; idx < FCNT_MAX; idx++)
 			pf_counter_u64_zero(&V_pf_status.fcounters[idx]);
 		for (idx = 0; idx < SCNT_MAX; idx++)
 			counter_u64_zero(V_pf_status.scounters[idx]);
 		for (idx = 0; idx < KLCNT_MAX; idx++)
 			counter_u64_zero(V_pf_status.lcounters[idx]);
 		V_pf_status.since = time_second;
 		if (V_pf_status.ifname[0] != '\0')
 			pfi_update_status(V_pf_status.ifname, NULL);
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCNATLOOK: find the single state matching the given addresses,
 	 * ports, protocol and direction and return the addresses/ports stored
 	 * in its key.  EINVAL for an incomplete request, ENOENT when nothing
 	 * matches, E2BIG when more than one state matches.
 	 */
 	case DIOCNATLOOK: {
 		struct pfioc_natlook	*pnl = (struct pfioc_natlook *)addr;
 		struct pf_state_key	*skey;
 		struct pf_kstate	*st;
 		struct pf_state_key_cmp	 key;
 		int			 matches = 0, dir = pnl->direction;
 		int			 sidx, didx;
 
 		/* NATLOOK src and dst are reversed, so reverse sidx/didx */
 		if (dir == PF_IN) {
 			sidx = 1;
 			didx = 0;
 		} else {
 			sidx = 0;
 			didx = 1;
 		}
 
 		/* Protocol and addresses are required; TCP/UDP also need ports. */
 		if (!pnl->proto ||
 		    PF_AZERO(&pnl->saddr, pnl->af) ||
 		    PF_AZERO(&pnl->daddr, pnl->af) ||
 		    ((pnl->proto == IPPROTO_TCP ||
 		    pnl->proto == IPPROTO_UDP) &&
 		    (!pnl->dport || !pnl->sport))) {
 			error = EINVAL;
 			break;
 		}
 
 		bzero(&key, sizeof(key));
 		key.af = pnl->af;
 		key.proto = pnl->proto;
 		PF_ACPY(&key.addr[sidx], &pnl->saddr, pnl->af);
 		key.port[sidx] = pnl->sport;
 		PF_ACPY(&key.addr[didx], &pnl->daddr, pnl->af);
 		key.port[didx] = pnl->dport;
 
 		st = pf_find_state_all(&key, dir, &matches);
 		if (st == NULL) {
 			error = ENOENT;
 			break;
 		}
 
 		if (matches > 1) {
 			error = E2BIG;	/* more than one state */
 		} else {
 			skey = st->key[sidx];
 			PF_ACPY(&pnl->rsaddr, &skey->addr[sidx], skey->af);
 			pnl->rsport = skey->port[sidx];
 			PF_ACPY(&pnl->rdaddr, &skey->addr[didx], skey->af);
 			pnl->rdport = skey->port[didx];
 		}
 		PF_STATE_UNLOCK(st);
 		break;
 	}
 
 	/*
 	 * DIOCSETTIMEOUT: set one default timeout and return the previous
 	 * value in pt->seconds.  A PFTM_INTERVAL of 0 is raised to 1, and
 	 * shortening that interval wakes the purge thread.
 	 */
 	case DIOCSETTIMEOUT: {
 		struct pfioc_tm	*pt = (struct pfioc_tm *)addr;
 		int		 prev;
 		bool		 is_interval;
 
 		if (pt->timeout < 0 || pt->timeout >= PFTM_MAX ||
 		    pt->seconds < 0) {
 			error = EINVAL;
 			break;
 		}
 
 		is_interval = (pt->timeout == PFTM_INTERVAL);
 
 		PF_RULES_WLOCK();
 		prev = V_pf_default_rule.timeout[pt->timeout];
 		if (is_interval && pt->seconds == 0)
 			pt->seconds = 1;
 		V_pf_default_rule.timeout[pt->timeout] = pt->seconds;
 		if (is_interval && pt->seconds < prev)
 			wakeup(pf_purge_thread);
 		pt->seconds = prev;
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/* DIOCGETTIMEOUT: return one default timeout; EINVAL if out of range. */
 	case DIOCGETTIMEOUT: {
 		struct pfioc_tm	*pt = (struct pfioc_tm *)addr;
 
 		if (pt->timeout >= 0 && pt->timeout < PFTM_MAX) {
 			PF_RULES_RLOCK();
 			pt->seconds = V_pf_default_rule.timeout[pt->timeout];
 			PF_RULES_RUNLOCK();
 		} else {
 			error = EINVAL;
 		}
 		break;
 	}
 
 	/* DIOCGETLIMIT: return one pool limit; EINVAL if the index is bad. */
 	case DIOCGETLIMIT: {
 		struct pfioc_limit	*pl = (struct pfioc_limit *)addr;
 
 		if (pl->index >= 0 && pl->index < PF_LIMIT_MAX) {
 			PF_RULES_RLOCK();
 			pl->limit = V_pf_limits[pl->index].limit;
 			PF_RULES_RUNLOCK();
 		} else {
 			error = EINVAL;
 		}
 		break;
 	}
 
 	/*
 	 * DIOCSETLIMIT: apply a new limit to the UMA zone behind one pool and
 	 * return the previous limit in pl->limit.  EINVAL if the index is out
 	 * of range or the pool has no zone.
 	 */
 	case DIOCSETLIMIT: {
 		struct pfioc_limit	*pl = (struct pfioc_limit *)addr;
 		int			 prev;
 
 		PF_RULES_WLOCK();
 		if (pl->index >= 0 && pl->index < PF_LIMIT_MAX &&
 		    V_pf_limits[pl->index].zone != NULL) {
 			uma_zone_set_max(V_pf_limits[pl->index].zone,
 			    pl->limit);
 			prev = V_pf_limits[pl->index].limit;
 			V_pf_limits[pl->index].limit = pl->limit;
 			pl->limit = prev;
 		} else {
 			error = EINVAL;
 		}
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/* DIOCSETDEBUG: store the requested debug level in the pf status. */
 	case DIOCSETDEBUG: {
 		const u_int32_t	*lvl = (u_int32_t *)addr;
 
 		PF_RULES_WLOCK();
 		V_pf_status.debug = lvl[0];
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCCLRRULECTRS: zero the evaluation, packet and byte counters of
 	 * every active filter rule in the main ruleset.
 	 */
 	case DIOCCLRRULECTRS: {
 		/* obsoleted by DIOCGETRULE with action=PF_GET_CLR_CNTR */
 		struct pf_kruleset	*main_rs = &pf_main_ruleset;
 		struct pf_krule		*r;
 		int			 dir;
 
 		PF_RULES_WLOCK();
 		TAILQ_FOREACH(r,
 		    main_rs->rules[PF_RULESET_FILTER].active.ptr, entries) {
 			pf_counter_u64_zero(&r->evaluations);
 			for (dir = 0; dir < 2; dir++) {
 				pf_counter_u64_zero(&r->packets[dir]);
 				pf_counter_u64_zero(&r->bytes[dir]);
 			}
 		}
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	/*
 	 * DIOCGIFSPEEDV0 / DIOCGIFSPEEDV1: report the baud rate of the named
 	 * interface.  Both versions fill the 32-bit field, clamped to
 	 * UINT_MAX; only V1 also fills the full-width field.  EINVAL for an
 	 * empty or unknown interface name.
 	 */
 	case DIOCGIFSPEEDV0:
 	case DIOCGIFSPEEDV1: {
 		struct pf_ifspeed_v1	*psp = (struct pf_ifspeed_v1 *)addr;
 		struct pf_ifspeed_v1	 local;
 		struct ifnet		*ifp;
 
 		if (psp->ifname[0] == '\0') {
 			error = EINVAL;
 			break;
 		}
 
 		/* Look the interface up by a checked private copy of the name. */
 		error = pf_user_strcpy(local.ifname, psp->ifname, IFNAMSIZ);
 		if (error != 0)
 			break;
 
 		ifp = ifunit(local.ifname);
 		if (ifp == NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		psp->baudrate32 =
 		    (u_int32_t)uqmin(ifp->if_baudrate, UINT_MAX);
 		if (cmd == DIOCGIFSPEEDV1)
 			psp->baudrate = ifp->if_baudrate;
 		break;
 	}
 
 #ifdef ALTQ
 	case DIOCSTARTALTQ: {
 		struct pf_altq		*ifq;
 
 		/*
 		 * Enable every interface on the active list that is not
 		 * flagged as removed.  The first failure stops the walk and
 		 * leaves V_pf_altq_running unchanged.
 		 */
 		PF_RULES_WLOCK();
 		TAILQ_FOREACH(ifq, V_pf_altq_ifs_active, entries) {
 			if (ifq->local_flags & PFALTQ_FLAG_IF_REMOVED)
 				continue;
 			error = pf_enable_altq(ifq);
 			if (error != 0)
 				break;
 		}
 		if (error == 0)
 			V_pf_altq_running = 1;
 		PF_RULES_WUNLOCK();
 		DPFPRINTF(PF_DEBUG_MISC, ("altq: started\n"));
 		break;
 	}
 
 	case DIOCSTOPALTQ: {
 		struct pf_altq		*ifq;
 
 		/*
 		 * Disable every interface on the active list that is not
 		 * flagged as removed.  The first failure stops the walk and
 		 * leaves V_pf_altq_running unchanged.
 		 */
 		PF_RULES_WLOCK();
 		TAILQ_FOREACH(ifq, V_pf_altq_ifs_active, entries) {
 			if (ifq->local_flags & PFALTQ_FLAG_IF_REMOVED)
 				continue;
 			error = pf_disable_altq(ifq);
 			if (error != 0)
 				break;
 		}
 		if (error == 0)
 			V_pf_altq_running = 0;
 		PF_RULES_WUNLOCK();
 		DPFPRINTF(PF_DEBUG_MISC, ("altq: stopped\n"));
 		break;
 	}
 
 	case DIOCADDALTQV0:
 	case DIOCADDALTQV1: {
 		/*
 		 * Import one ALTQ definition from the request and append it
 		 * to an inactive list: the queue list when qname is set, the
 		 * interface list otherwise.  The request must carry the
 		 * current inactive-ALTQ ticket, else EBUSY is returned.
 		 *
 		 * altq belongs to this case until it is linked into a list,
 		 * so every early exit has to free it.
 		 */
 		struct pfioc_altq_v1	*pa = (struct pfioc_altq_v1 *)addr;
 		struct pf_altq		*altq, *a;
 		struct ifnet		*ifp;
 
 		altq = malloc(sizeof(*altq), M_PFALTQ, M_WAITOK | M_ZERO);
 		error = pf_import_kaltq(pa, altq, IOCPARM_LEN(cmd));
 		if (error) {
 			/* Not linked anywhere yet: release it or it leaks. */
 			free(altq, M_PFALTQ);
 			break;
 		}
 		altq->local_flags = 0;
 
 		PF_RULES_WLOCK();
 		if (pa->ticket != V_ticket_altqs_inactive) {
 			PF_RULES_WUNLOCK();
 			free(altq, M_PFALTQ);
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * if this is for a queue, find the discipline and
 		 * copy the necessary fields
 		 */
 		if (altq->qname[0] != 0) {
 			if ((altq->qid = pf_qname2qid(altq->qname)) == 0) {
 				PF_RULES_WUNLOCK();
 				error = EBUSY;
 				free(altq, M_PFALTQ);
 				break;
 			}
 			altq->altq_disc = NULL;
 			/* Borrow the discipline of the same-named interface. */
 			TAILQ_FOREACH(a, V_pf_altq_ifs_inactive, entries) {
 				if (strncmp(a->ifname, altq->ifname,
 				    IFNAMSIZ) == 0) {
 					altq->altq_disc = a->altq_disc;
 					break;
 				}
 			}
 		}
 
 		/*
 		 * An interface that cannot be found is only flagged as
 		 * removed; altq_add() is called for interfaces that exist.
 		 */
 		if ((ifp = ifunit(altq->ifname)) == NULL)
 			altq->local_flags |= PFALTQ_FLAG_IF_REMOVED;
 		else
 			error = altq_add(ifp, altq);
 
 		if (error) {
 			PF_RULES_WUNLOCK();
 			free(altq, M_PFALTQ);
 			break;
 		}
 
 		if (altq->qname[0] != 0)
 			TAILQ_INSERT_TAIL(V_pf_altqs_inactive, altq, entries);
 		else
 			TAILQ_INSERT_TAIL(V_pf_altq_ifs_inactive, altq, entries);
 		/* version error check done on import above */
 		pf_export_kaltq(altq, pa, IOCPARM_LEN(cmd));
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	case DIOCGETALTQSV0:
 	case DIOCGETALTQSV1: {
 		/*
 		 * Report how many entries sit on the two active ALTQ lists
 		 * (interfaces plus queues) together with the active ticket.
 		 */
 		struct pfioc_altq_v1	*req = (struct pfioc_altq_v1 *)addr;
 		struct pf_altq		*a;
 		u_int32_t		 total = 0;
 
 		PF_RULES_RLOCK();
 		TAILQ_FOREACH(a, V_pf_altq_ifs_active, entries)
 			total++;
 		TAILQ_FOREACH(a, V_pf_altqs_active, entries)
 			total++;
 		req->nr = total;
 		req->ticket = V_ticket_altqs_active;
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCGETALTQV0:
 	case DIOCGETALTQV1: {
 		/*
 		 * Export the nr'th active ALTQ entry.  A stale ticket and an
 		 * index with no entry both yield EBUSY.
 		 */
 		struct pfioc_altq_v1	*req = (struct pfioc_altq_v1 *)addr;
 		struct pf_altq		*found;
 
 		PF_RULES_RLOCK();
 		if (req->ticket != V_ticket_altqs_active ||
 		    (found = pf_altq_get_nth_active(req->nr)) == NULL)
 			error = EBUSY;
 		else
 			pf_export_kaltq(found, req, IOCPARM_LEN(cmd));
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCCHANGEALTQV0:
 	case DIOCCHANGEALTQV1:
 		/*
 		 * CHANGEALTQ is not supported yet: both command versions
 		 * fail unconditionally with ENODEV.
 		 */
 		error = ENODEV;
 		break;
 
 	case DIOCGETQSTATSV0:
 	case DIOCGETQSTATSV1: {
 		/*
 		 * Fetch statistics for the nr'th active ALTQ entry into the
 		 * caller's buffer via altq_getqstats().
 		 *
 		 * Errors: EBUSY for a stale ticket or an index with no
 		 * entry, ENXIO when the entry is flagged as removed,
 		 * otherwise whatever altq_getqstats() returns.
 		 */
 		struct pfioc_qstats_v1	*pq = (struct pfioc_qstats_v1 *)addr;
 		struct pf_altq		*altq;
 		int			 nbytes;
 		u_int32_t		 version;
 
 		PF_RULES_RLOCK();
 		if (pq->ticket != V_ticket_altqs_active) {
 			PF_RULES_RUNLOCK();
 			error = EBUSY;
 			break;
 		}
 		nbytes = pq->nbytes;
 		altq = pf_altq_get_nth_active(pq->nr);
 		if (altq == NULL) {
 			PF_RULES_RUNLOCK();
 			error = EBUSY;
 			break;
 		}
 
 		if ((altq->local_flags & PFALTQ_FLAG_IF_REMOVED) != 0) {
 			PF_RULES_RUNLOCK();
 			error = ENXIO;
 			break;
 		}
 		/*
 		 * NOTE(review): the rules lock is dropped here, yet altq is
 		 * still dereferenced below -- confirm what keeps the entry
 		 * alive across altq_getqstats().
 		 */
 		PF_RULES_RUNLOCK();
 		if (cmd == DIOCGETQSTATSV0)
 			version = 0;  /* DIOCGETQSTATSV0 means stats struct v0 */
 		else
 			version = pq->version;
 		error = altq_getqstats(altq, pq->buf, &nbytes, version);
 		if (error == 0) {
 			/* Only report scheduler and size on success. */
 			pq->scheduler = altq->scheduler;
 			pq->nbytes = nbytes;
 		}
 		break;
 	}
 #endif /* ALTQ */
 
 	case DIOCBEGINADDRS: {
 		/*
 		 * Empty the pool-address staging buffer and issue a fresh
 		 * ticket for it.
 		 */
 		struct pfioc_pooladdr	*req = (struct pfioc_pooladdr *)addr;
 
 		PF_RULES_WLOCK();
 		pf_empty_kpool(&V_pf_pabuf);
 		V_ticket_pabuf++;
 		req->ticket = V_ticket_pabuf;
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	case DIOCADDADDR: {
 		/*
 		 * Append one pool address to the staging buffer V_pf_pabuf.
 		 *
 		 * Errors: EAFNOSUPPORT for an address family compiled out,
 		 * EINVAL for an unsupported address type or a non-NULL dyn
 		 * pointer in the request, EBUSY when the ticket does not
 		 * match V_ticket_pabuf, otherwise the error from the
 		 * conversion or from pfi_dynaddr_setup().
 		 *
 		 * pa belongs to this case until it is linked into
 		 * V_pf_pabuf, so every early exit after the malloc() has to
 		 * free it.
 		 */
 		struct pfioc_pooladdr	*pp = (struct pfioc_pooladdr *)addr;
 		struct pf_kpooladdr	*pa;
 		struct pfi_kkif		*kif = NULL;
 
 #ifndef INET
 		if (pp->af == AF_INET) {
 			error = EAFNOSUPPORT;
 			break;
 		}
 #endif /* INET */
 #ifndef INET6
 		if (pp->af == AF_INET6) {
 			error = EAFNOSUPPORT;
 			break;
 		}
 #endif /* INET6 */
 		if (pp->addr.addr.type != PF_ADDR_ADDRMASK &&
 		    pp->addr.addr.type != PF_ADDR_DYNIFTL &&
 		    pp->addr.addr.type != PF_ADDR_TABLE) {
 			error = EINVAL;
 			break;
 		}
 		if (pp->addr.addr.p.dyn != NULL) {
 			error = EINVAL;
 			break;
 		}
 		pa = malloc(sizeof(*pa), M_PFRULE, M_WAITOK);
 		error = pf_pooladdr_to_kpooladdr(&pp->addr, pa);
 		if (error != 0) {
 			/* Not linked anywhere yet: release it or it leaks. */
 			free(pa, M_PFRULE);
 			break;
 		}
 		/* Allocate the kif before taking the lock (M_WAITOK). */
 		if (pa->ifname[0])
 			kif = pf_kkif_create(M_WAITOK);
 		PF_RULES_WLOCK();
 		if (pp->ticket != V_ticket_pabuf) {
 			PF_RULES_WUNLOCK();
 			if (pa->ifname[0])
 				pf_kkif_free(kif);
 			free(pa, M_PFRULE);
 			error = EBUSY;
 			break;
 		}
 		if (pa->ifname[0]) {
 			/* The preallocated kif is handed to the attach call. */
 			pa->kif = pfi_kkif_attach(kif, pa->ifname);
 			kif = NULL;
 			pfi_kkif_ref(pa->kif);
 		} else
 			pa->kif = NULL;
 		if (pa->addr.type == PF_ADDR_DYNIFTL && ((error =
 		    pfi_dynaddr_setup(&pa->addr, pp->af)) != 0)) {
 			if (pa->ifname[0])
 				pfi_kkif_unref(pa->kif);
 			PF_RULES_WUNLOCK();
 			free(pa, M_PFRULE);
 			break;
 		}
 		TAILQ_INSERT_TAIL(&V_pf_pabuf, pa, entries);
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	case DIOCGETADDRS: {
 		/*
 		 * Count the addresses in the pool selected by anchor, ticket,
 		 * rule action and rule number; EBUSY if no pool is found.
 		 */
 		struct pfioc_pooladdr	*req = (struct pfioc_pooladdr *)addr;
 		struct pf_kpool		*pool;
 		struct pf_kpooladdr	*entry;
 
 		/* Force termination of the caller-supplied anchor path. */
 		req->anchor[sizeof(req->anchor) - 1] = 0;
 		req->nr = 0;
 
 		PF_RULES_RLOCK();
 		pool = pf_get_kpool(req->anchor, req->ticket, req->r_action,
 		    req->r_num, 0, 1, 0);
 		if (pool != NULL) {
 			TAILQ_FOREACH(entry, &pool->list, entries)
 				req->nr++;
 		} else {
 			error = EBUSY;
 		}
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCGETADDR: {
 		/*
 		 * Export the nr'th address of the selected pool.  A missing
 		 * pool and an index past the end of the list both yield
 		 * EBUSY.
 		 */
 		struct pfioc_pooladdr	*req = (struct pfioc_pooladdr *)addr;
 		struct pf_kpool		*pool;
 		struct pf_kpooladdr	*entry = NULL;
 		u_int32_t		 idx;
 
 		/* Force termination of the caller-supplied anchor path. */
 		req->anchor[sizeof(req->anchor) - 1] = 0;
 
 		PF_RULES_RLOCK();
 		pool = pf_get_kpool(req->anchor, req->ticket, req->r_action,
 		    req->r_num, 0, 1, 1);
 		if (pool != NULL) {
 			/* Step forward until index nr or the list end. */
 			entry = TAILQ_FIRST(&pool->list);
 			for (idx = 0; entry != NULL && idx < req->nr; idx++)
 				entry = TAILQ_NEXT(entry, entries);
 		}
 		if (entry != NULL) {
 			pf_kpooladdr_to_pooladdr(entry, &req->addr);
 			pf_addr_copyout(&req->addr.addr);
 		} else {
 			error = EBUSY;
 		}
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCCHANGEADDR: {
 		/*
 		 * Insert a new address into, or remove an existing address
 		 * from, the pool selected by anchor/ticket/r_action/r_num.
 		 * pca->action picks the operation and, for inserts, the
 		 * position relative to entry pca->nr.
 		 *
 		 * Errors: EINVAL for a bad action, address type, non-NULL
 		 * dyn pointer or out-of-range nr; EAFNOSUPPORT for an address
 		 * family compiled out; EBUSY when the ruleset or pool cannot
 		 * be found; ENOMEM when the table cannot be attached.
 		 *
 		 * Failures after the lock is taken leave through the
 		 * DIOCCHANGEADDR_error label, which releases newpa, the lock
 		 * and any kif that was preallocated but never attached.
 		 */
 		struct pfioc_pooladdr	*pca = (struct pfioc_pooladdr *)addr;
 		struct pf_kpool		*pool;
 		struct pf_kpooladdr	*oldpa = NULL, *newpa = NULL;
 		struct pf_kruleset	*ruleset;
 		struct pfi_kkif		*kif = NULL;
 
 		pca->anchor[sizeof(pca->anchor) - 1] = 0;
 
 		if (pca->action < PF_CHANGE_ADD_HEAD ||
 		    pca->action > PF_CHANGE_REMOVE) {
 			error = EINVAL;
 			break;
 		}
 		if (pca->addr.addr.type != PF_ADDR_ADDRMASK &&
 		    pca->addr.addr.type != PF_ADDR_DYNIFTL &&
 		    pca->addr.addr.type != PF_ADDR_TABLE) {
 			error = EINVAL;
 			break;
 		}
 		if (pca->addr.addr.p.dyn != NULL) {
 			error = EINVAL;
 			break;
 		}
 
 		if (pca->action != PF_CHANGE_REMOVE) {
 #ifndef INET
 			if (pca->af == AF_INET) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 #endif /* INET */
 #ifndef INET6
 			if (pca->af == AF_INET6) {
 				error = EAFNOSUPPORT;
 				break;
 			}
 #endif /* INET6 */
 			/* Sleeping allocations happen before the lock. */
 			newpa = malloc(sizeof(*newpa), M_PFRULE, M_WAITOK);
 			bcopy(&pca->addr, newpa, sizeof(struct pf_pooladdr));
 			if (newpa->ifname[0])
 				kif = pf_kkif_create(M_WAITOK);
 			newpa->kif = NULL;
 		}
 #define	ERROUT(x)	ERROUT_IOCTL(DIOCCHANGEADDR_error, x)
 		PF_RULES_WLOCK();
 		ruleset = pf_find_kruleset(pca->anchor);
 		if (ruleset == NULL)
 			ERROUT(EBUSY);
 
 		pool = pf_get_kpool(pca->anchor, pca->ticket, pca->r_action,
 		    pca->r_num, pca->r_last, 1, 1);
 		if (pool == NULL)
 			ERROUT(EBUSY);
 
 		if (pca->action != PF_CHANGE_REMOVE) {
 			if (newpa->ifname[0]) {
 				newpa->kif = pfi_kkif_attach(kif, newpa->ifname);
 				pfi_kkif_ref(newpa->kif);
 				kif = NULL;
 			}
 
 			switch (newpa->addr.type) {
 			case PF_ADDR_DYNIFTL:
 				error = pfi_dynaddr_setup(&newpa->addr,
 				    pca->af);
 				break;
 			case PF_ADDR_TABLE:
 				newpa->addr.p.tbl = pfr_attach_table(ruleset,
 				    newpa->addr.v.tblname);
 				if (newpa->addr.p.tbl == NULL)
 					error = ENOMEM;
 				break;
 			}
 			if (error)
 				goto DIOCCHANGEADDR_error;
 		}
 
 		/* Locate the reference entry for the requested action. */
 		switch (pca->action) {
 		case PF_CHANGE_ADD_HEAD:
 			oldpa = TAILQ_FIRST(&pool->list);
 			break;
 		case PF_CHANGE_ADD_TAIL:
 			oldpa = TAILQ_LAST(&pool->list, pf_kpalist);
 			break;
 		default:
 			oldpa = TAILQ_FIRST(&pool->list);
 			for (int i = 0; oldpa && i < pca->nr; i++)
 				oldpa = TAILQ_NEXT(oldpa, entries);
 
 			/*
 			 * NOTE(review): newpa may already hold a table or
 			 * dynaddr reference here; the error label frees it
 			 * without detaching -- confirm whether that leaks.
 			 */
 			if (oldpa == NULL)
 				ERROUT(EINVAL);
 		}
 
 		if (pca->action == PF_CHANGE_REMOVE) {
 			TAILQ_REMOVE(&pool->list, oldpa, entries);
 			switch (oldpa->addr.type) {
 			case PF_ADDR_DYNIFTL:
 				pfi_dynaddr_remove(oldpa->addr.p.dyn);
 				break;
 			case PF_ADDR_TABLE:
 				pfr_detach_table(oldpa->addr.p.tbl);
 				break;
 			}
 			if (oldpa->kif)
 				pfi_kkif_unref(oldpa->kif);
 			free(oldpa, M_PFRULE);
 		} else {
 			/* An empty list (oldpa == NULL) just takes the entry. */
 			if (oldpa == NULL)
 				TAILQ_INSERT_TAIL(&pool->list, newpa, entries);
 			else if (pca->action == PF_CHANGE_ADD_HEAD ||
 			    pca->action == PF_CHANGE_ADD_BEFORE)
 				TAILQ_INSERT_BEFORE(oldpa, newpa, entries);
 			else
 				TAILQ_INSERT_AFTER(&pool->list, oldpa,
 				    newpa, entries);
 		}
 
 		pool->cur = TAILQ_FIRST(&pool->list);
 		/*
 		 * Removing the only entry leaves the list empty and
 		 * pool->cur NULL; the counter can only be reseeded from an
 		 * entry that exists.
 		 */
 		if (pool->cur != NULL) {
 			PF_ACPY(&pool->counter, &pool->cur->addr.v.a.addr,
 			    pca->af);
 		}
 		PF_RULES_WUNLOCK();
 		break;
 
 #undef ERROUT
 DIOCCHANGEADDR_error:
 		if (newpa != NULL) {
 			if (newpa->kif)
 				pfi_kkif_unref(newpa->kif);
 			free(newpa, M_PFRULE);
 		}
 		PF_RULES_WUNLOCK();
 		pf_kkif_free(kif);
 		break;
 	}
 
 	case DIOCGETRULESETS: {
 		/*
 		 * Count the anchors directly below the ruleset named by
 		 * pr->path; ENOENT if that ruleset does not exist.
 		 */
 		struct pfioc_ruleset	*pr = (struct pfioc_ruleset *)addr;
 		struct pf_kruleset	*rs;
 		struct pf_kanchor	*child;
 
 		pr->path[sizeof(pr->path) - 1] = 0;
 
 		PF_RULES_RLOCK();
 		rs = pf_find_kruleset(pr->path);
 		if (rs == NULL) {
 			PF_RULES_RUNLOCK();
 			error = ENOENT;
 			break;
 		}
 		pr->nr = 0;
 		if (rs->anchor != NULL) {
 			RB_FOREACH(child, pf_kanchor_node,
 			    &rs->anchor->children)
 				pr->nr++;
 		} else {
 			/* XXX kludge for pf_main_ruleset */
 			RB_FOREACH(child, pf_kanchor_global, &V_pf_anchors) {
 				if (child->parent == NULL)
 					pr->nr++;
 			}
 		}
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCGETRULESET: {
 		/*
 		 * Copy out the name of the nr'th anchor directly below the
 		 * ruleset named by pr->path.  ENOENT if the ruleset does not
 		 * exist, EBUSY if no name was produced.
 		 */
 		struct pfioc_ruleset	*pr = (struct pfioc_ruleset *)addr;
 		struct pf_kruleset	*rs;
 		struct pf_kanchor	*child;
 		u_int32_t		 seen = 0;
 
 		pr->path[sizeof(pr->path) - 1] = 0;
 
 		PF_RULES_RLOCK();
 		rs = pf_find_kruleset(pr->path);
 		if (rs == NULL) {
 			PF_RULES_RUNLOCK();
 			error = ENOENT;
 			break;
 		}
 		pr->name[0] = 0;
 		if (rs->anchor != NULL) {
 			RB_FOREACH(child, pf_kanchor_node,
 			    &rs->anchor->children) {
 				if (seen++ == pr->nr) {
 					strlcpy(pr->name, child->name,
 					    sizeof(pr->name));
 					break;
 				}
 			}
 		} else {
 			/* XXX kludge for pf_main_ruleset */
 			RB_FOREACH(child, pf_kanchor_global, &V_pf_anchors) {
 				/* Only parentless anchors are counted. */
 				if (child->parent != NULL)
 					continue;
 				if (seen++ == pr->nr) {
 					strlcpy(pr->name, child->name,
 					    sizeof(pr->name));
 					break;
 				}
 			}
 		}
 		if (pr->name[0] == 0)
 			error = EBUSY;
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCRCLRTABLES: {
 		/*
 		 * Run pfr_clr_tables() under the rules write lock.  The
 		 * request carries no element array, so a non-zero element
 		 * size is rejected with ENODEV.
 		 */
 		struct pfioc_table *req = (struct pfioc_table *)addr;
 
 		if (req->pfrio_esize == 0) {
 			PF_RULES_WLOCK();
 			error = pfr_clr_tables(&req->pfrio_table,
 			    &req->pfrio_ndel,
 			    req->pfrio_flags | PFR_FLAG_USERIOCTL);
 			PF_RULES_WUNLOCK();
 		} else {
 			error = ENODEV;
 		}
 		break;
 	}
 
 	case DIOCRADDTABLES: {
 		/*
 		 * Copy io->pfrio_size pfr_table records in from
 		 * io->pfrio_buffer and pass them to pfr_add_tables() under
 		 * the rules write lock.
 		 *
 		 * Errors: ENODEV when the element size is not
 		 * sizeof(struct pfr_table), ENOMEM when the count is
 		 * negative, above pf_ioctl_maxcount or would overflow,
 		 * otherwise the copyin() or pfr_add_tables() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_table *pfrts;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
 			break;
 		}
 
 		if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
 			error = ENOMEM;
 			break;
 		}
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
 		/* Allocate and copy in before taking the lock. */
 		pfrts = mallocarray(io->pfrio_size, sizeof(struct pfr_table),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_add_tables(pfrts, io->pfrio_size,
 		    &io->pfrio_nadd, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		free(pfrts, M_TEMP);
 		break;
 	}
 
 	case DIOCRDELTABLES: {
 		/*
 		 * Copy io->pfrio_size pfr_table records in from
 		 * io->pfrio_buffer and pass them to pfr_del_tables() under
 		 * the rules write lock.
 		 *
 		 * Errors: ENODEV when the element size is not
 		 * sizeof(struct pfr_table), ENOMEM when the count is
 		 * negative, above pf_ioctl_maxcount or would overflow,
 		 * otherwise the copyin() or pfr_del_tables() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_table *pfrts;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
 			break;
 		}
 
 		if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
 			error = ENOMEM;
 			break;
 		}
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
 		/* Allocate and copy in before taking the lock. */
 		pfrts = mallocarray(io->pfrio_size, sizeof(struct pfr_table),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_del_tables(pfrts, io->pfrio_size,
 		    &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		free(pfrts, M_TEMP);
 		break;
 	}
 
 	case DIOCRGETTABLES: {
 		/*
 		 * Fill the caller's buffer with pfr_table records obtained
 		 * from pfr_get_tables().  The requested count is first
 		 * clamped to the number reported by pfr_table_count().
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * table count is negative, ENOMEM when the temporary buffer
 		 * cannot be allocated, otherwise the pfr_get_tables() or
 		 * copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_table *pfrts;
 		size_t totlen;
 		int n;
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
 			break;
 		}
 		PF_RULES_RLOCK();
 		n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
 		if (n < 0) {
 			PF_RULES_RUNLOCK();
 			error = EINVAL;
 			break;
 		}
 		io->pfrio_size = min(io->pfrio_size, n);
 
 		/* totlen is fixed here, before pfr_get_tables() runs. */
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
 
 		/*
 		 * M_NOWAIT: presumably because the rules read lock is
 		 * already held at this point -- TODO confirm.
 		 */
 		pfrts = mallocarray(io->pfrio_size, sizeof(struct pfr_table),
 		    M_TEMP, M_NOWAIT | M_ZERO);
 		if (pfrts == NULL) {
 			error = ENOMEM;
 			PF_RULES_RUNLOCK();
 			break;
 		}
 		error = pfr_get_tables(&io->pfrio_table, pfrts,
 		    &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_RUNLOCK();
 		/* Copy out only after the lock has been released. */
 		if (error == 0)
 			error = copyout(pfrts, io->pfrio_buffer, totlen);
 		free(pfrts, M_TEMP);
 		break;
 	}
 
 	case DIOCRGETTSTATS: {
 		/*
 		 * Fill the caller's buffer with pfr_tstats records obtained
 		 * from pfr_get_tstats().  The requested count is first
 		 * clamped to the number reported by pfr_table_count().
 		 *
 		 * The table-stats lock is taken before the rules read lock
 		 * and released after it on every path.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * table count is negative, ENOMEM when the temporary buffer
 		 * cannot be allocated, otherwise the pfr_get_tstats() or
 		 * copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_tstats *pfrtstats;
 		size_t totlen;
 		int n;
 
 		if (io->pfrio_esize != sizeof(struct pfr_tstats)) {
 			error = ENODEV;
 			break;
 		}
 		PF_TABLE_STATS_LOCK();
 		PF_RULES_RLOCK();
 		n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
 		if (n < 0) {
 			PF_RULES_RUNLOCK();
 			PF_TABLE_STATS_UNLOCK();
 			error = EINVAL;
 			break;
 		}
 		io->pfrio_size = min(io->pfrio_size, n);
 
 		/* totlen is fixed here, before pfr_get_tstats() runs. */
 		totlen = io->pfrio_size * sizeof(struct pfr_tstats);
 		pfrtstats = mallocarray(io->pfrio_size,
 		    sizeof(struct pfr_tstats), M_TEMP, M_NOWAIT | M_ZERO);
 		if (pfrtstats == NULL) {
 			error = ENOMEM;
 			PF_RULES_RUNLOCK();
 			PF_TABLE_STATS_UNLOCK();
 			break;
 		}
 		error = pfr_get_tstats(&io->pfrio_table, pfrtstats,
 		    &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_RUNLOCK();
 		PF_TABLE_STATS_UNLOCK();
 		/* Copy out only after both locks have been released. */
 		if (error == 0)
 			error = copyout(pfrtstats, io->pfrio_buffer, totlen);
 		free(pfrtstats, M_TEMP);
 		break;
 	}
 
 	case DIOCRCLRTSTATS: {
 		/*
 		 * Copy io->pfrio_size pfr_table records in from
 		 * io->pfrio_buffer and pass them to pfr_clr_tstats() with
 		 * the table-stats lock and the rules read lock held.
 		 *
 		 * Errors: ENODEV for a wrong element size, otherwise the
 		 * copyin() or pfr_clr_tstats() result.  An out-of-range
 		 * count is not reported as an error (see below).
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_table *pfrts;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
 			break;
 		}
 
 		if (io->pfrio_size < 0 || io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_table))) {
 			/* We used to count tables and use the minimum required
 			 * size, so we didn't fail on overly large requests.
 			 * Keep doing so. */
 			/*
 			 * NOTE(review): this path rewrites pfrio_size and
 			 * leaves without setting error and without
 			 * clearing anything -- confirm that is intended.
 			 */
 			io->pfrio_size = pf_ioctl_maxcount;
 			break;
 		}
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
 		pfrts = mallocarray(io->pfrio_size, sizeof(struct pfr_table),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
 			break;
 		}
 
 		PF_TABLE_STATS_LOCK();
 		PF_RULES_RLOCK();
 		error = pfr_clr_tstats(pfrts, io->pfrio_size,
 		    &io->pfrio_nzero, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_RUNLOCK();
 		PF_TABLE_STATS_UNLOCK();
 		free(pfrts, M_TEMP);
 		break;
 	}
 
 	case DIOCRSETTFLAGS: {
 		/*
 		 * Copy pfr_table records in from io->pfrio_buffer and pass
 		 * them, with the set/clear flag masks, to pfr_set_tflags()
 		 * under the rules write lock.  The requested count is first
 		 * clamped to the number reported by pfr_table_count().
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * table count is negative, otherwise the copyin() or
 		 * pfr_set_tflags() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_table *pfrts;
 		size_t totlen;
 		int n;
 
 		if (io->pfrio_esize != sizeof(struct pfr_table)) {
 			error = ENODEV;
 			break;
 		}
 
 		/* The read lock only covers the count and the clamp. */
 		PF_RULES_RLOCK();
 		n = pfr_table_count(&io->pfrio_table, io->pfrio_flags);
 		if (n < 0) {
 			PF_RULES_RUNLOCK();
 			error = EINVAL;
 			break;
 		}
 
 		io->pfrio_size = min(io->pfrio_size, n);
 		PF_RULES_RUNLOCK();
 
 		totlen = io->pfrio_size * sizeof(struct pfr_table);
 		pfrts = mallocarray(io->pfrio_size, sizeof(struct pfr_table),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfrts, totlen);
 		if (error) {
 			free(pfrts, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_set_tflags(pfrts, io->pfrio_size,
 		    io->pfrio_setflag, io->pfrio_clrflag, &io->pfrio_nchange,
 		    &io->pfrio_ndel, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		free(pfrts, M_TEMP);
 		break;
 	}
 
 	case DIOCRCLRADDRS: {
 		/*
 		 * Run pfr_clr_addrs() on io->pfrio_table under the rules
 		 * write lock.  The request carries no element array, so a
 		 * non-zero element size is rejected with ENODEV.
 		 */
 		struct pfioc_table *req = (struct pfioc_table *)addr;
 
 		if (req->pfrio_esize == 0) {
 			PF_RULES_WLOCK();
 			error = pfr_clr_addrs(&req->pfrio_table,
 			    &req->pfrio_ndel,
 			    req->pfrio_flags | PFR_FLAG_USERIOCTL);
 			PF_RULES_WUNLOCK();
 		} else {
 			error = ENODEV;
 		}
 		break;
 	}
 
 	case DIOCRADDADDRS: {
 		/*
 		 * Copy io->pfrio_size pfr_addr records in from
 		 * io->pfrio_buffer and pass them to pfr_add_addrs() under
 		 * the rules write lock.  With PFR_FLAG_FEEDBACK set the
 		 * array is copied back out on success.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the copyin(), pfr_add_addrs() or
 		 * copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_add_addrs(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_nadd, io->pfrio_flags |
 		    PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		/* Feedback is written back only after the lock is dropped. */
 		if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
 			error = copyout(pfras, io->pfrio_buffer, totlen);
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCRDELADDRS: {
 		/*
 		 * Copy io->pfrio_size pfr_addr records in from
 		 * io->pfrio_buffer and pass them to pfr_del_addrs() under
 		 * the rules write lock.  With PFR_FLAG_FEEDBACK set the
 		 * array is copied back out on success.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the copyin(), pfr_del_addrs() or
 		 * copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_del_addrs(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_ndel, io->pfrio_flags |
 		    PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		/* Feedback is written back only after the lock is dropped. */
 		if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
 			error = copyout(pfras, io->pfrio_buffer, totlen);
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCRSETADDRS: {
 		/*
 		 * Copy pfr_addr records in from io->pfrio_buffer and pass
 		 * them to pfr_set_addrs() under the rules write lock.  The
 		 * buffer is sized for the larger of pfrio_size and
 		 * pfrio_size2.  With PFR_FLAG_FEEDBACK set the array is
 		 * copied back out on success.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when
 		 * either size is negative or the larger one exceeds
 		 * pf_ioctl_maxcount or would overflow, otherwise the
 		 * copyin(), pfr_set_addrs() or copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen, count;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 || io->pfrio_size2 < 0) {
 			error = EINVAL;
 			break;
 		}
 		count = max(io->pfrio_size, io->pfrio_size2);
 		if (count > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(count, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = count * sizeof(struct pfr_addr);
 		pfras = mallocarray(count, sizeof(struct pfr_addr), M_TEMP,
 		    M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_set_addrs(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_size2, &io->pfrio_nadd,
 		    &io->pfrio_ndel, &io->pfrio_nchange, io->pfrio_flags |
 		    PFR_FLAG_USERIOCTL, 0);
 		PF_RULES_WUNLOCK();
 		/* Feedback is written back only after the lock is dropped. */
 		if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
 			error = copyout(pfras, io->pfrio_buffer, totlen);
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCRGETADDRS: {
 		/*
 		 * Fill a zeroed temporary array with pfr_addr records from
 		 * pfr_get_addrs() under the rules read lock, then copy the
 		 * whole array out to io->pfrio_buffer.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the pfr_get_addrs() or copyout()
 		 * result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		/* totlen reflects the size requested by the caller. */
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
 		    M_TEMP, M_WAITOK | M_ZERO);
 		PF_RULES_RLOCK();
 		error = pfr_get_addrs(&io->pfrio_table, pfras,
 		    &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_RUNLOCK();
 		if (error == 0)
 			error = copyout(pfras, io->pfrio_buffer, totlen);
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCRGETASTATS: {
 		/*
 		 * Fill a zeroed temporary array with pfr_astats records from
 		 * pfr_get_astats() under the rules read lock, then copy the
 		 * whole array out to io->pfrio_buffer.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the pfr_get_astats() or copyout()
 		 * result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_astats *pfrastats;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_astats)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_astats))) {
 			error = EINVAL;
 			break;
 		}
 		/* totlen reflects the size requested by the caller. */
 		totlen = io->pfrio_size * sizeof(struct pfr_astats);
 		pfrastats = mallocarray(io->pfrio_size,
 		    sizeof(struct pfr_astats), M_TEMP, M_WAITOK | M_ZERO);
 		PF_RULES_RLOCK();
 		error = pfr_get_astats(&io->pfrio_table, pfrastats,
 		    &io->pfrio_size, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_RUNLOCK();
 		if (error == 0)
 			error = copyout(pfrastats, io->pfrio_buffer, totlen);
 		free(pfrastats, M_TEMP);
 		break;
 	}
 
 	case DIOCRCLRASTATS: {
 		/*
 		 * Copy io->pfrio_size pfr_addr records in from
 		 * io->pfrio_buffer and pass them to pfr_clr_astats() under
 		 * the rules write lock.  With PFR_FLAG_FEEDBACK set the
 		 * array is copied back out on success.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the copyin(), pfr_clr_astats() or
 		 * copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_clr_astats(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_nzero, io->pfrio_flags |
 		    PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		/* Feedback is written back only after the lock is dropped. */
 		if (error == 0 && io->pfrio_flags & PFR_FLAG_FEEDBACK)
 			error = copyout(pfras, io->pfrio_buffer, totlen);
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCRTSTADDRS: {
 		/*
 		 * Copy io->pfrio_size pfr_addr records in from
 		 * io->pfrio_buffer, pass them to pfr_tst_addrs() under the
 		 * rules read lock, and copy the array back out on success
 		 * (unconditionally, no feedback flag is checked here).
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the copyin(), pfr_tst_addrs() or
 		 * copyout() result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
 			break;
 		}
 		PF_RULES_RLOCK();
 		error = pfr_tst_addrs(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_nmatch, io->pfrio_flags |
 		    PFR_FLAG_USERIOCTL);
 		PF_RULES_RUNLOCK();
 		if (error == 0)
 			error = copyout(pfras, io->pfrio_buffer, totlen);
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCRINADEFINE: {
 		/*
 		 * Copy io->pfrio_size pfr_addr records in from
 		 * io->pfrio_buffer and pass them, together with the
 		 * caller's ticket, to pfr_ina_define() under the rules
 		 * write lock.  Nothing is copied back to the buffer.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL when the
 		 * count is negative, above pf_ioctl_maxcount or would
 		 * overflow, otherwise the copyin() or pfr_ina_define()
 		 * result.
 		 */
 		struct pfioc_table *io = (struct pfioc_table *)addr;
 		struct pfr_addr *pfras;
 		size_t totlen;
 
 		if (io->pfrio_esize != sizeof(struct pfr_addr)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->pfrio_size < 0 ||
 		    io->pfrio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfrio_size, sizeof(struct pfr_addr))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = io->pfrio_size * sizeof(struct pfr_addr);
 		pfras = mallocarray(io->pfrio_size, sizeof(struct pfr_addr),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->pfrio_buffer, pfras, totlen);
 		if (error) {
 			free(pfras, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		error = pfr_ina_define(&io->pfrio_table, pfras,
 		    io->pfrio_size, &io->pfrio_nadd, &io->pfrio_naddr,
 		    io->pfrio_ticket, io->pfrio_flags | PFR_FLAG_USERIOCTL);
 		PF_RULES_WUNLOCK();
 		free(pfras, M_TEMP);
 		break;
 	}
 
 	case DIOCOSFPADD: {
 		/* Hand the request to pf_osfp_add() under the write lock. */
 		struct pf_osfp_ioctl *fp_req = (struct pf_osfp_ioctl *)addr;
 
 		PF_RULES_WLOCK();
 		error = pf_osfp_add(fp_req);
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	case DIOCOSFPGET: {
 		/* Hand the request to pf_osfp_get() under the read lock. */
 		struct pf_osfp_ioctl *fp_req = (struct pf_osfp_ioctl *)addr;
 
 		PF_RULES_RLOCK();
 		error = pf_osfp_get(fp_req);
 		PF_RULES_RUNLOCK();
 		break;
 	}
 
 	case DIOCXBEGIN: {
 		/*
 		 * Open a transaction for every element of the caller's
 		 * pfioc_trans_e array: each element's rs_num selects the
 		 * begin routine, which stores a ticket in that element.  On
 		 * success the array, now carrying the tickets, is copied
 		 * back out to io->array.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL for a
 		 * count that is negative, above pf_ioctl_maxcount or would
 		 * overflow (and, with ALTQ, for an ALTQ element naming an
 		 * anchor), otherwise the copyin(), begin-routine or
 		 * copyout() result.
 		 *
 		 * Each failure inside the loop drops the write lock and
 		 * frees ioes itself before jumping to the fail label.
 		 */
 		struct pfioc_trans	*io = (struct pfioc_trans *)addr;
 		struct pfioc_trans_e	*ioes, *ioe;
 		size_t			 totlen;
 		int			 i;
 
 		if (io->esize != sizeof(*ioe)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->size < 0 ||
 		    io->size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = sizeof(struct pfioc_trans_e) * io->size;
 		ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->array, ioes, totlen);
 		if (error) {
 			free(ioes, M_TEMP);
 			break;
 		}
 		/* Ensure there's no more ethernet rules to clean up. */
 		NET_EPOCH_DRAIN_CALLBACKS();
 		PF_RULES_WLOCK();
 		for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
 			/* Force termination of the caller-supplied anchor. */
 			ioe->anchor[sizeof(ioe->anchor) - 1] = '\0';
 			switch (ioe->rs_num) {
 			case PF_RULESET_ETH:
 				if ((error = pf_begin_eth(&ioe->ticket, ioe->anchor))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail;
 				}
 				break;
 #ifdef ALTQ
 			case PF_RULESET_ALTQ:
 				/* ALTQ elements must not name an anchor. */
 				if (ioe->anchor[0]) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EINVAL;
 					goto fail;
 				}
 				if ((error = pf_begin_altq(&ioe->ticket))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail;
 				}
 				break;
 #endif /* ALTQ */
 			case PF_RULESET_TABLE:
 			    {
 				struct pfr_table table;
 
 				/* Only the anchor of the table key is set. */
 				bzero(&table, sizeof(table));
 				strlcpy(table.pfrt_anchor, ioe->anchor,
 				    sizeof(table.pfrt_anchor));
 				if ((error = pfr_ina_begin(&table,
 				    &ioe->ticket, NULL, 0))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail;
 				}
 				break;
 			    }
 			default:
 				if ((error = pf_begin_rules(&ioe->ticket,
 				    ioe->rs_num, ioe->anchor))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail;
 				}
 				break;
 			}
 		}
 		PF_RULES_WUNLOCK();
 		error = copyout(ioes, io->array, totlen);
 		free(ioes, M_TEMP);
 		break;
 	}
 
 	case DIOCXROLLBACK: {
 		/*
 		 * Roll back the transaction named by each element of the
 		 * caller's pfioc_trans_e array: each element's rs_num
 		 * selects the rollback routine, which is given that
 		 * element's ticket.  Nothing is copied back to the caller.
 		 *
 		 * Errors: ENODEV for a wrong element size, EINVAL for a
 		 * count that is negative, above pf_ioctl_maxcount or would
 		 * overflow (and, with ALTQ, for an ALTQ element naming an
 		 * anchor), otherwise the copyin() or rollback-routine
 		 * result.
 		 *
 		 * Each failure inside the loop drops the write lock and
 		 * frees ioes itself before jumping to the fail label.
 		 */
 		struct pfioc_trans	*io = (struct pfioc_trans *)addr;
 		struct pfioc_trans_e	*ioe, *ioes;
 		size_t			 totlen;
 		int			 i;
 
 		if (io->esize != sizeof(*ioe)) {
 			error = ENODEV;
 			break;
 		}
 		if (io->size < 0 ||
 		    io->size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
 			error = EINVAL;
 			break;
 		}
 		totlen = sizeof(struct pfioc_trans_e) * io->size;
 		ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->array, ioes, totlen);
 		if (error) {
 			free(ioes, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
 			/* Force termination of the caller-supplied anchor. */
 			ioe->anchor[sizeof(ioe->anchor) - 1] = '\0';
 			switch (ioe->rs_num) {
 			case PF_RULESET_ETH:
 				if ((error = pf_rollback_eth(ioe->ticket,
 				    ioe->anchor))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 #ifdef ALTQ
 			case PF_RULESET_ALTQ:
 				/* ALTQ elements must not name an anchor. */
 				if (ioe->anchor[0]) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EINVAL;
 					goto fail;
 				}
 				if ((error = pf_rollback_altq(ioe->ticket))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 #endif /* ALTQ */
 			case PF_RULESET_TABLE:
 			    {
 				struct pfr_table table;
 
 				/* Only the anchor of the table key is set. */
 				bzero(&table, sizeof(table));
 				strlcpy(table.pfrt_anchor, ioe->anchor,
 				    sizeof(table.pfrt_anchor));
 				if ((error = pfr_ina_rollback(&table,
 				    ioe->ticket, NULL, 0))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 			    }
 			default:
 				if ((error = pf_rollback_rules(ioe->ticket,
 				    ioe->rs_num, ioe->anchor))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 			}
 		}
 		PF_RULES_WUNLOCK();
 		free(ioes, M_TEMP);
 		break;
 	}
 
 	case DIOCXCOMMIT: {
 		struct pfioc_trans	*io = (struct pfioc_trans *)addr;
 		struct pfioc_trans_e	*ioe, *ioes;
 		struct pf_kruleset	*rs;
 		struct pf_keth_ruleset	*ers;
 		size_t			 totlen;
 		int			 i;
 
 		if (io->esize != sizeof(*ioe)) {
 			error = ENODEV;
 			break;
 		}
 
 		if (io->size < 0 ||
 		    io->size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->size, sizeof(struct pfioc_trans_e))) {
 			error = EINVAL;
 			break;
 		}
 
 		totlen = sizeof(struct pfioc_trans_e) * io->size;
 		ioes = mallocarray(io->size, sizeof(struct pfioc_trans_e),
 		    M_TEMP, M_WAITOK);
 		error = copyin(io->array, ioes, totlen);
 		if (error) {
 			free(ioes, M_TEMP);
 			break;
 		}
 		PF_RULES_WLOCK();
 		/* First makes sure everything will succeed. */
 		for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
 			ioe->anchor[sizeof(ioe->anchor) - 1] = 0;
 			switch (ioe->rs_num) {
 			case PF_RULESET_ETH:
 				ers = pf_find_keth_ruleset(ioe->anchor);
 				if (ers == NULL || ioe->ticket == 0 ||
 				    ioe->ticket != ers->inactive.ticket) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EINVAL;
 					goto fail;
 				}
 				break;
 #ifdef ALTQ
 			case PF_RULESET_ALTQ:
 				if (ioe->anchor[0]) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EINVAL;
 					goto fail;
 				}
 				if (!V_altqs_inactive_open || ioe->ticket !=
 				    V_ticket_altqs_inactive) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EBUSY;
 					goto fail;
 				}
 				break;
 #endif /* ALTQ */
 			case PF_RULESET_TABLE:
 				rs = pf_find_kruleset(ioe->anchor);
 				if (rs == NULL || !rs->topen || ioe->ticket !=
 				    rs->tticket) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EBUSY;
 					goto fail;
 				}
 				break;
 			default:
 				if (ioe->rs_num < 0 || ioe->rs_num >=
 				    PF_RULESET_MAX) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EINVAL;
 					goto fail;
 				}
 				rs = pf_find_kruleset(ioe->anchor);
 				if (rs == NULL ||
 				    !rs->rules[ioe->rs_num].inactive.open ||
 				    rs->rules[ioe->rs_num].inactive.ticket !=
 				    ioe->ticket) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					error = EBUSY;
 					goto fail;
 				}
 				break;
 			}
 		}
 		/* Now do the commit - no errors should happen here. */
 		for (i = 0, ioe = ioes; i < io->size; i++, ioe++) {
 			switch (ioe->rs_num) {
 			case PF_RULESET_ETH:
 				if ((error = pf_commit_eth(ioe->ticket, ioe->anchor))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 #ifdef ALTQ
 			case PF_RULESET_ALTQ:
 				if ((error = pf_commit_altq(ioe->ticket))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 #endif /* ALTQ */
 			case PF_RULESET_TABLE:
 			    {
 				struct pfr_table table;
 
 				bzero(&table, sizeof(table));
 				(void)strlcpy(table.pfrt_anchor, ioe->anchor,
 				    sizeof(table.pfrt_anchor));
 				if ((error = pfr_ina_commit(&table,
 				    ioe->ticket, NULL, NULL, 0))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 			    }
 			default:
 				if ((error = pf_commit_rules(ioe->ticket,
 				    ioe->rs_num, ioe->anchor))) {
 					PF_RULES_WUNLOCK();
 					free(ioes, M_TEMP);
 					goto fail; /* really bad */
 				}
 				break;
 			}
 		}
 		PF_RULES_WUNLOCK();
 
 		/* Only hook into Ethernet traffic if we've got rules for it. */
 		if (! TAILQ_EMPTY(V_pf_keth->active.rules))
 			hook_pf_eth();
 		else
 			dehook_pf_eth();
 
 		free(ioes, M_TEMP);
 		break;
 	}
 
 	case DIOCGETSRCNODES: {
 		struct pfioc_src_nodes	*psn = (struct pfioc_src_nodes *)addr;
 		struct pf_srchash	*sh;
 		struct pf_ksrc_node	*n;
 		struct pf_src_node	*p, *pstore;
 		uint32_t		 i, nr = 0;
 
 		for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
 				i++, sh++) {
 			PF_HASHROW_LOCK(sh);
 			LIST_FOREACH(n, &sh->nodes, entry)
 				nr++;
 			PF_HASHROW_UNLOCK(sh);
 		}
 
 		psn->psn_len = min(psn->psn_len,
 		    sizeof(struct pf_src_node) * nr);
 
 		if (psn->psn_len == 0) {
 			psn->psn_len = sizeof(struct pf_src_node) * nr;
 			break;
 		}
 
 		nr = 0;
 
 		p = pstore = malloc(psn->psn_len, M_TEMP, M_WAITOK | M_ZERO);
 		for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
 		    i++, sh++) {
 		    PF_HASHROW_LOCK(sh);
 		    LIST_FOREACH(n, &sh->nodes, entry) {
 
 			if ((nr + 1) * sizeof(*p) > (unsigned)psn->psn_len)
 				break;
 
 			pf_src_node_copy(n, p);
 
 			p++;
 			nr++;
 		    }
 		    PF_HASHROW_UNLOCK(sh);
 		}
 		error = copyout(pstore, psn->psn_src_nodes,
 		    sizeof(struct pf_src_node) * nr);
 		if (error) {
 			free(pstore, M_TEMP);
 			break;
 		}
 		psn->psn_len = sizeof(struct pf_src_node) * nr;
 		free(pstore, M_TEMP);
 		break;
 	}
 
 	case DIOCCLRSRCNODES: {
 		pf_clear_srcnodes(NULL);
 		pf_purge_expired_src_nodes();
 		break;
 	}
 
 	case DIOCKILLSRCNODES:
 		pf_kill_srcnodes((struct pfioc_src_node_kill *)addr);
 		break;
 
 #ifdef COMPAT_FREEBSD13
 	case DIOCKEEPCOUNTERS_FREEBSD13:
 #endif
 	case DIOCKEEPCOUNTERS:
 		error = pf_keepcounters((struct pfioc_nv *)addr);
 		break;
 
 	case DIOCGETSYNCOOKIES:
 		error = pf_get_syncookies((struct pfioc_nv *)addr);
 		break;
 
 	case DIOCSETSYNCOOKIES:
 		error = pf_set_syncookies((struct pfioc_nv *)addr);
 		break;
 
 	case DIOCSETHOSTID: {
 		u_int32_t	*hostid = (u_int32_t *)addr;
 
 		PF_RULES_WLOCK();
 		if (*hostid == 0)
 			V_pf_status.hostid = arc4random();
 		else
 			V_pf_status.hostid = *hostid;
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	case DIOCOSFPFLUSH:
 		PF_RULES_WLOCK();
 		pf_osfp_flush();
 		PF_RULES_WUNLOCK();
 		break;
 
 	case DIOCIGETIFACES: {
 		struct pfioc_iface *io = (struct pfioc_iface *)addr;
 		struct pfi_kif *ifstore;
 		size_t bufsiz;
 
 		if (io->pfiio_esize != sizeof(struct pfi_kif)) {
 			error = ENODEV;
 			break;
 		}
 
 		if (io->pfiio_size < 0 ||
 		    io->pfiio_size > pf_ioctl_maxcount ||
 		    WOULD_OVERFLOW(io->pfiio_size, sizeof(struct pfi_kif))) {
 			error = EINVAL;
 			break;
 		}
 
 		io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
 
 		bufsiz = io->pfiio_size * sizeof(struct pfi_kif);
 		ifstore = mallocarray(io->pfiio_size, sizeof(struct pfi_kif),
 		    M_TEMP, M_WAITOK | M_ZERO);
 
 		PF_RULES_RLOCK();
 		pfi_get_ifaces(io->pfiio_name, ifstore, &io->pfiio_size);
 		PF_RULES_RUNLOCK();
 		error = copyout(ifstore, io->pfiio_buffer, bufsiz);
 		free(ifstore, M_TEMP);
 		break;
 	}
 
 	case DIOCSETIFFLAG: {
 		struct pfioc_iface *io = (struct pfioc_iface *)addr;
 
 		io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
 
 		PF_RULES_WLOCK();
 		error = pfi_set_flags(io->pfiio_name, io->pfiio_flags);
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	case DIOCCLRIFFLAG: {
 		struct pfioc_iface *io = (struct pfioc_iface *)addr;
 
 		io->pfiio_name[sizeof(io->pfiio_name) - 1] = '\0';
 
 		PF_RULES_WLOCK();
 		error = pfi_clear_flags(io->pfiio_name, io->pfiio_flags);
 		PF_RULES_WUNLOCK();
 		break;
 	}
 
 	default:
 		error = ENODEV;
 		break;
 	}
 fail:
 	if (sx_xlocked(&pf_ioctl_lock))
 		sx_xunlock(&pf_ioctl_lock);
 	CURVNET_RESTORE();
 
 #undef ERROUT_IOCTL
 
 	return (error);
 }
 
 void
 pfsync_state_export(struct pfsync_state *sp, struct pf_kstate *st)
 {
 	bzero(sp, sizeof(struct pfsync_state));
 
 	/* copy from state key */
 	sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0];
 	sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1];
 	sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0];
 	sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1];
 	sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0];
 	sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1];
 	sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0];
 	sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1];
 	sp->proto = st->key[PF_SK_WIRE]->proto;
 	sp->af = st->key[PF_SK_WIRE]->af;
 
 	/* copy from state */
 	strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname));
 	bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
 	sp->creation = htonl(time_uptime - st->creation);
 	sp->expire = pf_state_expires(st);
 	if (sp->expire <= time_uptime)
 		sp->expire = htonl(0);
 	else
 		sp->expire = htonl(sp->expire - time_uptime);
 
 	sp->direction = st->direction;
 	sp->log = st->log;
 	sp->timeout = st->timeout;
 	sp->state_flags = st->state_flags;
 	if (st->src_node)
 		sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
 	if (st->nat_src_node)
 		sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;
 
 	sp->id = st->id;
 	sp->creatorid = st->creatorid;
 	pf_state_peer_hton(&st->src, &sp->src);
 	pf_state_peer_hton(&st->dst, &sp->dst);
 
 	if (st->rule.ptr == NULL)
 		sp->rule = htonl(-1);
 	else
 		sp->rule = htonl(st->rule.ptr->nr);
 	if (st->anchor.ptr == NULL)
 		sp->anchor = htonl(-1);
 	else
 		sp->anchor = htonl(st->anchor.ptr->nr);
 	if (st->nat_rule.ptr == NULL)
 		sp->nat_rule = htonl(-1);
 	else
 		sp->nat_rule = htonl(st->nat_rule.ptr->nr);
 
 	pf_state_counter_hton(st->packets[0], sp->packets[0]);
 	pf_state_counter_hton(st->packets[1], sp->packets[1]);
 	pf_state_counter_hton(st->bytes[0], sp->bytes[0]);
 	pf_state_counter_hton(st->bytes[1], sp->bytes[1]);
 }
 
 void
 pf_state_export(struct pf_state_export *sp, struct pf_kstate *st)
 {
 	bzero(sp, sizeof(*sp));
 
 	sp->version = PF_STATE_VERSION;
 
 	/* copy from state key */
 	sp->key[PF_SK_WIRE].addr[0] = st->key[PF_SK_WIRE]->addr[0];
 	sp->key[PF_SK_WIRE].addr[1] = st->key[PF_SK_WIRE]->addr[1];
 	sp->key[PF_SK_WIRE].port[0] = st->key[PF_SK_WIRE]->port[0];
 	sp->key[PF_SK_WIRE].port[1] = st->key[PF_SK_WIRE]->port[1];
 	sp->key[PF_SK_STACK].addr[0] = st->key[PF_SK_STACK]->addr[0];
 	sp->key[PF_SK_STACK].addr[1] = st->key[PF_SK_STACK]->addr[1];
 	sp->key[PF_SK_STACK].port[0] = st->key[PF_SK_STACK]->port[0];
 	sp->key[PF_SK_STACK].port[1] = st->key[PF_SK_STACK]->port[1];
 	sp->proto = st->key[PF_SK_WIRE]->proto;
 	sp->af = st->key[PF_SK_WIRE]->af;
 
 	/* copy from state */
 	strlcpy(sp->ifname, st->kif->pfik_name, sizeof(sp->ifname));
 	strlcpy(sp->orig_ifname, st->orig_kif->pfik_name,
 	    sizeof(sp->orig_ifname));
 	bcopy(&st->rt_addr, &sp->rt_addr, sizeof(sp->rt_addr));
 	sp->creation = htonl(time_uptime - st->creation);
 	sp->expire = pf_state_expires(st);
 	if (sp->expire <= time_uptime)
 		sp->expire = htonl(0);
 	else
 		sp->expire = htonl(sp->expire - time_uptime);
 
 	sp->direction = st->direction;
 	sp->log = st->log;
 	sp->timeout = st->timeout;
 	sp->state_flags = st->state_flags;
 	if (st->src_node)
 		sp->sync_flags |= PFSYNC_FLAG_SRCNODE;
 	if (st->nat_src_node)
 		sp->sync_flags |= PFSYNC_FLAG_NATSRCNODE;
 
 	sp->id = st->id;
 	sp->creatorid = st->creatorid;
 	pf_state_peer_hton(&st->src, &sp->src);
 	pf_state_peer_hton(&st->dst, &sp->dst);
 
 	if (st->rule.ptr == NULL)
 		sp->rule = htonl(-1);
 	else
 		sp->rule = htonl(st->rule.ptr->nr);
 	if (st->anchor.ptr == NULL)
 		sp->anchor = htonl(-1);
 	else
 		sp->anchor = htonl(st->anchor.ptr->nr);
 	if (st->nat_rule.ptr == NULL)
 		sp->nat_rule = htonl(-1);
 	else
 		sp->nat_rule = htonl(st->nat_rule.ptr->nr);
 
 	sp->packets[0] = st->packets[0];
 	sp->packets[1] = st->packets[1];
 	sp->bytes[0] = st->bytes[0];
 	sp->bytes[1] = st->bytes[1];
 }
 
 static void
 pf_tbladdr_copyout(struct pf_addr_wrap *aw)
 {
 	struct pfr_ktable *kt;
 
 	KASSERT(aw->type == PF_ADDR_TABLE, ("%s: type %u", __func__, aw->type));
 
 	kt = aw->p.tbl;
 	if (!(kt->pfrkt_flags & PFR_TFLAG_ACTIVE) && kt->pfrkt_root != NULL)
 		kt = kt->pfrkt_root;
 	aw->p.tbl = NULL;
 	aw->p.tblcnt = (kt->pfrkt_flags & PFR_TFLAG_ACTIVE) ?
 		kt->pfrkt_cnt : -1;
 }
 
 static int
 pf_add_status_counters(nvlist_t *nvl, const char *name, counter_u64_t *counters,
     size_t number, char **names)
 {
 	nvlist_t        *nvc;
 
 	nvc = nvlist_create(0);
 	if (nvc == NULL)
 		return (ENOMEM);
 
 	for (int i = 0; i < number; i++) {
 		nvlist_append_number_array(nvc, "counters",
 		    counter_u64_fetch(counters[i]));
 		nvlist_append_string_array(nvc, "names",
 		    names[i]);
 		nvlist_append_number_array(nvc, "ids",
 		    i);
 	}
 	nvlist_add_nvlist(nvl, name, nvc);
 	nvlist_destroy(nvc);
 
 	return (0);
 }
 
 static int
 pf_getstatus(struct pfioc_nv *nv)
 {
 	nvlist_t        *nvl = NULL, *nvc = NULL;
 	void            *nvlpacked = NULL;
 	int              error;
 	struct pf_status s;
 	char *pf_reasons[PFRES_MAX+1] = PFRES_NAMES;
 	char *pf_lcounter[KLCNT_MAX+1] = KLCNT_NAMES;
 	char *pf_fcounter[FCNT_MAX+1] = FCNT_NAMES;
 	PF_RULES_RLOCK_TRACKER;
 
 #define ERROUT(x)      ERROUT_FUNCTION(errout, x)
 
 	PF_RULES_RLOCK();
 
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		ERROUT(ENOMEM);
 
 	nvlist_add_bool(nvl, "running", V_pf_status.running);
 	nvlist_add_number(nvl, "since", V_pf_status.since);
 	nvlist_add_number(nvl, "debug", V_pf_status.debug);
 	nvlist_add_number(nvl, "hostid", V_pf_status.hostid);
 	nvlist_add_number(nvl, "states", V_pf_status.states);
 	nvlist_add_number(nvl, "src_nodes", V_pf_status.src_nodes);
 	nvlist_add_bool(nvl, "syncookies_active",
 	    V_pf_status.syncookies_active);
 
 	/* counters */
 	error = pf_add_status_counters(nvl, "counters", V_pf_status.counters,
 	    PFRES_MAX, pf_reasons);
 	if (error != 0)
 		ERROUT(error);
 
 	/* lcounters */
 	error = pf_add_status_counters(nvl, "lcounters", V_pf_status.lcounters,
 	    KLCNT_MAX, pf_lcounter);
 	if (error != 0)
 		ERROUT(error);
 
 	/* fcounters */
 	nvc = nvlist_create(0);
 	if (nvc == NULL)
 		ERROUT(ENOMEM);
 
 	for (int i = 0; i < FCNT_MAX; i++) {
 		nvlist_append_number_array(nvc, "counters",
 		    pf_counter_u64_fetch(&V_pf_status.fcounters[i]));
 		nvlist_append_string_array(nvc, "names",
 		    pf_fcounter[i]);
 		nvlist_append_number_array(nvc, "ids",
 		    i);
 	}
 	nvlist_add_nvlist(nvl, "fcounters", nvc);
 	nvlist_destroy(nvc);
 	nvc = NULL;
 
 	/* scounters */
 	error = pf_add_status_counters(nvl, "scounters", V_pf_status.scounters,
 	    SCNT_MAX, pf_fcounter);
 	if (error != 0)
 		ERROUT(error);
 
 	nvlist_add_string(nvl, "ifname", V_pf_status.ifname);
 	nvlist_add_binary(nvl, "chksum", V_pf_status.pf_chksum,
 	    PF_MD5_DIGEST_LENGTH);
 
 	pfi_update_status(V_pf_status.ifname, &s);
 
 	/* pcounters / bcounters */
 	for (int i = 0; i < 2; i++) {
 		for (int j = 0; j < 2; j++) {
 			for (int k = 0; k < 2; k++) {
 				nvlist_append_number_array(nvl, "pcounters",
 				    s.pcounters[i][j][k]);
 			}
 			nvlist_append_number_array(nvl, "bcounters",
 			    s.bcounters[i][j]);
 		}
 	}
 
 	nvlpacked = nvlist_pack(nvl, &nv->len);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	if (nv->size == 0)
 		ERROUT(0);
 	else if (nv->size < nv->len)
 		ERROUT(ENOSPC);
 
 	PF_RULES_RUNLOCK();
 	error = copyout(nvlpacked, nv->data, nv->len);
 	goto done;
 
 #undef ERROUT
 errout:
 	PF_RULES_RUNLOCK();
 done:
 	free(nvlpacked, M_NVLIST);
 	nvlist_destroy(nvc);
 	nvlist_destroy(nvl);
 
 	return (error);
 }
 
 /*
  * XXX - Check for version mismatch!!!
  */
 static void
 pf_clear_all_states(void)
 {
 	struct pf_kstate	*s;
 	u_int i;
 
 	for (i = 0; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 relock:
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
 			s->timeout = PFTM_PURGE;
 			/* Don't send out individual delete messages. */
 			s->state_flags |= PFSTATE_NOSYNC;
 			pf_unlink_state(s);
 			goto relock;
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 }
 
 static int
 pf_clear_tables(void)
 {
 	struct pfioc_table io;
 	int error;
 
 	bzero(&io, sizeof(io));
 
 	error = pfr_clr_tables(&io.pfrio_table, &io.pfrio_ndel,
 	    io.pfrio_flags);
 
 	return (error);
 }
 
 static void
 pf_clear_srcnodes(struct pf_ksrc_node *n)
 {
 	struct pf_kstate *s;
 	int i;
 
 	for (i = 0; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
 			if (n == NULL || n == s->src_node)
 				s->src_node = NULL;
 			if (n == NULL || n == s->nat_src_node)
 				s->nat_src_node = NULL;
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 
 	if (n == NULL) {
 		struct pf_srchash *sh;
 
 		for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask;
 		    i++, sh++) {
 			PF_HASHROW_LOCK(sh);
 			LIST_FOREACH(n, &sh->nodes, entry) {
 				n->expire = 1;
 				n->states = 0;
 			}
 			PF_HASHROW_UNLOCK(sh);
 		}
 	} else {
 		/* XXX: hash slot should already be locked here. */
 		n->expire = 1;
 		n->states = 0;
 	}
 }
 
 static void
 pf_kill_srcnodes(struct pfioc_src_node_kill *psnk)
 {
 	struct pf_ksrc_node_list	 kill;
 
 	LIST_INIT(&kill);
 	for (int i = 0; i <= pf_srchashmask; i++) {
 		struct pf_srchash *sh = &V_pf_srchash[i];
 		struct pf_ksrc_node *sn, *tmp;
 
 		PF_HASHROW_LOCK(sh);
 		LIST_FOREACH_SAFE(sn, &sh->nodes, entry, tmp)
 			if (PF_MATCHA(psnk->psnk_src.neg,
 			      &psnk->psnk_src.addr.v.a.addr,
 			      &psnk->psnk_src.addr.v.a.mask,
 			      &sn->addr, sn->af) &&
 			    PF_MATCHA(psnk->psnk_dst.neg,
 			      &psnk->psnk_dst.addr.v.a.addr,
 			      &psnk->psnk_dst.addr.v.a.mask,
 			      &sn->raddr, sn->af)) {
 				pf_unlink_src_node(sn);
 				LIST_INSERT_HEAD(&kill, sn, entry);
 				sn->expire = 1;
 			}
 		PF_HASHROW_UNLOCK(sh);
 	}
 
 	for (int i = 0; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 		struct pf_kstate *s;
 
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
 			if (s->src_node && s->src_node->expire == 1)
 				s->src_node = NULL;
 			if (s->nat_src_node && s->nat_src_node->expire == 1)
 				s->nat_src_node = NULL;
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 
 	psnk->psnk_killed = pf_free_src_nodes(&kill);
 }
 
 static int
 pf_keepcounters(struct pfioc_nv *nv)
 {
 	nvlist_t	*nvl = NULL;
 	void		*nvlpacked = NULL;
 	int		 error = 0;
 
 #define	ERROUT(x)	ERROUT_FUNCTION(on_error, x)
 
 	if (nv->len > pf_ioctl_maxcount)
 		ERROUT(ENOMEM);
 
 	nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	error = copyin(nv->data, nvlpacked, nv->len);
 	if (error)
 		ERROUT(error);
 
 	nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 	if (nvl == NULL)
 		ERROUT(EBADMSG);
 
 	if (! nvlist_exists_bool(nvl, "keep_counters"))
 		ERROUT(EBADMSG);
 
 	V_pf_status.keep_counters = nvlist_get_bool(nvl, "keep_counters");
 
 on_error:
 	nvlist_destroy(nvl);
 	free(nvlpacked, M_NVLIST);
 	return (error);
 }
 
 static unsigned int
 pf_clear_states(const struct pf_kstate_kill *kill)
 {
 	struct pf_state_key_cmp	 match_key;
 	struct pf_kstate	*s;
 	struct pfi_kkif	*kif;
 	int		 idx;
 	unsigned int	 killed = 0, dir;
 
 	for (unsigned int i = 0; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 
 relock_DIOCCLRSTATES:
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
 			/* For floating states look at the original kif. */
 			kif = s->kif == V_pfi_all ? s->orig_kif : s->kif;
 
 			if (kill->psk_ifname[0] &&
 			    strcmp(kill->psk_ifname,
 			    kif->pfik_name))
 				continue;
 
 			if (kill->psk_kill_match) {
 				bzero(&match_key, sizeof(match_key));
 
 				if (s->direction == PF_OUT) {
 					dir = PF_IN;
 					idx = PF_SK_STACK;
 				} else {
 					dir = PF_OUT;
 					idx = PF_SK_WIRE;
 				}
 
 				match_key.af = s->key[idx]->af;
 				match_key.proto = s->key[idx]->proto;
 				PF_ACPY(&match_key.addr[0],
 				    &s->key[idx]->addr[1], match_key.af);
 				match_key.port[0] = s->key[idx]->port[1];
 				PF_ACPY(&match_key.addr[1],
 				    &s->key[idx]->addr[0], match_key.af);
 				match_key.port[1] = s->key[idx]->port[0];
 			}
 
 			/*
 			 * Don't send out individual
 			 * delete messages.
 			 */
 			s->state_flags |= PFSTATE_NOSYNC;
 			pf_unlink_state(s);
 			killed++;
 
 			if (kill->psk_kill_match)
 				killed += pf_kill_matching_state(&match_key,
 				    dir);
 
 			goto relock_DIOCCLRSTATES;
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 
 	if (V_pfsync_clear_states_ptr != NULL)
 		V_pfsync_clear_states_ptr(V_pf_status.hostid, kill->psk_ifname);
 
 	return (killed);
 }
 
 static void
 pf_killstates(struct pf_kstate_kill *kill, unsigned int *killed)
 {
 	struct pf_kstate	*s;
 
 	if (kill->psk_pfcmp.id) {
 		if (kill->psk_pfcmp.creatorid == 0)
 			kill->psk_pfcmp.creatorid = V_pf_status.hostid;
 		if ((s = pf_find_state_byid(kill->psk_pfcmp.id,
 		    kill->psk_pfcmp.creatorid))) {
 			pf_unlink_state(s);
 			*killed = 1;
 		}
 		return;
 	}
 
 	for (unsigned int i = 0; i <= pf_hashmask; i++)
 		*killed += pf_killstates_row(kill, &V_pf_idhash[i]);
 
 	return;
 }
 
 static int
 pf_killstates_nv(struct pfioc_nv *nv)
 {
 	struct pf_kstate_kill	 kill;
 	nvlist_t		*nvl = NULL;
 	void			*nvlpacked = NULL;
 	int			 error = 0;
 	unsigned int		 killed = 0;
 
 #define ERROUT(x)	ERROUT_FUNCTION(on_error, x)
 
 	if (nv->len > pf_ioctl_maxcount)
 		ERROUT(ENOMEM);
 
 	nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	error = copyin(nv->data, nvlpacked, nv->len);
 	if (error)
 		ERROUT(error);
 
 	nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 	if (nvl == NULL)
 		ERROUT(EBADMSG);
 
 	error = pf_nvstate_kill_to_kstate_kill(nvl, &kill);
 	if (error)
 		ERROUT(error);
 
 	pf_killstates(&kill, &killed);
 
 	free(nvlpacked, M_NVLIST);
 	nvlpacked = NULL;
 	nvlist_destroy(nvl);
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		ERROUT(ENOMEM);
 
 	nvlist_add_number(nvl, "killed", killed);
 
 	nvlpacked = nvlist_pack(nvl, &nv->len);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	if (nv->size == 0)
 		ERROUT(0);
 	else if (nv->size < nv->len)
 		ERROUT(ENOSPC);
 
 	error = copyout(nvlpacked, nv->data, nv->len);
 
 on_error:
 	nvlist_destroy(nvl);
 	free(nvlpacked, M_NVLIST);
 	return (error);
 }
 
 static int
 pf_clearstates_nv(struct pfioc_nv *nv)
 {
 	struct pf_kstate_kill	 kill;
 	nvlist_t		*nvl = NULL;
 	void			*nvlpacked = NULL;
 	int			 error = 0;
 	unsigned int		 killed;
 
 #define ERROUT(x)	ERROUT_FUNCTION(on_error, x)
 
 	if (nv->len > pf_ioctl_maxcount)
 		ERROUT(ENOMEM);
 
 	nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	error = copyin(nv->data, nvlpacked, nv->len);
 	if (error)
 		ERROUT(error);
 
 	nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 	if (nvl == NULL)
 		ERROUT(EBADMSG);
 
 	error = pf_nvstate_kill_to_kstate_kill(nvl, &kill);
 	if (error)
 		ERROUT(error);
 
 	killed = pf_clear_states(&kill);
 
 	free(nvlpacked, M_NVLIST);
 	nvlpacked = NULL;
 	nvlist_destroy(nvl);
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		ERROUT(ENOMEM);
 
 	nvlist_add_number(nvl, "killed", killed);
 
 	nvlpacked = nvlist_pack(nvl, &nv->len);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	if (nv->size == 0)
 		ERROUT(0);
 	else if (nv->size < nv->len)
 		ERROUT(ENOSPC);
 
 	error = copyout(nvlpacked, nv->data, nv->len);
 
 #undef ERROUT
 on_error:
 	nvlist_destroy(nvl);
 	free(nvlpacked, M_NVLIST);
 	return (error);
 }
 
 static int
 pf_getstate(struct pfioc_nv *nv)
 {
 	nvlist_t		*nvl = NULL, *nvls;
 	void			*nvlpacked = NULL;
 	struct pf_kstate	*s = NULL;
 	int			 error = 0;
 	uint64_t		 id, creatorid;
 
 #define ERROUT(x)	ERROUT_FUNCTION(errout, x)
 
 	if (nv->len > pf_ioctl_maxcount)
 		ERROUT(ENOMEM);
 
 	nvlpacked = malloc(nv->len, M_NVLIST, M_WAITOK);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	error = copyin(nv->data, nvlpacked, nv->len);
 	if (error)
 		ERROUT(error);
 
 	nvl = nvlist_unpack(nvlpacked, nv->len, 0);
 	if (nvl == NULL)
 		ERROUT(EBADMSG);
 
 	PFNV_CHK(pf_nvuint64(nvl, "id", &id));
 	PFNV_CHK(pf_nvuint64(nvl, "creatorid", &creatorid));
 
 	s = pf_find_state_byid(id, creatorid);
 	if (s == NULL)
 		ERROUT(ENOENT);
 
 	free(nvlpacked, M_NVLIST);
 	nvlpacked = NULL;
 	nvlist_destroy(nvl);
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		ERROUT(ENOMEM);
 
 	nvls = pf_state_to_nvstate(s);
 	if (nvls == NULL)
 		ERROUT(ENOMEM);
 
 	nvlist_add_nvlist(nvl, "state", nvls);
 	nvlist_destroy(nvls);
 
 	nvlpacked = nvlist_pack(nvl, &nv->len);
 	if (nvlpacked == NULL)
 		ERROUT(ENOMEM);
 
 	if (nv->size == 0)
 		ERROUT(0);
 	else if (nv->size < nv->len)
 		ERROUT(ENOSPC);
 
 	error = copyout(nvlpacked, nv->data, nv->len);
 
 #undef ERROUT
 errout:
 	if (s != NULL)
 		PF_STATE_UNLOCK(s);
 	free(nvlpacked, M_NVLIST);
 	nvlist_destroy(nvl);
 	return (error);
 }
 
 /*
  * XXX - Check for version mismatch!!!
  */
 
 /*
  * Duplicate pfctl -Fa operation to get rid of as much as we can.
  */
 static int
 shutdown_pf(void)
 {
 	int error = 0;
 	u_int32_t t[5];
 	char nn = '\0';
 
 	do {
 		if ((error = pf_begin_rules(&t[0], PF_RULESET_SCRUB, &nn))
 		    != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: SCRUB\n"));
 			break;
 		}
 		if ((error = pf_begin_rules(&t[1], PF_RULESET_FILTER, &nn))
 		    != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: FILTER\n"));
 			break;		/* XXX: rollback? */
 		}
 		if ((error = pf_begin_rules(&t[2], PF_RULESET_NAT, &nn))
 		    != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: NAT\n"));
 			break;		/* XXX: rollback? */
 		}
 		if ((error = pf_begin_rules(&t[3], PF_RULESET_BINAT, &nn))
 		    != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: BINAT\n"));
 			break;		/* XXX: rollback? */
 		}
 		if ((error = pf_begin_rules(&t[4], PF_RULESET_RDR, &nn))
 		    != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: RDR\n"));
 			break;		/* XXX: rollback? */
 		}
 
 		/* XXX: these should always succeed here */
 		pf_commit_rules(t[0], PF_RULESET_SCRUB, &nn);
 		pf_commit_rules(t[1], PF_RULESET_FILTER, &nn);
 		pf_commit_rules(t[2], PF_RULESET_NAT, &nn);
 		pf_commit_rules(t[3], PF_RULESET_BINAT, &nn);
 		pf_commit_rules(t[4], PF_RULESET_RDR, &nn);
 
 		if ((error = pf_clear_tables()) != 0)
 			break;
 
 		if ((error = pf_begin_eth(&t[0], &nn)) != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: eth\n"));
 			break;
 		}
 		pf_commit_eth(t[0], &nn);
 
 #ifdef ALTQ
 		if ((error = pf_begin_altq(&t[0])) != 0) {
 			DPFPRINTF(PF_DEBUG_MISC, ("shutdown_pf: ALTQ\n"));
 			break;
 		}
 		pf_commit_altq(t[0]);
 #endif
 
 		pf_clear_all_states();
 
 		pf_clear_srcnodes(NULL);
 
 		/* status does not use malloced mem so no need to cleanup */
 		/* fingerprints and interfaces have their own cleanup code */
 	} while(0);
 
 	return (error);
 }
 
 static pfil_return_t
 pf_check_return(int chk, struct mbuf **m)
 {
 
 	switch (chk) {
 	case PF_PASS:
 		if (*m == NULL)
 			return (PFIL_CONSUMED);
 		else
 			return (PFIL_PASS);
 		break;
 	default:
 		if (*m != NULL) {
 			m_freem(*m);
 			*m = NULL;
 		}
 		return (PFIL_DROPPED);
 	}
 }
 
 static pfil_return_t
 pf_eth_check_in(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused, struct inpcb *inp)
 {
 	int chk;
 
 	chk = pf_test_eth(PF_IN, flags, ifp, m, inp);
 
 	return (pf_check_return(chk, m));
 }
 
 static pfil_return_t
 pf_eth_check_out(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused, struct inpcb *inp)
 {
 	int chk;
 
 	chk = pf_test_eth(PF_OUT, flags, ifp, m, inp);
 
 	return (pf_check_return(chk, m));
 }
 
 #ifdef INET
 static pfil_return_t
 pf_check_in(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused, struct inpcb *inp)
 {
 	int chk;
 
 	chk = pf_test(PF_IN, flags, ifp, m, inp);
 
 	return (pf_check_return(chk, m));
 }
 
 static pfil_return_t
 pf_check_out(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused,  struct inpcb *inp)
 {
 	int chk;
 
 	chk = pf_test(PF_OUT, flags, ifp, m, inp);
 
 	return (pf_check_return(chk, m));
 }
 #endif
 
 #ifdef INET6
 static pfil_return_t
 pf_check6_in(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused,  struct inpcb *inp)
 {
 	int chk;
 
 	/*
 	 * In case of loopback traffic IPv6 uses the real interface in
 	 * order to support scoped addresses. In order to support stateful
 	 * filtering we have change this to lo0 as it is the case in IPv4.
 	 */
 	CURVNET_SET(ifp->if_vnet);
 	chk = pf_test6(PF_IN, flags, (*m)->m_flags & M_LOOP ? V_loif : ifp, m, inp);
 	CURVNET_RESTORE();
 
 	return (pf_check_return(chk, m));
 }
 
 static pfil_return_t
 pf_check6_out(struct mbuf **m, struct ifnet *ifp, int flags,
     void *ruleset __unused,  struct inpcb *inp)
 {
 	int chk;
 
 	CURVNET_SET(ifp->if_vnet);
 	chk = pf_test6(PF_OUT, flags, ifp, m, inp);
 	CURVNET_RESTORE();
 
 	return (pf_check_return(chk, m));
 }
 #endif /* INET6 */
 
 VNET_DEFINE_STATIC(pfil_hook_t, pf_eth_in_hook);
 VNET_DEFINE_STATIC(pfil_hook_t, pf_eth_out_hook);
 #define	V_pf_eth_in_hook	VNET(pf_eth_in_hook)
 #define	V_pf_eth_out_hook	VNET(pf_eth_out_hook)
 
 #ifdef INET
 VNET_DEFINE_STATIC(pfil_hook_t, pf_ip4_in_hook);
 VNET_DEFINE_STATIC(pfil_hook_t, pf_ip4_out_hook);
 #define	V_pf_ip4_in_hook	VNET(pf_ip4_in_hook)
 #define	V_pf_ip4_out_hook	VNET(pf_ip4_out_hook)
 #endif
 #ifdef INET6
 VNET_DEFINE_STATIC(pfil_hook_t, pf_ip6_in_hook);
 VNET_DEFINE_STATIC(pfil_hook_t, pf_ip6_out_hook);
 #define	V_pf_ip6_in_hook	VNET(pf_ip6_in_hook)
 #define	V_pf_ip6_out_hook	VNET(pf_ip6_out_hook)
 #endif
 
 static void
 hook_pf_eth(void)
 {
 	struct pfil_hook_args pha;
 	struct pfil_link_args pla;
 	int ret __diagused;
 
 	if (atomic_load_bool(&V_pf_pfil_eth_hooked))
 		return;
 
 	pha.pa_version = PFIL_VERSION;
 	pha.pa_modname = "pf";
 	pha.pa_ruleset = NULL;
 
 	pla.pa_version = PFIL_VERSION;
 
 	pha.pa_type = PFIL_TYPE_ETHERNET;
 	pha.pa_func = pf_eth_check_in;
 	pha.pa_flags = PFIL_IN;
 	pha.pa_rulname = "eth-in";
 	V_pf_eth_in_hook = pfil_add_hook(&pha);
 	pla.pa_flags = PFIL_IN | PFIL_HEADPTR | PFIL_HOOKPTR;
 	pla.pa_head = V_link_pfil_head;
 	pla.pa_hook = V_pf_eth_in_hook;
 	ret = pfil_link(&pla);
 	MPASS(ret == 0);
 	pha.pa_func = pf_eth_check_out;
 	pha.pa_flags = PFIL_OUT;
 	pha.pa_rulname = "eth-out";
 	V_pf_eth_out_hook = pfil_add_hook(&pha);
 	pla.pa_flags = PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR;
 	pla.pa_head = V_link_pfil_head;
 	pla.pa_hook = V_pf_eth_out_hook;
 	ret = pfil_link(&pla);
 	MPASS(ret == 0);
 
 	atomic_store_bool(&V_pf_pfil_eth_hooked, true);
 }
 
 static void
 hook_pf(void)
 {
 	struct pfil_hook_args pha;
 	struct pfil_link_args pla;
 	int ret __diagused;
 
 	if (atomic_load_bool(&V_pf_pfil_hooked))
 		return;
 
 	pha.pa_version = PFIL_VERSION;
 	pha.pa_modname = "pf";
 	pha.pa_ruleset = NULL;
 
 	pla.pa_version = PFIL_VERSION;
 
 #ifdef INET
 	pha.pa_type = PFIL_TYPE_IP4;
 	pha.pa_func = pf_check_in;
 	pha.pa_flags = PFIL_IN;
 	pha.pa_rulname = "default-in";
 	V_pf_ip4_in_hook = pfil_add_hook(&pha);
 	pla.pa_flags = PFIL_IN | PFIL_HEADPTR | PFIL_HOOKPTR;
 	pla.pa_head = V_inet_pfil_head;
 	pla.pa_hook = V_pf_ip4_in_hook;
 	ret = pfil_link(&pla);
 	MPASS(ret == 0);
 	pha.pa_func = pf_check_out;
 	pha.pa_flags = PFIL_OUT;
 	pha.pa_rulname = "default-out";
 	V_pf_ip4_out_hook = pfil_add_hook(&pha);
 	pla.pa_flags = PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR;
 	pla.pa_head = V_inet_pfil_head;
 	pla.pa_hook = V_pf_ip4_out_hook;
 	ret = pfil_link(&pla);
 	MPASS(ret == 0);
 #endif
 #ifdef INET6
 	pha.pa_type = PFIL_TYPE_IP6;
 	pha.pa_func = pf_check6_in;
 	pha.pa_flags = PFIL_IN;
 	pha.pa_rulname = "default-in6";
 	V_pf_ip6_in_hook = pfil_add_hook(&pha);
 	pla.pa_flags = PFIL_IN | PFIL_HEADPTR | PFIL_HOOKPTR;
 	pla.pa_head = V_inet6_pfil_head;
 	pla.pa_hook = V_pf_ip6_in_hook;
 	ret = pfil_link(&pla);
 	MPASS(ret == 0);
 	pha.pa_func = pf_check6_out;
 	pha.pa_rulname = "default-out6";
 	pha.pa_flags = PFIL_OUT;
 	V_pf_ip6_out_hook = pfil_add_hook(&pha);
 	pla.pa_flags = PFIL_OUT | PFIL_HEADPTR | PFIL_HOOKPTR;
 	pla.pa_head = V_inet6_pfil_head;
 	pla.pa_hook = V_pf_ip6_out_hook;
 	ret = pfil_link(&pla);
 	MPASS(ret == 0);
 #endif
 
 	atomic_store_bool(&V_pf_pfil_hooked, true);
 }
 
 static void
 dehook_pf_eth(void)
 {
 
 	if (!atomic_load_bool(&V_pf_pfil_eth_hooked))
 		return;
 
 	pfil_remove_hook(V_pf_eth_in_hook);
 	pfil_remove_hook(V_pf_eth_out_hook);
 
 	atomic_store_bool(&V_pf_pfil_eth_hooked, false);
 }
 
 static void
 dehook_pf(void)
 {
 
 	if (!atomic_load_bool(&V_pf_pfil_hooked))
 		return;
 
 #ifdef INET
 	pfil_remove_hook(V_pf_ip4_in_hook);
 	pfil_remove_hook(V_pf_ip4_out_hook);
 #endif
 #ifdef INET6
 	pfil_remove_hook(V_pf_ip6_in_hook);
 	pfil_remove_hook(V_pf_ip6_out_hook);
 #endif
 
 	atomic_store_bool(&V_pf_pfil_hooked, false);
 }
 
 static void
 pf_load_vnet(void)
 {
 	V_pf_tag_z = uma_zcreate("pf tags", sizeof(struct pf_tagname),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	pf_init_tagset(&V_pf_tags, &pf_rule_tag_hashsize,
 	    PF_RULE_TAG_HASH_SIZE_DEFAULT);
 #ifdef ALTQ
 	pf_init_tagset(&V_pf_qids, &pf_queue_tag_hashsize,
 	    PF_QUEUE_TAG_HASH_SIZE_DEFAULT);
 #endif
 
 	V_pf_keth = &V_pf_main_keth_anchor.ruleset;
 
 	pfattach_vnet();
 	V_pf_vnet_active = 1;
 }
 
 static int
 pf_load(void)
 {
 	int error;
 
 	rm_init_flags(&pf_rules_lock, "pf rulesets", RM_RECURSE);
 	sx_init(&pf_ioctl_lock, "pf ioctl");
 	sx_init(&pf_end_lock, "pf end thread");
 
 	pf_mtag_initialize();
 
 	pf_dev = make_dev(&pf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, PF_NAME);
 	if (pf_dev == NULL)
 		return (ENOMEM);
 
 	pf_end_threads = 0;
 	error = kproc_create(pf_purge_thread, NULL, &pf_purge_proc, 0, 0, "pf purge");
 	if (error != 0)
 		return (error);
 
 	pfi_initialize();
 
 	return (0);
 }
 
 /*
  * Per-vnet teardown.  In order: mark the vnet inactive and pf not running,
  * remove the pfil hooks, shut pf down under the rules write lock, drain
  * net epoch callbacks, remove the software interrupt handler and its
  * interrupt event, run the remaining subsystem cleanup routines, destroy
  * the tag sets and tag zone, and finally release the default rule's and
  * the status counters.
  */
 static void
 pf_unload_vnet(void)
 {
 	/* Only inspected through MPASS() below. */
 	int ret __diagused;
 
 	V_pf_vnet_active = 0;
 	V_pf_status.running = 0;
 	dehook_pf();
 	dehook_pf_eth();
 
 	PF_RULES_WLOCK();
 	pf_syncookies_cleanup();
 	shutdown_pf();
 	PF_RULES_WUNLOCK();
 
 	/* Make sure we've cleaned up ethernet rules before we continue. */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	ret = swi_remove(V_pf_swi_cookie);
 	MPASS(ret == 0);
 	ret = intr_event_destroy(V_pf_swi_ie);
 	MPASS(ret == 0);
 
 	pf_unload_vnet_purge();
 
 	pf_normalize_cleanup();
 	PF_RULES_WLOCK();
 	pfi_cleanup_vnet();
 	PF_RULES_WUNLOCK();
 	pfr_cleanup();
 	pf_osfp_flush();
 	pf_cleanup();
 	/* The mbuf tag cleanup is performed only for the default vnet. */
 	if (IS_DEFAULT_VNET(curvnet))
 		pf_mtag_cleanup();
 
 	pf_cleanup_tagset(&V_pf_tags);
 #ifdef ALTQ
 	pf_cleanup_tagset(&V_pf_qids);
 #endif
 	uma_zdestroy(V_pf_tag_z);
 
 #ifdef PF_WANT_32_TO_64_COUNTER
 	/*
 	 * Unlink the kif marker, the default rule and the rule marker from
 	 * the global lists while holding the rules write lock; the markers
 	 * themselves are freed after the lock is dropped.
 	 */
 	PF_RULES_WLOCK();
 	LIST_REMOVE(V_pf_kifmarker, pfik_allkiflist);
 
 	MPASS(LIST_EMPTY(&V_pf_allkiflist));
 	MPASS(V_pf_allkifcount == 0);
 
 	LIST_REMOVE(&V_pf_default_rule, allrulelist);
 	V_pf_allrulecount--;
 	LIST_REMOVE(V_pf_rulemarker, allrulelist);
 
 	/*
 	 * There are known pf rule leaks when running the test suite.
 	 */
 #ifdef notyet
 	MPASS(LIST_EMPTY(&V_pf_allrulelist));
 	MPASS(V_pf_allrulecount == 0);
 #endif
 
 	PF_RULES_WUNLOCK();
 
 	free(V_pf_kifmarker, PFI_MTYPE);
 	free(V_pf_rulemarker, M_PFRULE);
 #endif
 
 	/* Free counters last as we updated them during shutdown. */
 	pf_counter_u64_deinit(&V_pf_default_rule.evaluations);
 	/* packets[] and bytes[] each hold two counters. */
 	for (int i = 0; i < 2; i++) {
 		pf_counter_u64_deinit(&V_pf_default_rule.packets[i]);
 		pf_counter_u64_deinit(&V_pf_default_rule.bytes[i]);
 	}
 	counter_u64_free(V_pf_default_rule.states_cur);
 	counter_u64_free(V_pf_default_rule.states_tot);
 	counter_u64_free(V_pf_default_rule.src_nodes);
 	uma_zfree_pcpu(pf_timestamp_pcpu_zone, V_pf_default_rule.timestamp);
 
 	/* Release every counter array held in V_pf_status. */
 	for (int i = 0; i < PFRES_MAX; i++)
 		counter_u64_free(V_pf_status.counters[i]);
 	for (int i = 0; i < KLCNT_MAX; i++)
 		counter_u64_free(V_pf_status.lcounters[i]);
 	for (int i = 0; i < FCNT_MAX; i++)
 		pf_counter_u64_deinit(&V_pf_status.fcounters[i]);
 	for (int i = 0; i < SCNT_MAX; i++)
 		counter_u64_free(V_pf_status.scounters[i]);
 }
 
 /*
  * Global (non-vnet) teardown: stop the purge thread, destroy the device
  * node if one exists, call pfi_cleanup() and destroy the module locks.
  */
 static void
 pf_unload(void)
 {
 
 	/*
 	 * Set pf_end_threads to 1 and keep waking the purge thread until
 	 * the value reaches 2 (presumably advanced by the purge thread on
 	 * exit -- confirm).  pf_end_lock is held across the loop except
 	 * while sleeping.
 	 */
 	sx_xlock(&pf_end_lock);
 	for (pf_end_threads = 1; pf_end_threads < 2; ) {
 		wakeup_one(pf_purge_thread);
 		sx_sleep(pf_purge_proc, &pf_end_lock, 0, "pftmo", 0);
 	}
 	sx_xunlock(&pf_end_lock);
 
 	if (pf_dev != NULL)
 		destroy_dev(pf_dev);
 
 	pfi_cleanup();
 
 	/* The locks go last, after everything above has finished. */
 	rm_destroy(&pf_rules_lock);
 	sx_destroy(&pf_ioctl_lock);
 	sx_destroy(&pf_end_lock);
 }
 
 /*
  * VNET_SYSINIT callback; the argument is ignored.  All of the work is
  * done by pf_load_vnet().
  */
 static void
 vnet_pf_init(void *unused __unused)
 {
 
 	pf_load_vnet();
 }
 VNET_SYSINIT(vnet_pf_init, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD,
     vnet_pf_init, NULL);
 
 /*
  * VNET_SYSUNINIT callback; the argument is ignored.  All of the work is
  * done by pf_unload_vnet().
  */
 static void
 vnet_pf_uninit(const void *unused __unused)
 {
 
 	pf_unload_vnet();
 }
 /*
  * pf_unload() is registered as a SYSUNINIT (rather than being called from
  * the module event handler) so that it runs after the vnet_pf_uninit()s.
  */
 SYSUNINIT(pf_unload, SI_SUB_PROTO_FIREWALL, SI_ORDER_SECOND, pf_unload, NULL);
 VNET_SYSUNINIT(vnet_pf_uninit, SI_SUB_PROTO_FIREWALL, SI_ORDER_THIRD,
     vnet_pf_uninit, NULL);
 
 /*
  * Module event handler.
  *
  * MOD_LOAD returns the result of pf_load(); MOD_UNLOAD returns 0 without
  * doing any work here; every other event type yields EINVAL.  The mod and
  * data arguments are not used.
  */
 static int
 pf_modevent(module_t mod, int type, void *data)
 {
 
 	if (type == MOD_LOAD)
 		return (pf_load());
 
 	/*
 	 * Unloading is handled in SYSUNINIT(pf_unload) to ensure it's done
 	 * after the vnet_pf_uninit()s, so there is nothing to do here.
 	 */
 	if (type == MOD_UNLOAD)
 		return (0);
 
 	return (EINVAL);
 }
 
 static moduledata_t pf_mod = {
 	"pf",
 	pf_modevent,
 	0
 };
 
 DECLARE_MODULE(pf, pf_mod, SI_SUB_PROTO_FIREWALL, SI_ORDER_SECOND);
 MODULE_VERSION(pf, PF_MODVER);